From 8e294ba36081470ff35ee3d99efe727886517170 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 13 Aug 2025 08:54:55 -0400
Subject: [PATCH 01/71] First draft of task sorter

---
 CMakeLists.txt            |   1 +
 csrc/graph/task_graph.cpp | 222 ++++++++++++++++++++++++++++++++++++++
 csrc/graph/task_graph.h   | 105 ++++++++++++++++++
 3 files changed, 328 insertions(+)
 create mode 100644 csrc/graph/task_graph.cpp
 create mode 100644 csrc/graph/task_graph.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c5f275d43d1..0f845d3de9b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -229,6 +229,7 @@ list(APPEND NVFUSER_SRCS
   ${NVFUSER_SRCS_DIR}/fusion_guard.cpp
   ${NVFUSER_SRCS_DIR}/fusion_segmenter.cpp
   ${NVFUSER_SRCS_DIR}/global_allocator.cpp
+  ${NVFUSER_SRCS_DIR}/graph/task_graph.cpp
   ${NVFUSER_SRCS_DIR}/grouped_reduction.cpp
   ${NVFUSER_SRCS_DIR}/host_ir/container.cpp
   ${NVFUSER_SRCS_DIR}/host_ir/executor.cpp
diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
new file mode 100644
index 00000000000..efd212679e8
--- /dev/null
+++ b/csrc/graph/task_graph.cpp
@@ -0,0 +1,222 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#include <exceptions.h>
+#include <graph/task_graph.h>
+#include <utils.h>
+
+#include <set>
+
+namespace nvfuser {
+
+void TaskGraph::validateSteps(const std::vector<Step>& steps) const {
+  // Replay `steps` starting from the initial allocation (the Data that has no
+  // definition and so must be preallocated), and check that every Step's
+  // recorded `allocated` and `high_water_mark` match the recomputed values.
+  TaskGraph::Size allocated = getInitialAllocation();
+  TaskGraph::Size high_water_mark = allocated;
+
+  std::vector<TaskId> future_uses = num_uses_;
+  std::vector<DataId> outstanding_dependencies = num_dependencies_;
+
+  // Now we are ready to process steps
+  for (const Step& step : steps) {
+    const Task& task = getTask(step.task);
+
+    // Allocate outputs
+    for (const DataId output_id : task.outputs) {
+      const Data& data = getData(output_id);
+      if (!data.input_alias.has_value()) {
+        // Don't allocate outputs if they are reusing input memory
+        allocated += data.size;
+      }
+    }
+
+    // Add temporary space
+    allocated += task.temp_space;
+
+    // This is the most space we will use, so update high water mark here
+    high_water_mark = std::max(high_water_mark, allocated);
+    NVF_ERROR(step.high_water_mark == high_water_mark);
+
+    // reduce use count for inputs and free them if possible
+    for (const DataId input_id : task.inputs) {
+      if (--future_uses.at((size_t)input_id) == 0) {
+        // There are no more uses for this Data, so free it if we're allowed to
+        const Data& data = getData(input_id);
+        if (data.can_free) {
+          allocated -= data.size;
+        }
+      }
+    }
+
+    // step.allocated indicates how much space is allocated _upon completion_ of this step
+    NVF_ERROR(step.allocated == allocated);
+  }
+}
+
+namespace {
+
+//! [Backtracking algorithm to find optimal topological ordering]
+//!
+//! If validate==true, then we will validate the steps vector after every
+//! backtracking step.
+//!
+//! c.f. https://en.wikipedia.org/wiki/Topological_sorting#Kahn's_algorithm
+class TaskSorter {
+ public:
+  TaskSorter(const TaskGraph& graph, bool validate, int64_t max_iters)
+      : graph_(graph), validate_(validate), max_iters_(max_iters) {
+    sort();
+  }
+
+  const std::vector<TaskGraph::Step>& steps() const {
+    return steps_;
+  }
+
+ private:
+  inline void validate() const {
+    if (validate_) {
+      graph_.validateSteps(steps_);
+    }
+  }
+
+  //! This pushes a step indicating that we should execute the given task next.
+  void advance(TaskGraph::TaskId task_id) {
+    TaskGraph::Size allocated = 0;
+    TaskGraph::Size high_water_mark = 0;
+    if (steps_.empty()) {
+      // (Re-)Initialize allocated and high_water_mark to starting values
+      allocated = graph_.getInitialAllocation();
+      high_water_mark = allocated;
+    } else {
+      allocated = steps_.back().allocated;
+      high_water_mark = steps_.back().high_water_mark;
+    }
+
+    // Compute the new allocated amount and high water mark for this step
+    const TaskGraph::Task& task = graph_.getTask(task_id);
+    
+    for (const TaskGraph::DataId output_id : task.outputs) {
+      const TaskGraph::Data& output = graph_.getData(output_id);
+      // Allocate outputs if not aliased
+      if (!output.input_alias.has_value()) {
+        allocated += output.size;
+      }
+
+      // Update outstanding_dependencies_ and ready_tasks_ for each use
+      for (const TaskGraph::TaskId use_id : output.uses) {
+        if (--outstanding_dependencies_.at((size_t)use_id) == 0) {
+          ready_tasks_.insert(use_id);
+        }
+      }
+    }
+
+    // Add temp space
+    allocated += task.temp_space;
+
+    // Update high water mark
+    high_water_mark = std::max(high_water_mark, allocated);
+
+    // Decrement future_uses_ and deallocate dead inputs
+    for (const TaskGraph::DataId input_id : task.inputs) {
+      const TaskGraph::Data& input = graph_.getData(input_id);
+      if (--future_uses_.at((size_t)input_id) == 0) {
+        if (input.can_free) {
+          allocated -= input.size;
+        }
+      }
+    }
+
+    steps_.emplace_back(task_id, allocated, high_water_mark);
+  }
+
+  //! Backtrack a single step. This returns the TaskId of the step that was
+  //! popped.
+  TaskGraph::TaskId backtrack() {
+    validate();
+    TaskGraph::TaskId last_task_id = steps_.back().task;
+    const TaskGraph::Task& last_task = graph_.getTask(last_task_id);
+    steps_.pop_back();
+
+    ready_tasks_.erase(last_task_id);
+
+    // Update outstanding_dependencies to reflect that the outputs of last_task are no longer available
+    for (const TaskGraph::DataId& output_id: last_task.outputs) {
+      const TaskGraph::Data& output = graph_.getData(output_id);
+      for (const TaskGraph::TaskId use_id : output.uses) {
+        outstanding_dependencies_.at((size_t)use_id)++;
+      }
+    }
+
+    // Update future_uses to reflect that the inputs to last_task will need to compute last_task later
+    for (const TaskGraph::DataId& input_id: last_task.inputs) {
+      future_uses_.at((size_t)input_id)++;
+    }
+
+    return last_task_id;
+  }
+
+  void sort() {
+    // Set up outstanding_dependencies_, future_uses_, and ready_tasks_
+    outstanding_dependencies_.reserve(graph_.numTasks());
+    for (const TaskGraph::TaskId task_id : arange(graph_.numTasks())) {
+      const TaskGraph::Task& task = graph_.getTask(task_id);
+      TaskGraph::DataId inputs_to_compute = 0;
+      for (const TaskGraph::DataId data_id : task.inputs) {
+        const TaskGraph::Data& data = graph_.getData(data_id);
+        if (data.definition.has_value()) {
+          // Skip counting input data since these are available before we start
+          inputs_to_compute++; 
+        }
+      }
+      outstanding_dependencies_.push_back(inputs_to_compute);
+      if (inputs_to_compute == 0) {
+        ready_tasks_.insert(task_id);
+      }
+    }
+
+    future_uses_.reserve(graph_.numData());
+    for (const TaskGraph::DataId data_id : arange(graph_.numData())) {
+      const TaskGraph::Data& data = graph_.getData(data_id);
+      future_uses_.push_back(data.uses.size());
+    }
+
+    for (int64_t _ : arange(max_iters_)) {
+    }
+
+    // Validate final result
+    NVF_ERROR(steps_.size() == graph_.numTasks());
+    validate();
+  }
+
+ private:
+  const TaskGraph& graph_;
+  bool validate_;
+  int64_t max_iters_;
+  std::vector<TaskGraph::Step> steps_;
+
+  //! There is one entry here for each task and indicating how many
+  //! dependencies are currently unmet. When this reaches zero the task becomes ready.
+  std::vector<TaskGraph::DataId> outstanding_dependencies_;
+
+  //! There is one entry here for each Data and indicating how many uses there
+  //! are remaining. When it reaches zero, the Data can be freed if allowed.
+  std::vector<TaskGraph::TaskId> future_uses_;
+
+  //! This holds all candidates for the next step, sorted by ID
+  std::set<TaskGraph::TaskId> ready_tasks_;
+};
+
+} // namespace
+
+std::vector<TaskGraph::Step> TaskGraph::findOptimalOrder() const {
+  TaskSorter sorter(*this, /*validate=*/true);
+  return sorter.steps();
+}
+
+} // namespace nvfuser
diff --git a/csrc/graph/task_graph.h b/csrc/graph/task_graph.h
new file mode 100644
index 00000000000..c987f319702
--- /dev/null
+++ b/csrc/graph/task_graph.h
@@ -0,0 +1,105 @@
+// clang-format off
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025-present NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#pragma once
+
+#include <cstdint>
+#include <optional>
+#include <vector>
+
+namespace nvfuser {
+
+class TaskGraph {
+ public:
+  using TaskId = int16_t;
+  using DataId = int16_t;
+  using Size = int64_t;
+
+  //! A Task consumes some input Data and produces some output Data. To do so, it might use some intermediate space.
+  struct Task {
+    std::vector<DataId> inputs;
+    std::vector<DataId> outputs;
+    //! This amount of temporary space is required only while executing the Task and is immediately freed afterward
+    Size temp_space = 0;
+  };
+
+  struct Data {
+    std::optional<TaskId> definition;
+    std::vector<TaskId> uses;
+    // If set, this means we do not allocate a new output when executing this Data's definition, instead we re-use the space from the specified input. Note that this implies an ordering constraint which we will check, since the definition must be the last use of the aliased input.
+    std::optional<DataId> input_alias;
+    Size size;
+
+    //! This indicates whether we are able to free this data after its last use. For a segmented fusion, unsegmented fusion inputs and outputs cannot be freed (with the exception of an aliased input), while any intermediate tensors should be freed as soon as possible.
+    bool can_free = true;
+  };
+
+  TaskGraph(const std::vector<Task>& tasks, const std::vector<Data>& data) : tasks_(tasks), data_(data) {
+    // Initialize the counts of future uses of data and unmet dependencies of tasks. These are the out-degrees of Data and in-degrees of Tasks, respectively.
+    num_dependencies_.reserve(tasks_.size());
+    for (const Task& task : tasks_) {
+      num_dependencies_.push_back((DataId)task.inputs.size());
+    }
+    num_uses_.reserve(data_.size());
+    for (const Data& data : data_) {
+      num_uses_.push_back((TaskId)data.uses.size());
+      if (!data.definition.has_value()) {
+        initial_allocation_ += (Size)data.size;
+      }
+    }
+  }
+
+  //! This represents the execution of a single Task in a given ordering. It tracks some cumulative state representing the amount of space required up to this point.
+  struct Step {
+    TaskId task;
+
+    //! This is the sum of all Data that is active _after_ execution of this task and after any inputs with no more uses are freed.
+    Size allocated;
+
+    //! This is the maximum active space used until this step is completed.
+    Size high_water_mark;
+  };
+
+  TaskId numTasks() const {
+    return (TaskId)tasks_.size();
+  }
+
+  const Task& getTask(TaskId id) const {
+    return tasks_.at((size_t)id);
+  }
+
+  DataId numData() const {
+    return (DataId)data_.size();
+  }
+
+  const Data& getData(DataId id) const {
+    return data_.at((size_t)id);
+  }
+
+  Size getInitialAllocation() const {
+    return initial_allocation_;
+  }
+
+  //! Given a list of steps, recompute the active space and high water mark. This is useful for validating that our backtracking algorithm does not corrupt this data. Raises an exception if corruption is detected.
+  void validateSteps(const std::vector<Step>& steps) const;
+
+  //! This does an exhaustive search of all possible orderings using a modified Kahn's algorithm to efficiently traverse the set of possible topological orderings.
+  std::vector<Step> findOptimalOrder() const;
+
+ private:
+  std::vector<Task> tasks_;
+  std::vector<Data> data_;
+
+  //! How much data is allocated by data that has no definition, i.e. input data
+  Size initial_allocation_ = 0;
+
+  std::vector<TaskId> num_uses_;
+  std::vector<DataId> num_dependencies_;
+};
+
+
+} // namespace nvfuser

From 7b279c200325b1db395c25ca55464494414ea21c Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 13 Aug 2025 09:09:42 -0400
Subject: [PATCH 02/71] Completed first draft of sort() algorithm

---
 csrc/graph/task_graph.cpp | 66 +++++++++++++++++++++++++++++++++------
 csrc/graph/task_graph.h   | 39 ++++++++++++++++-------
 2 files changed, 85 insertions(+), 20 deletions(-)

diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index efd212679e8..ca630265c5e 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -54,7 +54,8 @@ void TaskGraph::validateSteps(const std::vector<Step>& steps) const {
       }
     }
 
-    // step.allocated indicates how much space is allocated _upon completion_ of this step
+    // step.allocated indicates how much space is allocated _upon completion_ of
+    // this step
     NVF_ERROR(step.allocated == allocated);
   }
 }
@@ -100,7 +101,7 @@ class TaskSorter {
 
     // Compute the new allocated amount and high water mark for this step
     const TaskGraph::Task& task = graph_.getTask(task_id);
-    
+
     for (const TaskGraph::DataId output_id : task.outputs) {
       const TaskGraph::Data& output = graph_.getData(output_id);
       // Allocate outputs if not aliased
@@ -145,16 +146,18 @@ class TaskSorter {
 
     ready_tasks_.erase(last_task_id);
 
-    // Update outstanding_dependencies to reflect that the outputs of last_task are no longer available
-    for (const TaskGraph::DataId& output_id: last_task.outputs) {
+    // Update outstanding_dependencies to reflect that the outputs of last_task
+    // are no longer available
+    for (const TaskGraph::DataId& output_id : last_task.outputs) {
       const TaskGraph::Data& output = graph_.getData(output_id);
       for (const TaskGraph::TaskId use_id : output.uses) {
         outstanding_dependencies_.at((size_t)use_id)++;
       }
     }
 
-    // Update future_uses to reflect that the inputs to last_task will need to compute last_task later
-    for (const TaskGraph::DataId& input_id: last_task.inputs) {
+    // Update future_uses to reflect that the inputs to last_task will need to
+    // compute last_task later
+    for (const TaskGraph::DataId& input_id : last_task.inputs) {
       future_uses_.at((size_t)input_id)++;
     }
 
@@ -171,7 +174,7 @@ class TaskSorter {
         const TaskGraph::Data& data = graph_.getData(data_id);
         if (data.definition.has_value()) {
           // Skip counting input data since these are available before we start
-          inputs_to_compute++; 
+          inputs_to_compute++;
         }
       }
       outstanding_dependencies_.push_back(inputs_to_compute);
@@ -186,9 +189,51 @@ class TaskSorter {
       future_uses_.push_back(data.uses.size());
     }
 
+    // Initialize best_usage
+    TaskGraph::Size best_usage = std::numeric_limits<TaskGraph::Size>::max();
+    std::vector<TaskGraph::Step> best_steps;
+
+    // This is the main optimization loop
+    TaskGraph::TaskId backtracked_task_id = -1;
     for (int64_t _ : arange(max_iters_)) {
+      NVF_ERROR(
+          !ready_tasks_.empty() || steps_.size() == (size_t)graph_.numTasks(),
+          "Ran out of ready tasks before completing ordering");
+
+      TaskGraph::TaskId next_task_id = -1;
+      for (const TaskGraph::TaskId ready_id : ready_tasks_) {
+        if (ready_id > backtracked_task_id) {
+          next_task_id = ready_id;
+          break;
+        }
+      }
+
+      if (next_task_id == -1) {
+        // There are no ready tasks with ID above the backtracked_task_id. This
+        // means it is time to backtrack
+        backtracked_task_id = backtrack();
+        continue;
+      }
+
+      advance(next_task_id);
+
+      // If our high water mark is above best_usage, terminate early and
+      // backtrack
+      if (steps_.back().high_water_mark > best_usage) {
+        backtracked_task_id = backtrack();
+        continue;
+      }
+
+      // Usage is at or below best_usage: record a completed ordering as best
+      if (steps_.size() == (size_t)graph_.numTasks()) {
+        best_usage = steps_.back().high_water_mark;
+        best_steps = steps_;
+      }
     }
 
+    // Record our best found steps
+    steps_ = best_steps;
+
     // Validate final result
     NVF_ERROR(steps_.size() == graph_.numTasks());
     validate();
@@ -201,7 +246,8 @@ class TaskSorter {
   std::vector<TaskGraph::Step> steps_;
 
   //! There is one entry here for each task and indicating how many
-  //! dependencies are currently unmet. When this reaches zero the task becomes ready.
+  //! dependencies are currently unmet. When this reaches zero the task becomes
+  //! ready.
   std::vector<TaskGraph::DataId> outstanding_dependencies_;
 
   //! There is one entry here for each Data and indicating how many uses there
@@ -215,7 +261,9 @@ class TaskSorter {
 } // namespace
 
 std::vector<TaskGraph::Step> TaskGraph::findOptimalOrder() const {
-  TaskSorter sorter(*this, /*validate=*/true);
+  // TODO: Find a reasonable default number of iterations. Note that one
+  // iteration equals one task, not one ordering
+  TaskSorter sorter(*this, /*validate=*/true, /*max_iters=*/2000);
   return sorter.steps();
 }
 
diff --git a/csrc/graph/task_graph.h b/csrc/graph/task_graph.h
index c987f319702..d959d1cd018 100644
--- a/csrc/graph/task_graph.h
+++ b/csrc/graph/task_graph.h
@@ -19,27 +19,38 @@ class TaskGraph {
   using DataId = int16_t;
   using Size = int64_t;
 
-  //! A Task consumes some input Data and produces some output Data. To do so, it might use some intermediate space.
+  //! A Task consumes some input Data and produces some output Data. To do so,
+  //! it might use some intermediate space.
   struct Task {
     std::vector<DataId> inputs;
     std::vector<DataId> outputs;
-    //! This amount of temporary space is required only while executing the Task and is immediately freed afterward
+    //! This amount of temporary space is required only while executing the Task
+    //! and is immediately freed afterward
     Size temp_space = 0;
   };
 
   struct Data {
     std::optional<TaskId> definition;
     std::vector<TaskId> uses;
-    // If set, this means we do not allocate a new output when executing this Data's definition, instead we re-use the space from the specified input. Note that this implies an ordering constraint which we will check, since the definition must be the last use of the aliased input.
+    // If set, this means we do not allocate a new output when executing this
+    // Data's definition, instead we re-use the space from the specified input.
+    // Note that this implies an ordering constraint which we will check, since
+    // the definition must be the last use of the aliased input.
     std::optional<DataId> input_alias;
     Size size;
 
-    //! This indicates whether we are able to free this data after its last use. For a segmented fusion, unsegmented fusion inputs and outputs cannot be freed (with the exception of an aliased input), while any intermediate tensors should be freed as soon as possible.
+    //! This indicates whether we are able to free this data after its last use.
+    //! For a segmented fusion, unsegmented fusion inputs and outputs cannot be
+    //! freed (with the exception of an aliased input), while any intermediate
+    //! tensors should be freed as soon as possible.
     bool can_free = true;
   };
 
-  TaskGraph(const std::vector<Task>& tasks, const std::vector<Data>& data) : tasks_(tasks), data_(data) {
-    // Initialize the counts of future uses of data and unmet dependencies of tasks. These are the out-degrees of Data and in-degrees of Tasks, respectively.
+  TaskGraph(const std::vector<Task>& tasks, const std::vector<Data>& data)
+      : tasks_(tasks), data_(data) {
+    // Initialize the counts of future uses of data and unmet dependencies of
+    // tasks. These are the out-degrees of Data and in-degrees of Tasks,
+    // respectively.
     num_dependencies_.reserve(tasks_.size());
     for (const Task& task : tasks_) {
       num_dependencies_.push_back((DataId)task.inputs.size());
@@ -53,11 +64,14 @@ class TaskGraph {
     }
   }
 
-  //! This represents the execution of a single Task in a given ordering. It tracks some cumulative state representing the amount of space required up to this point.
+  //! This represents the execution of a single Task in a given ordering. It
+  //! tracks some cumulative state representing the amount of space required up
+  //! to this point.
   struct Step {
     TaskId task;
 
-    //! This is the sum of all Data that is active _after_ execution of this task and after any inputs with no more uses are freed.
+    //! This is the sum of all Data that is active _after_ execution of this
+    //! task and after any inputs with no more uses are freed.
     Size allocated;
 
     //! This is the maximum active space used until this step is completed.
@@ -84,10 +98,14 @@ class TaskGraph {
     return initial_allocation_;
   }
 
-  //! Given a list of steps, recompute the active space and high water mark. This is useful for validating that our backtracking algorithm does not corrupt this data. Raises an exception if corruption is detected.
+  //! Given a list of steps, recompute the active space and high water mark.
+  //! This is useful for validating that our backtracking algorithm does not
+  //! corrupt this data. Raises an exception if corruption is detected.
   void validateSteps(const std::vector<Step>& steps) const;
 
-  //! This does an exhaustive search of all possible orderings using a modified Kahn's algorithm to efficiently traverse the set of possible topological orderings.
+  //! This does an exhaustive search of all possible orderings using a modified
+  //! Kahn's algorithm to efficiently traverse the set of possible topological
+  //! orderings.
   std::vector<Step> findOptimalOrder() const;
 
  private:
@@ -101,5 +119,4 @@ class TaskGraph {
   std::vector<DataId> num_dependencies_;
 };
 
-
 } // namespace nvfuser

From 6012846aab8028dc8bcebf8d0431ed3c5b74772f Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 13 Aug 2025 11:26:48 -0400
Subject: [PATCH 03/71] Start building into FusionSegmenter

---
 csrc/fusion_segmenter.cpp | 113 ++++++++++++++++++++++++++++++++++++++
 csrc/graph/task_graph.cpp |  18 +++---
 csrc/graph/task_graph.h   |  13 ++++-
 3 files changed, 136 insertions(+), 8 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index bdfe2a9da8c..97ea4e22eca 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -6,13 +6,16 @@
  */
 // clang-format on
 #include <algorithm>
+#include <limits>
 #include <sstream>
 
 #include <debug.h>
 #include <device_lower/utils.h>
 #include <disjoint_set.h>
+#include <exceptions.h>
 #include <fusion.h>
 #include <fusion_segmenter.h>
+#include <graph/task_graph.h>
 #include <instrumentation.h>
 #include <ir/all_nodes.h>
 #include <ir/cloner.h>
@@ -1982,6 +1985,116 @@ bool SegmentCandidateFinder::hasSegmentHints(Fusion* fusion) {
 }
 
 namespace {
+
+std::vector<SegmentedGroup*> optimalTopoSort(
+    const std::vector<SegmentedGroup*>& groups) {
+  NVF_ERROR(
+      groups.size() <= std::numeric_limits<TaskGraph::TaskId>::max(),
+      "There are too many tasks to represent with TaskGraph::TaskId");
+
+  std::vector<TaskGraph::Data> all_data;
+  std::unordered_map<TensorView*, TaskGraph::DataId> tv2dataid;
+
+  const auto maybe_register_tv = [&](TensorView* tv) -> TaskGraph::DataId {
+    auto it = tv2dataid.find(tv);
+    if (it == tv2dataid.end()) {
+      // Register this TV
+      TaskGraph::DataId new_id = (TaskGraph::DataId)all_data.size();
+      tv2dataid[tv] = new_id;
+
+      // TODO: Pass runtime info so we can use actual sizes here, or at least
+      // use a better estimate
+      TaskGraph::Size size = 256;
+
+      all_data.emplace_back(
+          /*definition=*/std::nullopt,
+          /*uses=*/std::vector<TaskGraph::TaskId>{},
+          /*input_alias=*/std::nullopt,
+          size,
+          /*can_free=*/true);
+      return new_id;
+    } else {
+      return it->second;
+    }
+  };
+
+  std::vector<TaskGraph::Task> all_tasks;
+  all_tasks.reserve(groups.size());
+  for (SegmentedGroup* group : groups) {
+    TaskGraph::TaskId task_id = (TaskGraph::TaskId)all_tasks.size();
+
+    std::vector<TaskGraph::DataId> inputs;
+    // These are fusion inputs, so they are not edges between segments
+    for (Val* v : group->inputs()) {
+      if (auto* tv = dynamic_cast<TensorView*>(v)) {
+        // Ignore scalar inputs
+        TaskGraph::DataId data_id = maybe_register_tv(tv);
+        TaskGraph::Data& data = all_data.at((size_t)data_id);
+        data.uses.push_back(task_id);
+        data.can_free = false;
+        inputs.push_back(data_id);
+      }
+    }
+    // Now look at producer edges i.e. inputs that are intermediates and can
+    // likely be freed
+    for (SegmentedEdge* edge : group->producer_edges) {
+      if (auto* tv = dynamic_cast<TensorView*>(edge->val)) {
+        TaskGraph::DataId data_id = maybe_register_tv(tv);
+        TaskGraph::Data& data = all_data.at((size_t)data_id);
+        data.uses.push_back(task_id);
+        inputs.push_back(data_id);
+      }
+    }
+    // Now look at fusion outputs coming from this task. Like unaliased inputs,
+    // we never free these even after their last use
+    std::vector<TaskGraph::DataId> outputs;
+    for (Val* v : group->outputs()) {
+      if (auto* tv = dynamic_cast<TensorView*>(v)) {
+        // Ignore scalar inputs
+        TaskGraph::DataId data_id = maybe_register_tv(tv);
+        TaskGraph::Data& data = all_data.at((size_t)data_id);
+        data.uses.push_back(task_id);
+        data.can_free = false;
+        inputs.push_back(data_id);
+        if (Val* aliased_input = tv->fusion()->getOutputAlias(tv).aliased_io) {
+          TaskGraph::DataId alias_id = maybe_register_tv(tv);
+          data.input_alias = alias_id;
+        }
+        outputs.push_back(data_id);
+      }
+    }
+    for (SegmentedEdge* edge : group->consumer_edges) {
+      if (auto* tv = dynamic_cast<TensorView*>(edge->val)) {
+        TaskGraph::DataId data_id = maybe_register_tv(tv);
+        TaskGraph::Data& data = all_data.at((size_t)data_id);
+        data.uses.push_back(task_id);
+        outputs.push_back(data_id);
+      }
+    }
+
+    std::vector<TaskGraph::DataId> outputs;
+
+    // TODO: inspect compiled segment executors to determine temp gmem needed
+    TaskGraph::Size temp_space = 0;
+
+    all_tasks.emplace_back(inputs, outputs, temp_space);
+  }
+
+  NVF_ERROR(
+      all_data.size() <= std::numeric_limits<TaskGraph::TaskId>::max(),
+      "There are too many tensors to represent with TaskGraph::DataId");
+
+  TaskGraph graph(all_tasks, all_data);
+
+  TaskGraph::SortResult result = graph.findOptimalOrder();
+
+  std::vector<SegmentedGroup*> order;
+  order.reserve(groups.size());
+  for (const TaskGraph::Step& step : result.steps) {
+    order.push_back(groups.at((size_t)step.task));
+  }
+}
+
 std::vector<SegmentedGroup*> toposort(
     const std::vector<SegmentedGroup*>& groups) {
   std::deque<SegmentedGroup*> to_visit;
diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index ca630265c5e..2b36836c757 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -75,8 +75,8 @@ class TaskSorter {
     sort();
   }
 
-  const std::vector<TaskGraph::Step>& steps() const {
-    return steps_;
+  const TaskGraph::SortResult& result() const {
+    return result_;
   }
 
  private:
@@ -195,7 +195,9 @@ class TaskSorter {
 
     // This is the main optimization loop
     TaskGraph::TaskId backtracked_task_id = -1;
-    for (int64_t _ : arange(max_iters_)) {
+    int64_t iter = 0;
+    while (iter < max_iters_) {
+      iter++;
       NVF_ERROR(
           !ready_tasks_.empty() || steps_.size() == (size_t)graph_.numTasks(),
           "Ran out of ready tasks before completing ordering");
@@ -230,12 +232,13 @@ class TaskSorter {
         best_steps = steps_;
       }
     }
+    result_.iterations = iter;
 
     // Record our best found steps
-    steps_ = best_steps;
+    result_.steps = best_steps;
 
     // Validate final result
-    NVF_ERROR(steps_.size() == graph_.numTasks());
+    NVF_ERROR(result_.steps.size() == graph_.numTasks());
     validate();
   }
 
@@ -243,6 +246,7 @@ class TaskSorter {
   const TaskGraph& graph_;
   bool validate_;
   int64_t max_iters_;
+  TaskGraph::SortResult result_;
   std::vector<TaskGraph::Step> steps_;
 
   //! There is one entry here for each task and indicating how many
@@ -260,11 +264,11 @@ class TaskSorter {
 
 } // namespace
 
-std::vector<TaskGraph::Step> TaskGraph::findOptimalOrder() const {
+TaskGraph::SortResult TaskGraph::findOptimalOrder() const {
   // TODO: Find a reasonable default number of iterations. Note that one
   // iteration equals one task, not one ordering
   TaskSorter sorter(*this, /*validate=*/true, /*max_iters=*/2000);
-  return sorter.steps();
+  return sorter.result();
 }
 
 } // namespace nvfuser
diff --git a/csrc/graph/task_graph.h b/csrc/graph/task_graph.h
index d959d1cd018..766cf6be685 100644
--- a/csrc/graph/task_graph.h
+++ b/csrc/graph/task_graph.h
@@ -103,10 +103,21 @@ class TaskGraph {
   //! corrupt this data. Raises an exception if corruption is detected.
   void validateSteps(const std::vector<Step>& steps) const;
 
+  struct SortResult {
+    std::vector<Step> steps;
+
+    //! Number of iterations computed
+    int64_t iterations;
+
+    //! Whether the search was exhaustive. If not, then it was likely cut off
+    //! early because of an iteration limit.
+    bool exhaustive;
+  };
+
   //! This does an exhaustive search of all possible orderings using a modified
   //! Kahn's algorithm to efficiently traverse the set of possible topological
   //! orderings.
-  std::vector<Step> findOptimalOrder() const;
+  SortResult findOptimalOrder() const;
 
  private:
   std::vector<Task> tasks_;

From da536051684865102ca34fdb92f7892ea6555860 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 13 Aug 2025 11:29:04 -0400
Subject: [PATCH 04/71] Use optimalTopoSort

---
 csrc/fusion_segmenter.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index 97ea4e22eca..1510b3606c3 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -5292,7 +5292,8 @@ RuntimeWorkSpace prepareRuntimeOrder(const SegmentedFusion& segmented_fusion) {
     }
   }
 
-  runtime_workspace.group_run_order = toposort(segmented_fusion.groups());
+  runtime_workspace.group_run_order = optimalTopoSort(segmented_fusion.groups());
+  //runtime_workspace.group_run_order = toposort(segmented_fusion.groups());
 
   return runtime_workspace;
 }

From daeb547d7a08df631e6c33b7daa3e2ff8dc21821 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 13 Aug 2025 11:44:36 -0400
Subject: [PATCH 05/71] Fixes

---
 csrc/fusion_segmenter.cpp | 13 +++++++------
 csrc/graph/task_graph.cpp |  8 ++++++--
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index 1510b3606c3..c3354551663 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -2056,8 +2056,9 @@ std::vector<SegmentedGroup*> optimalTopoSort(
         data.uses.push_back(task_id);
         data.can_free = false;
         inputs.push_back(data_id);
-        if (Val* aliased_input = tv->fusion()->getOutputAlias(tv).aliased_io) {
-          TaskGraph::DataId alias_id = maybe_register_tv(tv);
+        if (auto* aliased_input_tv = dynamic_cast<TensorView*>(
+                tv->fusion()->getOutputAlias(tv).aliased_io)) {
+          TaskGraph::DataId alias_id = maybe_register_tv(aliased_input_tv);
           data.input_alias = alias_id;
         }
         outputs.push_back(data_id);
@@ -2072,8 +2073,6 @@ std::vector<SegmentedGroup*> optimalTopoSort(
       }
     }
 
-    std::vector<TaskGraph::DataId> outputs;
-
     // TODO: inspect compiled segment executors to determine temp gmem needed
     TaskGraph::Size temp_space = 0;
 
@@ -2093,6 +2092,7 @@ std::vector<SegmentedGroup*> optimalTopoSort(
   for (const TaskGraph::Step& step : result.steps) {
     order.push_back(groups.at((size_t)step.task));
   }
+  return order;
 }
 
 std::vector<SegmentedGroup*> toposort(
@@ -5292,8 +5292,9 @@ RuntimeWorkSpace prepareRuntimeOrder(const SegmentedFusion& segmented_fusion) {
     }
   }
 
-  runtime_workspace.group_run_order = optimalTopoSort(segmented_fusion.groups());
-  //runtime_workspace.group_run_order = toposort(segmented_fusion.groups());
+  runtime_workspace.group_run_order =
+      optimalTopoSort(segmented_fusion.groups());
+  // runtime_workspace.group_run_order = toposort(segmented_fusion.groups());
 
   return runtime_workspace;
 }
diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index 2b36836c757..78ad3605e9a 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -99,6 +99,10 @@ class TaskSorter {
       high_water_mark = steps_.back().high_water_mark;
     }
 
+    NVF_ERROR(
+        ready_tasks_.erase(task_id) == 1,
+        "Attempted to advance to task that was not marked ready");
+
     // Compute the new allocated amount and high water mark for this step
     const TaskGraph::Task& task = graph_.getTask(task_id);
 
@@ -144,7 +148,7 @@ class TaskSorter {
     const TaskGraph::Task& last_task = graph_.getTask(last_task_id);
     steps_.pop_back();
 
-    ready_tasks_.erase(last_task_id);
+    ready_tasks_.insert(last_task_id);
 
     // Update outstanding_dependencies to reflect that the outputs of last_task
     // are no longer available
@@ -238,7 +242,7 @@ class TaskSorter {
     result_.steps = best_steps;
 
     // Validate final result
-    NVF_ERROR(result_.steps.size() == graph_.numTasks());
+    NVF_ERROR(result_.steps.size() == (size_t)graph_.numTasks());
     validate();
   }
 

From 8076f0a74eb2124418442faf1d987d3d4b2a78ff Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 13 Aug 2025 12:02:21 -0400
Subject: [PATCH 06/71] Add missing exhaustive check

---
 csrc/graph/task_graph.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index 78ad3605e9a..7dd6451cabf 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -217,6 +217,13 @@ class TaskSorter {
       if (next_task_id == -1) {
         // There are no ready tasks with ID above the backtracked_task_id. This
         // means it is time to backtrack
+
+        if (steps_.empty()) {
+          // If there is nowhere to backtrack it means we are done with the
+          // search
+          result_.exhaustive = true;
+          break;
+        }
         backtracked_task_id = backtrack();
         continue;
       }

From b29ad5e058f6c2c76b33dc8b614d14058585e3f8 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 13 Aug 2025 13:33:04 -0400
Subject: [PATCH 07/71] Fix definition of graph. Now have error in validation

---
 csrc/fusion_segmenter.cpp | 36 +++++++-------------
 csrc/graph/task_graph.cpp | 69 +++++++++++++++++++++++++++++++++++++--
 csrc/graph/task_graph.h   | 39 ++++++++++++++++++++--
 3 files changed, 115 insertions(+), 29 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index c3354551663..53090d8e206 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -2024,51 +2024,33 @@ std::vector<SegmentedGroup*> optimalTopoSort(
     TaskGraph::TaskId task_id = (TaskGraph::TaskId)all_tasks.size();
 
     std::vector<TaskGraph::DataId> inputs;
+    std::cout << "Task " << task_id
+              << " is segmented groupId=" << group->groupId() << std::endl;
     // These are fusion inputs, so they are not edges between segments
     for (Val* v : group->inputs()) {
       if (auto* tv = dynamic_cast<TensorView*>(v)) {
+        std::cout << "  Group input " << tv->toString() << std::endl;
         // Ignore scalar inputs
         TaskGraph::DataId data_id = maybe_register_tv(tv);
         TaskGraph::Data& data = all_data.at((size_t)data_id);
         data.uses.push_back(task_id);
-        data.can_free = false;
+        data.can_free = !tv->isFusionInput();
         inputs.push_back(data_id);
       }
     }
-    // Now look at producer edges i.e. inputs that are intermediates and can
-    // likely be freed
-    for (SegmentedEdge* edge : group->producer_edges) {
-      if (auto* tv = dynamic_cast<TensorView*>(edge->val)) {
-        TaskGraph::DataId data_id = maybe_register_tv(tv);
-        TaskGraph::Data& data = all_data.at((size_t)data_id);
-        data.uses.push_back(task_id);
-        inputs.push_back(data_id);
-      }
-    }
-    // Now look at fusion outputs coming from this task. Like unaliased inputs,
-    // we never free these even after their last use
     std::vector<TaskGraph::DataId> outputs;
     for (Val* v : group->outputs()) {
       if (auto* tv = dynamic_cast<TensorView*>(v)) {
-        // Ignore scalar inputs
+        std::cout << "  Group output " << tv->toString() << std::endl;
         TaskGraph::DataId data_id = maybe_register_tv(tv);
         TaskGraph::Data& data = all_data.at((size_t)data_id);
-        data.uses.push_back(task_id);
-        data.can_free = false;
-        inputs.push_back(data_id);
+        data.definition = task_id;
         if (auto* aliased_input_tv = dynamic_cast<TensorView*>(
                 tv->fusion()->getOutputAlias(tv).aliased_io)) {
           TaskGraph::DataId alias_id = maybe_register_tv(aliased_input_tv);
           data.input_alias = alias_id;
         }
-        outputs.push_back(data_id);
-      }
-    }
-    for (SegmentedEdge* edge : group->consumer_edges) {
-      if (auto* tv = dynamic_cast<TensorView*>(edge->val)) {
-        TaskGraph::DataId data_id = maybe_register_tv(tv);
-        TaskGraph::Data& data = all_data.at((size_t)data_id);
-        data.uses.push_back(task_id);
+        data.can_free = !tv->isFusionOutput();
         outputs.push_back(data_id);
       }
     }
@@ -2085,8 +2067,12 @@ std::vector<SegmentedGroup*> optimalTopoSort(
 
   TaskGraph graph(all_tasks, all_data);
 
+  std::cout << graph << std::endl;
+
   TaskGraph::SortResult result = graph.findOptimalOrder();
 
+  std::cout << result << std::endl;
+
   std::vector<SegmentedGroup*> order;
   order.reserve(groups.size());
   for (const TaskGraph::Step& step : result.steps) {
diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index 7dd6451cabf..ffa982892de 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -10,6 +10,8 @@
 #include <utils.h>
 
 #include <set>
+#include <sstream>
+#include <string>
 
 namespace nvfuser {
 
@@ -41,7 +43,9 @@ void TaskGraph::validateSteps(const std::vector<Step>& steps) const {
 
     // This is the most space we will use, so update high water mark here
     high_water_mark = std::max(high_water_mark, allocated);
-    NVF_ERROR(step.high_water_mark == high_water_mark);
+    NVF_ERROR(
+        step.high_water_mark == high_water_mark,
+        "Mismatch in high water mark during validation");
 
     // reduce use count for inputs and free them if possible
     for (const DataId input_id : task.inputs) {
@@ -56,7 +60,8 @@ void TaskGraph::validateSteps(const std::vector<Step>& steps) const {
 
     // step.allocated indicates how much space is allocated _upon completion_ of
     // this step
-    NVF_ERROR(step.allocated == allocated);
+    NVF_ERROR(
+        step.allocated == allocated, "Mismatch in allocated during validation");
   }
 }
 
@@ -275,6 +280,66 @@ class TaskSorter {
 
 } // namespace
 
+std::string TaskGraph::Task::toString() const {
+  std::stringstream ss;
+  ss << "Task{";
+  ss << "input ids={" << inputs << "}";
+  ss << ", output ids={" << outputs << "}";
+  ss << ", temp space=" << temp_space;
+  ss << "}";
+  return ss.str();
+}
+
+std::string TaskGraph::Data::toString() const {
+  std::stringstream ss;
+  ss << "Data{";
+  ss << "definition="
+     << (definition.has_value() ? std::to_string(definition.value()) : "none");
+  ss << ", uses={" << uses << "}";
+  ss << ", size=" << size;
+  ss << ", input alias="
+     << (input_alias.has_value() ? std::to_string(input_alias.value())
+                                 : "none");
+  ss << ", can_free=" << (can_free ? "yes" : "no");
+  ss << "}";
+  return ss.str();
+}
+
+std::string TaskGraph::Step::toString() const {
+  std::stringstream ss;
+  ss << "Step{";
+  ss << "task id=" << task;
+  ss << ", allocated=" << allocated;
+  ss << ", high water mark=" << high_water_mark;
+  ss << "}";
+  return ss.str();
+}
+
+std::string TaskGraph::SortResult::toString() const {
+  std::stringstream ss;
+  ss << "SortResult{";
+  ss << "steps={" << steps << "}";
+  ss << ", iterations=" << iterations;
+  ss << ", exhaustive=" << (exhaustive ? "yes" : "no");
+  ss << "}";
+  return ss.str();
+}
+
+std::string TaskGraph::toString() const {
+  std::stringstream ss;
+  ss << "TaskGraph{\n";
+  ss << "  data:\n";
+  for (DataId i : arange(numData())) {
+    ss << "    " << i << " = " << getData(i) << "\n";
+  }
+  ss << "  tasks:\n";
+  for (TaskId j : arange(numTasks())) {
+    ss << "    " << j << " = " << getTask(j) << "\n";
+  }
+  ss << "}";
+  return ss.str();
+}
+
 TaskGraph::SortResult TaskGraph::findOptimalOrder() const {
   // TODO: Find a reasonable default number of iterations. Note that one
   // iteration equals one task, not one ordering
diff --git a/csrc/graph/task_graph.h b/csrc/graph/task_graph.h
index 766cf6be685..67fe86e96fb 100644
--- a/csrc/graph/task_graph.h
+++ b/csrc/graph/task_graph.h
@@ -9,6 +9,8 @@
 
 #include <cstdint>
 #include <optional>
+#include <ostream>
+#include <string>
 #include <vector>
 
 namespace nvfuser {
@@ -27,6 +29,8 @@ class TaskGraph {
     //! This amount of temporary space is required only while executing the Task
     //! and is immediately freed afterward
     Size temp_space = 0;
+
+    std::string toString() const;
   };
 
   struct Data {
@@ -44,6 +48,8 @@ class TaskGraph {
     //! freed (with the exception of an aliased input), while any intermediate
     //! tensors should be freed as soon as possible.
     bool can_free = true;
+
+    std::string toString() const;
   };
 
   TaskGraph(const std::vector<Task>& tasks, const std::vector<Data>& data)
@@ -57,7 +63,7 @@ class TaskGraph {
     }
     num_uses_.reserve(data_.size());
     for (const Data& data : data_) {
-      num_dependencies_.push_back((TaskId)data.uses.size());
+      num_uses_.push_back((TaskId)data.uses.size());
       if (!data.definition.has_value()) {
         initial_allocation_ += (Size)data.size;
       }
@@ -76,6 +82,8 @@ class TaskGraph {
 
     //! This is the maximum active space used until this step is completed.
     Size high_water_mark;
+
+    std::string toString() const;
   };
 
   TaskId numTasks() const {
@@ -112,6 +120,8 @@ class TaskGraph {
     //! Whether the search was exhaustive. If not, then it was likely cut off
     //! early because of an iteration limit.
     bool exhaustive;
+
+    std::string toString() const;
   };
 
   //! This does an exhaustive search of all possible orderings using a modified
@@ -119,15 +129,40 @@ class TaskGraph {
   //! orderings.
   SortResult findOptimalOrder() const;
 
+  std::string toString() const;
+
  private:
   std::vector<Task> tasks_;
   std::vector<Data> data_;
 
   //! How much data is allocated by data that has no definition, i.e. input data
-  Size initial_allocation_;
+  Size initial_allocation_ = 0;
 
   std::vector<TaskId> num_uses_;
   std::vector<DataId> num_dependencies_;
 };
 
+inline std::ostream& operator<<(std::ostream& os, const TaskGraph::Task& task) {
+  os << task.toString();
+  return os;
+}
+inline std::ostream& operator<<(std::ostream& os, const TaskGraph::Data& data) {
+  os << data.toString();
+  return os;
+}
+inline std::ostream& operator<<(std::ostream& os, const TaskGraph& graph) {
+  os << graph.toString();
+  return os;
+}
+inline std::ostream& operator<<(std::ostream& os, const TaskGraph::Step& step) {
+  os << step.toString();
+  return os;
+}
+inline std::ostream& operator<<(
+    std::ostream& os,
+    const TaskGraph::SortResult& result) {
+  os << result.toString();
+  return os;
+}
+
 } // namespace nvfuser

From 6488c249ada8198f84b524fb2bc1499f8c32252e Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 13 Aug 2025 13:54:06 -0400
Subject: [PATCH 08/71] Fix error in validation. Working

---
 csrc/fusion_segmenter.cpp |  2 +-
 csrc/graph/task_graph.cpp | 19 ++++++++++++++++++-
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index 53090d8e206..e9e54fc71de 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -2004,7 +2004,7 @@ std::vector<SegmentedGroup*> optimalTopoSort(
 
       // TODO: Pass runtime info so we can use actual sizes here, or at least
       // use a better estimate
-      TaskGraph::Size size = 256;
+      TaskGraph::Size size = 1;
 
       all_data.emplace_back(
           /*definition=*/std::nullopt,
diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index ffa982892de..04b093d2826 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -25,38 +25,55 @@ void TaskGraph::validateSteps(const std::vector<Step>& steps) const {
   std::vector<TaskId> future_uses = num_uses_;
   std::vector<DataId> outstanding_dependencies = num_dependencies_;
 
+  std::cout << "Validating " << steps << std::endl;
+  std::cout << "    allocated=" << allocated << std::endl;
+
   // Now we are ready to process steps
   for (const Step& step : steps) {
     const Task& task = getTask(step.task);
+    std::cout << "  " << step << "  " << task << std::endl;
 
     // Allocate outputs
     for (const DataId output_id : task.outputs) {
       const Data& data = getData(output_id);
       if (!data.input_alias.has_value()) {
         // Don't allocate outputs if they are reusing input memory
+        std::cout << "    adding " << data.size << " to allocated for output "
+                  << output_id << ": " << data << std::endl;
         allocated += data.size;
       }
     }
 
     // Add temporary space
+    std::cout << "    adding " << task.temp_space
+              << " to allocated for temp space " << std::endl;
     allocated += task.temp_space;
 
     // This is the most space we will use, so update high water mark here
     high_water_mark = std::max(high_water_mark, allocated);
+    std::cout << "    high water mark is " << high_water_mark << std::endl;
     NVF_ERROR(
         step.high_water_mark == high_water_mark,
         "Mismatch in high water mark during validation");
 
     // reduce use count for inputs and free them if possible
     for (const DataId input_id : task.inputs) {
-      if (--future_uses.at((size_t)input_id)) {
+      std::cout << "    predecrement future uses="
+                << future_uses.at((size_t)input_id) << " for input id "
+                << input_id << std::endl;
+      if (--future_uses.at((size_t)input_id) == 0) {
         // There are no more uses for this Data, so free it if we're allowed to
         const Data& data = getData(input_id);
+        std::cout << "    input with no future uses: " << data << std::endl;
         if (data.can_free) {
+          std::cout << "    subtracting " << data.size
+                    << " from allocated for input " << input_id << ": " << data
+                    << std::endl;
           allocated -= data.size;
         }
       }
     }
+    std::cout << "    allocated=" << allocated << std::endl;
 
     // step.allocated indicates how much space is allocated _upon completion_ of
     // this step

From e7cf966863b43ae0d659b5634b239be75dd5fa25 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 13 Aug 2025 14:01:46 -0400
Subject: [PATCH 09/71] Add repro to test_repro.py

---
 tests/python/test_repro.py | 72 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/tests/python/test_repro.py b/tests/python/test_repro.py
index e6ce55564a9..00e566e55e6 100644
--- a/tests/python/test_repro.py
+++ b/tests/python/test_repro.py
@@ -1912,3 +1912,75 @@ def nvfuser_fusion_id10(fd: FusionDefinition) -> None:
             torch.testing.make_tensor((), dtype=torch.float32, device="cpu"),
         ]
         fd.execute(inputs)
+
+    # https://github.com/NVIDIA/Fuser/issues/3290
+    def test_execution_order(self):
+        import gc
+
+        N_PARALLEL_PATHS = 10
+
+        with FusionDefinition() as fd:
+            T0s = [
+                fd.define_tensor(
+                    shape=[256, 256],
+                    contiguity=[True, True],
+                    dtype=DataType.Float,
+                    is_cpu=False,
+                    stride_order=[1, 0],
+                )
+                for _ in range(N_PARALLEL_PATHS)
+            ]
+            a = fd.define_tensor(
+                shape=[256, 256],
+                contiguity=[True, True],
+                dtype=DataType.Float,
+                is_cpu=False,
+                stride_order=[1, 0],
+            )
+            for T0 in T0s:
+                T1 = fd.ops.relu(T0)
+                T2 = fd.ops.matmul(T1, T1)
+                T3 = fd.ops.relu(T2)
+                a = fd.ops.matmul(T3, a)
+            fd.add_output(a)
+
+        t0s = [
+            torch.randn(256, 256, device="cuda") for _ in range(N_PARALLEL_PATHS)
+        ]  # 0.25 MiB * N_PARALLEL_PATHS
+        a = torch.randn(256, 256, device="cuda")  # 0.25 MiB
+
+        # Record peak memory usage
+        fd.execute([*t0s, a])
+        gc.collect(0)
+        torch.cuda.empty_cache()
+        torch.cuda.reset_peak_memory_stats()
+        before_in_MiB = torch.cuda.max_memory_allocated() / (1024 * 1024)
+        fd.execute([*t0s, a])
+        nvf_max_allocated_in_MiB = (
+            torch.cuda.max_memory_allocated() / (1024 * 1024) - before_in_MiB
+        )
+
+        def eager_func(t0s, a):
+            for t0 in t0s:
+                t1 = torch.nn.functional.relu(t0)
+                del t0
+                t2 = torch.matmul(t1, t1)
+                del t1
+                t3 = torch.nn.functional.relu(t2)
+                del t2
+                a = torch.matmul(t3, a)
+                del t3
+            return a
+
+        # Record peak memory usage
+        eager_func(t0s, a)
+        gc.collect(0)
+        torch.cuda.empty_cache()
+        torch.cuda.reset_peak_memory_stats()
+        before_in_MiB = torch.cuda.max_memory_allocated() / (1024 * 1024)
+        eager_func(t0s, a)
+        eager_max_allocated_in_MiB = (
+            torch.cuda.max_memory_allocated() / (1024 * 1024) - before_in_MiB
+        )
+
+        assert nvf_max_allocated_in_MiB == eager_max_allocated_in_MiB

From 589653f7453a73cef8692430f9026ff4f82ad6d0 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 13 Aug 2025 14:59:18 -0400
Subject: [PATCH 10/71] Check that ordering is topological

---
 csrc/fusion_segmenter.cpp |  6 +++--
 csrc/graph/task_graph.cpp | 57 ++++++++++++++++++++++++++-------------
 csrc/graph/task_graph.h   | 18 +------------
 3 files changed, 44 insertions(+), 37 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index e9e54fc71de..99f837ea49b 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -2029,19 +2029,19 @@ std::vector<SegmentedGroup*> optimalTopoSort(
     // These are fusion inputs, so they are not edges between segments
     for (Val* v : group->inputs()) {
       if (auto* tv = dynamic_cast<TensorView*>(v)) {
-        std::cout << "  Group input " << tv->toString() << std::endl;
         // Ignore scalar inputs
         TaskGraph::DataId data_id = maybe_register_tv(tv);
         TaskGraph::Data& data = all_data.at((size_t)data_id);
         data.uses.push_back(task_id);
         data.can_free = !tv->isFusionInput();
         inputs.push_back(data_id);
+        std::cout << "  Group input " << data_id << " = " << tv->toString()
+                  << std::endl;
       }
     }
     std::vector<TaskGraph::DataId> outputs;
     for (Val* v : group->outputs()) {
       if (auto* tv = dynamic_cast<TensorView*>(v)) {
-        std::cout << "  Group output " << tv->toString() << std::endl;
         TaskGraph::DataId data_id = maybe_register_tv(tv);
         TaskGraph::Data& data = all_data.at((size_t)data_id);
         data.definition = task_id;
@@ -2052,6 +2052,8 @@ std::vector<SegmentedGroup*> optimalTopoSort(
         }
         data.can_free = !tv->isFusionOutput();
         outputs.push_back(data_id);
+        std::cout << "  Group output " << data_id << " = " << tv->toString()
+                  << std::endl;
       }
     }
 
diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index 04b093d2826..ba2d3cf7901 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -15,6 +15,31 @@
 
 namespace nvfuser {
 
+TaskGraph::TaskGraph(
+    const std::vector<Task>& tasks,
+    const std::vector<Data>& data)
+    : tasks_(tasks), data_(data) {
+  // Initialize the counts of future uses of data and unmet dependencies of
+  // tasks. These are the out-degrees of Data and in-degrees of Tasks,
+  // respectively.
+  num_dependencies_.reserve(tasks_.size());
+  for (const Task& task : tasks_) {
+    // Only count task inputs that must still be produced by another task (i.e.
+    // they have a definition); inputs without one are already available
+    num_dependencies_.push_back((DataId)std::count_if(
+        task.inputs.begin(), task.inputs.end(), [&](DataId data_id) {
+          return getData(data_id).definition.has_value();
+        }));
+  }
+  num_uses_.reserve(data_.size());
+  for (const Data& data : data_) {
+    num_uses_.push_back((TaskId)data.uses.size());
+    if (!data.definition.has_value()) {
+      initial_allocation_ += (Size)data.size;
+    }
+  }
+}
+
 void TaskGraph::validateSteps(const std::vector<Step>& steps) const {
   // First find any Data in the graph that has no definition. This must be
   // preallocated before running the program, so we initialize allocated and
@@ -25,55 +50,51 @@ void TaskGraph::validateSteps(const std::vector<Step>& steps) const {
   std::vector<TaskId> future_uses = num_uses_;
   std::vector<DataId> outstanding_dependencies = num_dependencies_;
 
-  std::cout << "Validating " << steps << std::endl;
-  std::cout << "    allocated=" << allocated << std::endl;
-
   // Now we are ready to process steps
   for (const Step& step : steps) {
+    NVF_ERROR(
+        outstanding_dependencies.at((size_t)step.task) == 0,
+        "Invalid ordering found: task id ",
+        step.task,
+        " is executed before all its dependencies are available");
+
     const Task& task = getTask(step.task);
-    std::cout << "  " << step << "  " << task << std::endl;
 
     // Allocate outputs
     for (const DataId output_id : task.outputs) {
       const Data& data = getData(output_id);
       if (!data.input_alias.has_value()) {
         // Don't allocate outputs if they are reusing input memory
-        std::cout << "    adding " << data.size << " to allocated for output "
-                  << output_id << ": " << data << std::endl;
         allocated += data.size;
       }
     }
 
     // Add temporary space
-    std::cout << "    adding " << task.temp_space
-              << " to allocated for temp space " << std::endl;
     allocated += task.temp_space;
 
     // This is the most space we will use, so update high water mark here
     high_water_mark = std::max(high_water_mark, allocated);
-    std::cout << "    high water mark is " << high_water_mark << std::endl;
     NVF_ERROR(
         step.high_water_mark == high_water_mark,
         "Mismatch in high water mark during validation");
 
     // reduce use count for inputs and free them if possible
     for (const DataId input_id : task.inputs) {
-      std::cout << "    predecrement future uses="
-                << future_uses.at((size_t)input_id) << " for input id "
-                << input_id << std::endl;
       if (--future_uses.at((size_t)input_id) == 0) {
         // There are no more uses for this Data, so free it if we're allowed to
         const Data& data = getData(input_id);
-        std::cout << "    input with no future uses: " << data << std::endl;
         if (data.can_free) {
-          std::cout << "    subtracting " << data.size
-                    << " from allocated for input " << input_id << ": " << data
-                    << std::endl;
           allocated -= data.size;
         }
       }
     }
-    std::cout << "    allocated=" << allocated << std::endl;
+
+    for (const DataId output_id : task.outputs) {
+      const Data& data = getData(output_id);
+      for (const TaskId use_id : data.uses) {
+        --outstanding_dependencies.at((size_t)use_id);
+      }
+    }
 
     // step.allocated indicates how much space is allocated _upon completion_ of
     // this step
@@ -314,7 +335,7 @@ std::string TaskGraph::Data::toString() const {
      << (definition.has_value() ? std::to_string(definition.value()) : "none");
   ss << ", uses={" << uses << "}";
   ss << ", size=" << size;
-  ss << ", input alias="
+  ss << ", alias="
      << (input_alias.has_value() ? std::to_string(input_alias.value())
                                  : "none");
   ss << ", can_free=" << (can_free ? "yes" : "no");
diff --git a/csrc/graph/task_graph.h b/csrc/graph/task_graph.h
index 67fe86e96fb..fd189eeb5e4 100644
--- a/csrc/graph/task_graph.h
+++ b/csrc/graph/task_graph.h
@@ -52,23 +52,7 @@ class TaskGraph {
     std::string toString() const;
   };
 
-  TaskGraph(const std::vector<Task>& tasks, const std::vector<Data>& data)
-      : tasks_(tasks), data_(data) {
-    // Initialize the counts of future uses of data and unmet dependencies of
-    // tasks. These are the out-degrees of Data and in-degrees of Tasks,
-    // respectively.
-    num_dependencies_.reserve(tasks_.size());
-    for (const Task& task : tasks_) {
-      num_dependencies_.push_back((DataId)task.inputs.size());
-    }
-    num_uses_.reserve(data_.size());
-    for (const Data& data : data_) {
-      num_uses_.push_back((TaskId)data.uses.size());
-      if (!data.definition.has_value()) {
-        initial_allocation_ += (Size)data.size;
-      }
-    }
-  }
+  TaskGraph(const std::vector<Task>& tasks, const std::vector<Data>& data);
 
   //! This represents the execution of a single Task in a given ordering. It
   //! tracks some cumulative state representing the amount of space required up

From 9e7c2ba4c2976e50c15ea4314ba899306153a19d Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 13 Aug 2025 15:08:15 -0400
Subject: [PATCH 11/71] Erase uses from ready_tasks_ when backtracking

---
 csrc/graph/task_graph.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index ba2d3cf7901..e9392ae72b6 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -198,7 +198,10 @@ class TaskSorter {
     for (const TaskGraph::DataId& output_id : last_task.outputs) {
       const TaskGraph::Data& output = graph_.getData(output_id);
       for (const TaskGraph::TaskId use_id : output.uses) {
-        outstanding_dependencies_.at((size_t)use_id)++;
+        if (outstanding_dependencies_.at((size_t)use_id)++ == 0) {
+          // This task _was_ ready but now it is not
+          ready_tasks_.erase((size_t)use_id);
+        }
       }
     }
 

From 647fdb5e9fb9648d78c97aa39eee76372657cc26 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 13 Aug 2025 16:57:28 -0400
Subject: [PATCH 12/71] Refactor conversion into
 SegmentedGroupTaskGraphConverter

---
 csrc/fusion_segmenter.cpp | 100 ++++++++++++++++++++++----------------
 csrc/graph/task_graph.cpp |   8 +++
 2 files changed, 65 insertions(+), 43 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index 99f837ea49b..c1b26a67b75 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -1986,42 +1986,23 @@ bool SegmentCandidateFinder::hasSegmentHints(Fusion* fusion) {
 
 namespace {
 
-std::vector<SegmentedGroup*> optimalTopoSort(
-    const std::vector<SegmentedGroup*>& groups) {
-  NVF_ERROR(
-      groups.size() <= std::numeric_limits<TaskGraph::TaskId>::max(),
-      "There are too many tasks to represent with TaskGraph::TaskId");
-
-  std::vector<TaskGraph::Data> all_data;
-  std::unordered_map<TensorView*, TaskGraph::DataId> tv2dataid;
-
-  const auto maybe_register_tv = [&](TensorView* tv) -> TaskGraph::DataId {
-    auto it = tv2dataid.find(tv);
-    if (it == tv2dataid.end()) {
-      // Register this TV
-      TaskGraph::DataId new_id = (TaskGraph::DataId)all_data.size();
-      tv2dataid[tv] = new_id;
-
-      // TODO: Pass runtime info so we can use actual sizes here, or at least
-      // use a better estimate
-      TaskGraph::Size size = 1;
+class SegmentedGroupTaskGraphConverter {
+ public:
+  TaskGraph run(const std::vector<SegmentedGroup*>& groups) {
+    NVF_ERROR(
+        groups.size() <= std::numeric_limits<TaskGraph::TaskId>::max(),
+        "There are too many tasks to represent with TaskGraph::TaskId");
 
-      all_data.emplace_back(
-          /*definition=*/std::nullopt,
-          /*uses=*/std::vector<TaskGraph::TaskId>{},
-          /*input_alias=*/std::nullopt,
-          size,
-          /*can_free=*/true);
-      return new_id;
-    } else {
-      return it->second;
+    for (SegmentedGroup* group : groups) {
+      processGroup(group);
     }
-  };
 
-  std::vector<TaskGraph::Task> all_tasks;
-  all_tasks.reserve(groups.size());
-  for (SegmentedGroup* group : groups) {
-    TaskGraph::TaskId task_id = (TaskGraph::TaskId)all_tasks.size();
+    return TaskGraph(all_tasks_, all_data_);
+  }
+
+ private:
+  void processGroup(SegmentedGroup* group) {
+    TaskGraph::TaskId task_id = (TaskGraph::TaskId)all_tasks_.size();
 
     std::vector<TaskGraph::DataId> inputs;
     std::cout << "Task " << task_id
@@ -2030,8 +2011,8 @@ std::vector<SegmentedGroup*> optimalTopoSort(
     for (Val* v : group->inputs()) {
       if (auto* tv = dynamic_cast<TensorView*>(v)) {
         // Ignore scalar inputs
-        TaskGraph::DataId data_id = maybe_register_tv(tv);
-        TaskGraph::Data& data = all_data.at((size_t)data_id);
+        TaskGraph::DataId data_id = maybeRegisterTv(tv);
+        TaskGraph::Data& data = all_data_.at((size_t)data_id);
         data.uses.push_back(task_id);
         data.can_free = !tv->isFusionInput();
         inputs.push_back(data_id);
@@ -2042,12 +2023,12 @@ std::vector<SegmentedGroup*> optimalTopoSort(
     std::vector<TaskGraph::DataId> outputs;
     for (Val* v : group->outputs()) {
       if (auto* tv = dynamic_cast<TensorView*>(v)) {
-        TaskGraph::DataId data_id = maybe_register_tv(tv);
-        TaskGraph::Data& data = all_data.at((size_t)data_id);
+        TaskGraph::DataId data_id = maybeRegisterTv(tv);
+        TaskGraph::Data& data = all_data_.at((size_t)data_id);
         data.definition = task_id;
         if (auto* aliased_input_tv = dynamic_cast<TensorView*>(
                 tv->fusion()->getOutputAlias(tv).aliased_io)) {
-          TaskGraph::DataId alias_id = maybe_register_tv(aliased_input_tv);
+          TaskGraph::DataId alias_id = maybeRegisterTv(aliased_input_tv);
           data.input_alias = alias_id;
         }
         data.can_free = !tv->isFusionOutput();
@@ -2060,14 +2041,47 @@ std::vector<SegmentedGroup*> optimalTopoSort(
     // TODO: inspect compiled segment executors to determine temp gmem needed
     TaskGraph::Size temp_space = 0;
 
-    all_tasks.emplace_back(inputs, outputs, temp_space);
+    all_tasks_.emplace_back(inputs, outputs, temp_space);
   }
 
-  NVF_ERROR(
-      all_data.size() <= std::numeric_limits<TaskGraph::TaskId>::max(),
-      "There are too many tensors to represent with TaskGraph::DataId");
+  TaskGraph::DataId maybeRegisterTv(TensorView* tv) {
+    auto it = tv2dataid_.find(tv);
+    if (it == tv2dataid_.end()) {
+      // Register this TV
+      TaskGraph::DataId new_id = (TaskGraph::DataId)all_data_.size();
+      tv2dataid_[tv] = new_id;
+
+      // TODO: Pass runtime info so we can use actual sizes here, or at least
+      // use a better estimate
+      TaskGraph::Size size = 1;
+
+      all_data_.emplace_back(
+          /*definition=*/std::nullopt,
+          /*uses=*/std::vector<TaskGraph::TaskId>{},
+          /*input_alias=*/std::nullopt,
+          size,
+          /*can_free=*/true);
+      return new_id;
+    } else {
+      return it->second;
+    }
+  }
+
+ private:
+  std::vector<TaskGraph::Data> all_data_;
+  std::unordered_map<TensorView*, TaskGraph::DataId> tv2dataid_;
+  std::vector<TaskGraph::Task> all_tasks_;
+};
+
+std::vector<SegmentedGroup*> optimalTopoSort(
+    const std::vector<SegmentedGroup*>& groups) {
+  if (groups.size() == 1) {
+    // Skip setting up the graph and doing the whole analysis when there's just
+    // a single group
+    return groups;
+  }
 
-  TaskGraph graph(all_tasks, all_data);
+  TaskGraph graph = SegmentedGroupTaskGraphConverter().run(groups);
 
   std::cout << graph << std::endl;
 
diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index e9392ae72b6..7383eaebab1 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -9,6 +9,7 @@
 #include <graph/task_graph.h>
 #include <utils.h>
 
+#include <numeric_limits>
 #include <set>
 #include <sstream>
 #include <string>
@@ -19,6 +20,13 @@ TaskGraph::TaskGraph(
     const std::vector<Task>& tasks,
     const std::vector<Data>& data)
     : tasks_(tasks), data_(data) {
+  NVF_ERROR(
+      tasks.size() <= std::numeric_limits<TaskGraph::TaskId>::max(),
+      "There are too many tasks to represent with TaskGraph::TaskId");
+  NVF_ERROR(
+      data.size() <= std::numeric_limits<TaskGraph::DataId>::max(),
+      "There are too many data objects to represent with TaskGraph::DataId");
+
   // Initialize the counts of future uses of data and unmet dependencies of
   // tasks. These are the out-degrees of Data and in-degrees of Tasks,
   // respectively.

From 1d9b750257ade37b2f7914130315e7d3b9827413 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 13 Aug 2025 18:10:12 -0400
Subject: [PATCH 13/71] Avoid errors with aliasing

---
 csrc/fusion_segmenter.cpp | 34 ++++++++++++++++++++++++----------
 csrc/graph/task_graph.cpp |  2 +-
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index c1b26a67b75..9a79fbe90d1 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -1988,22 +1988,31 @@ namespace {
 
 class SegmentedGroupTaskGraphConverter {
  public:
-  TaskGraph run(const std::vector<SegmentedGroup*>& groups) {
-    NVF_ERROR(
-        groups.size() <= std::numeric_limits<TaskGraph::TaskId>::max(),
-        "There are too many tasks to represent with TaskGraph::TaskId");
-
+  static TaskGraph convert(const std::vector<SegmentedGroup*>& groups) {
+    SegmentedGroupTaskGraphConverter conv;
     for (SegmentedGroup* group : groups) {
-      processGroup(group);
+      conv.processGroup(group);
     }
-
-    return TaskGraph(all_tasks_, all_data_);
+    return TaskGraph(conv.all_tasks_, conv.all_data_);
   }
 
  private:
   void processGroup(SegmentedGroup* group) {
     TaskGraph::TaskId task_id = (TaskGraph::TaskId)all_tasks_.size();
 
+    // When there are aliased inputs, they will appear as _outputs_ of the
+    // SegmentedGroup. To avoid actually adding those as outputs, we record them
+    // here first
+    std::unordered_set<TensorView*> aliased_input_tvs;
+    for (Val* v : group->outputs()) {
+      if (auto* tv = dynamic_cast<TensorView*>(v)) {
+        if (auto* aliased_input_tv = dynamic_cast<TensorView*>(
+                tv->fusion()->getOutputAlias(tv).aliased_io)) {
+          aliased_input_tvs.insert(aliased_input_tv);
+        }
+      }
+    }
+
     std::vector<TaskGraph::DataId> inputs;
     std::cout << "Task " << task_id
               << " is segmented groupId=" << group->groupId() << std::endl;
@@ -2023,6 +2032,11 @@ class SegmentedGroupTaskGraphConverter {
     std::vector<TaskGraph::DataId> outputs;
     for (Val* v : group->outputs()) {
       if (auto* tv = dynamic_cast<TensorView*>(v)) {
+        if (aliased_input_tvs.count(tv)) {
+          // These are counted as outputs but are actually _inputs_ to this
+          // group
+          continue;
+        }
         TaskGraph::DataId data_id = maybeRegisterTv(tv);
         TaskGraph::Data& data = all_data_.at((size_t)data_id);
         data.definition = task_id;
@@ -2078,10 +2092,10 @@ std::vector<SegmentedGroup*> optimalTopoSort(
   if (groups.size() == 1) {
     // Skip setting up the graph and doing the whole analysis when there's just
     // a single group
-    return groups;
+    return {groups.front()};
   }
 
-  TaskGraph graph = SegmentedGroupTaskGraphConverter().run(groups);
+  TaskGraph graph = SegmentedGroupTaskGraphConverter::convert(groups);
 
   std::cout << graph << std::endl;
 
diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index 7383eaebab1..5a1ebe5ea7e 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -9,7 +9,7 @@
 #include <graph/task_graph.h>
 #include <utils.h>
 
-#include <numeric_limits>
+#include <limits>
 #include <set>
 #include <sstream>
 #include <string>

From 29b684cb63f6b428468c4157386a4e48a1061075 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 13 Aug 2025 18:18:29 -0400
Subject: [PATCH 14/71] Add printout with timing of topo sorting

Not surprisingly, optimalTopoSort is slower, maybe 500x-1000x slower, than
the single-shot toposort it replaces. Max time in test_repro.py tests is
still <1ms though.
---
 csrc/fusion_segmenter.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index 9a79fbe90d1..14131b4fabd 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -5308,9 +5308,16 @@ RuntimeWorkSpace prepareRuntimeOrder(const SegmentedFusion& segmented_fusion) {
     }
   }
 
+  using Clock = std::chrono::high_resolution_clock;
+  auto start = Clock::now();
   runtime_workspace.group_run_order =
       optimalTopoSort(segmented_fusion.groups());
   // runtime_workspace.group_run_order = toposort(segmented_fusion.groups());
+  auto stop = Clock::now();
+  std::cout << "Sorting segments took "
+            << std::chrono::duration_cast<std::chrono::microseconds>(
+                   stop - start)
+            << " us" << std::endl;
 
   return runtime_workspace;
 }

From 082d379db78c796ac9198c4d79231a035eb7da09 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Thu, 14 Aug 2025 09:14:37 -0400
Subject: [PATCH 15/71] Add FUSER_PERF_SCOPE ranges, remove debug prints

---
 csrc/fusion_segmenter.cpp | 21 +++------------------
 1 file changed, 3 insertions(+), 18 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index d1a0c882463..e0f7feaca59 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -2014,8 +2014,6 @@ class SegmentedGroupTaskGraphConverter {
     }
 
     std::vector<TaskGraph::DataId> inputs;
-    std::cout << "Task " << task_id
-              << " is segmented groupId=" << group->groupId() << std::endl;
     // These are fusion inputs, so they are not edges between segments
     for (Val* v : group->inputs()) {
       if (auto* tv = dynamic_cast<TensorView*>(v)) {
@@ -2025,8 +2023,6 @@ class SegmentedGroupTaskGraphConverter {
         data.uses.push_back(task_id);
         data.can_free = !tv->isFusionInput();
         inputs.push_back(data_id);
-        std::cout << "  Group input " << data_id << " = " << tv->toString()
-                  << std::endl;
       }
     }
     std::vector<TaskGraph::DataId> outputs;
@@ -2047,8 +2043,6 @@ class SegmentedGroupTaskGraphConverter {
         }
         data.can_free = !tv->isFusionOutput();
         outputs.push_back(data_id);
-        std::cout << "  Group output " << data_id << " = " << tv->toString()
-                  << std::endl;
       }
     }
 
@@ -2089,6 +2083,7 @@ class SegmentedGroupTaskGraphConverter {
 
 std::vector<SegmentedGroup*> optimalTopoSort(
     const std::vector<SegmentedGroup*>& groups) {
+  FUSER_PERF_SCOPE("optimalTopoSort");
   if (groups.size() == 1) {
     // Skip setting up the graph and doing the whole analysis when there's just
     // a single group
@@ -2097,12 +2092,8 @@ std::vector<SegmentedGroup*> optimalTopoSort(
 
   TaskGraph graph = SegmentedGroupTaskGraphConverter::convert(groups);
 
-  std::cout << graph << std::endl;
-
   TaskGraph::SortResult result = graph.findOptimalOrder();
 
-  std::cout << result << std::endl;
-
   std::vector<SegmentedGroup*> order;
   order.reserve(groups.size());
   for (const TaskGraph::Step& step : result.steps) {
@@ -2113,6 +2104,7 @@ std::vector<SegmentedGroup*> optimalTopoSort(
 
 std::vector<SegmentedGroup*> toposort(
     const std::vector<SegmentedGroup*>& groups) {
+  FUSER_PERF_SCOPE("toposort");
   std::deque<SegmentedGroup*> to_visit;
   std::unordered_map<SegmentedGroup*, int64_t> num_producer_edges;
   for (SegmentedGroup* group : groups) {
@@ -5493,6 +5485,7 @@ void SegmentedFusion::annotateFP16IntermediateTensors() {
 }
 
 RuntimeWorkSpace prepareRuntimeOrder(const SegmentedFusion& segmented_fusion) {
+  FUSER_PERF_SCOPE("prepareRuntimeOrder");
   RuntimeWorkSpace runtime_workspace;
 
   // setup the order tensor dimensions are bound
@@ -5507,16 +5500,8 @@ RuntimeWorkSpace prepareRuntimeOrder(const SegmentedFusion& segmented_fusion) {
     }
   }
 
-  using Clock = std::chrono::high_resolution_clock;
-  auto start = Clock::now();
   runtime_workspace.group_run_order =
       optimalTopoSort(segmented_fusion.groups());
-  // runtime_workspace.group_run_order = toposort(segmented_fusion.groups());
-  auto stop = Clock::now();
-  std::cout << "Sorting segments took "
-            << std::chrono::duration_cast<std::chrono::microseconds>(
-                   stop - start)
-            << " us" << std::endl;
 
   return runtime_workspace;
 }

From 5de46837b3e4f67d913597a664ca49fc34fc4122 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Thu, 14 Aug 2025 09:29:42 -0400
Subject: [PATCH 16/71] Add comments. Check aliasing condition

---
 csrc/graph/task_graph.cpp |  9 ++++++++-
 csrc/graph/task_graph.h   | 27 +++++++++++++++++++++++----
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index 5a1ebe5ea7e..eee8067751b 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -71,7 +71,14 @@ void TaskGraph::validateSteps(const std::vector<Step>& steps) const {
     // Allocate outputs
     for (const DataId output_id : task.outputs) {
       const Data& data = getData(output_id);
-      if (!data.input_alias.has_value()) {
+      if (data.input_alias.has_value()) {
+        // Check that the aliased input has no further uses
+        // Note that we will decrement this use count later in this function
+        NVF_ERROR(
+            num_uses_.at((size_t)data.input_alias.value()) == 1,
+            "Tried to execute segment that would overwrite input alias before "
+            "some of its uses");
+      } else {
         // Don't allocate outputs if they are reusing input memory
         allocated += data.size;
       }
diff --git a/csrc/graph/task_graph.h b/csrc/graph/task_graph.h
index fd189eeb5e4..208a631952f 100644
--- a/csrc/graph/task_graph.h
+++ b/csrc/graph/task_graph.h
@@ -15,6 +15,24 @@
 
 namespace nvfuser {
 
+//! A task graph is a stripped-down representation of a data flow graph. It was
+//! originally intended to model runtime order optimization during segmentation,
+//! but might have applications in other contexts.
+//!
+//! TensorViews are represented as Data and each contains a size and might be
+//! aliased to another Data, modeling input/output aliasing in a Fusion. A
+//! segment from a segmented fusion is represented here as a Task. Every task
+//! has inputs and outputs and also might require some temporary space to do its
+//! computation. For example when doing grid reductions we require a gmem buffer
+//! that is freed after the segment is computed.
+//!
+//! We model execution using the Step struct. A vector of Steps is simply a
+//! runtime ordering of Tasks, but with some extra state that helps us track
+//! memory allocation across the execution. Specifically, our model usually only
+//! allocates Data upon its first use and immediately deallocates in after its
+//! last use. The only exception is if the Data is marked can_free=false, which
+//! would be the case for unsegmented Fusion inputs or outputs whose lifetimes
+//! must extend past the execution of the entire graph.
 class TaskGraph {
  public:
   using TaskId = int16_t;
@@ -33,13 +51,14 @@ class TaskGraph {
     std::string toString() const;
   };
 
+  //! A Data object represents a TensorView with a given size.
   struct Data {
     std::optional<TaskId> definition;
     std::vector<TaskId> uses;
-    // If set, this means we do not allocate a new output when executing this
-    // Data's definition, instead we re-use the space from the specified input.
-    // Note that this implies an ordering constraint which we will check, since
-    // the definition must be the last use of the aliased input.
+    //! If set, this means we do not allocate a new output when executing this
+    //! Data's definition, instead we re-use the space from the specified input.
+    //! Note that this implies an ordering constraint which we will check, since
+    //! the definition must be the last use of the aliased input.
     std::optional<DataId> input_alias;
     Size size;
 

From 399840603c56080f97cb9140a784323b60c6dd6e Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 15 Aug 2025 08:35:53 -0400
Subject: [PATCH 17/71] Remove debug prints in test

---
 tests/python/test_repro.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/python/test_repro.py b/tests/python/test_repro.py
index a64f41cb76e..8e45d8cd2e2 100644
--- a/tests/python/test_repro.py
+++ b/tests/python/test_repro.py
@@ -2280,8 +2280,6 @@ def test_execution_order(self):
         with RecordTorchMemory() as nvf_mem:
             fd.execute([*t0s, a])
 
-        print("NVF: ", nvf_mem)
-
         def eager_func(t0s, a):
             for t0 in t0s:
                 t1 = torch.nn.functional.relu(t0)
@@ -2297,6 +2295,4 @@ def eager_func(t0s, a):
         with RecordTorchMemory() as eager_mem:
             eager_func(t0s, a)
 
-        print("Eager: ", eager_mem)
-
         assert nvf_mem == eager_mem

From be8332ade19fc0e304cf539245daeec696ab7ddd Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 15 Aug 2025 09:47:52 -0400
Subject: [PATCH 18/71] Respect aliased input constraint

---
 csrc/fusion_segmenter.cpp |  5 +--
 csrc/graph/task_graph.cpp | 77 +++++++++++++++++++++++++++++++++------
 csrc/graph/task_graph.h   |  6 +--
 3 files changed, 71 insertions(+), 17 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index e0f7feaca59..1d50b32603e 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -2038,8 +2038,7 @@ class SegmentedGroupTaskGraphConverter {
         data.definition = task_id;
         if (auto* aliased_input_tv = dynamic_cast<TensorView*>(
                 tv->fusion()->getOutputAlias(tv).aliased_io)) {
-          TaskGraph::DataId alias_id = maybeRegisterTv(aliased_input_tv);
-          data.input_alias = alias_id;
+          data.aliases_input = maybeRegisterTv(aliased_input_tv);
         }
         data.can_free = !tv->isFusionOutput();
         outputs.push_back(data_id);
@@ -2066,7 +2065,7 @@ class SegmentedGroupTaskGraphConverter {
       all_data_.emplace_back(
           /*definition=*/std::nullopt,
           /*uses=*/std::vector<TaskGraph::TaskId>{},
-          /*input_alias=*/std::nullopt,
+          /*aliases_input=*/std::nullopt,
           size,
           /*can_free=*/true);
       return new_id;
diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index eee8067751b..2a224bdd9a8 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -9,6 +9,7 @@
 #include <graph/task_graph.h>
 #include <utils.h>
 
+#include <algorithm>
 #include <limits>
 #include <set>
 #include <sstream>
@@ -71,11 +72,11 @@ void TaskGraph::validateSteps(const std::vector<Step>& steps) const {
     // Allocate outputs
     for (const DataId output_id : task.outputs) {
       const Data& data = getData(output_id);
-      if (data.input_alias.has_value()) {
+      if (data.aliases_input.has_value()) {
         // Check that the aliased input has no further uses
         // Note that we will decrement this use count later in this function
         NVF_ERROR(
-            num_uses_.at((size_t)data.input_alias.value()) == 1,
+            num_uses_.at((size_t)data.aliases_input.value()) == 1,
             "Tried to execute segment that would overwrite input alias before "
             "some of its uses");
       } else {
@@ -129,7 +130,14 @@ namespace {
 class TaskSorter {
  public:
   TaskSorter(const TaskGraph& graph, bool validate, int64_t max_iters)
-      : graph_(graph), validate_(validate), max_iters_(max_iters) {
+      : graph_(graph),
+        validate_(validate),
+        max_iters_(max_iters),
+        has_aliasing_(std::ranges::any_of(
+            arange(graph.numData()),
+            [&graph](TaskGraph::DataId data_id) {
+              return graph.getData(data_id).aliases_input.has_value();
+            })) {
     sort();
   }
 
@@ -167,13 +175,14 @@ class TaskSorter {
     for (const TaskGraph::DataId output_id : task.outputs) {
       const TaskGraph::Data& output = graph_.getData(output_id);
       // Allocate outputs if not aliased
-      if (!output.input_alias.has_value()) {
+      if (!output.aliases_input.has_value()) {
         allocated += output.size;
       }
 
       // Update outstanding_dependencies_ and ready_tasks_ for each use
       for (const TaskGraph::TaskId use_id : output.uses) {
-        if (--outstanding_dependencies_.at((size_t)use_id) == 0) {
+        --outstanding_dependencies_.at((size_t)use_id);
+        if (taskIsReady(use_id)) {
           ready_tasks_.insert(use_id);
         }
       }
@@ -229,7 +238,44 @@ class TaskSorter {
     return last_task_id;
   }
 
+  //! A task is ready if it has no outstanding_dependencies _and_ it is the last
+  //! use for all of its aliased inputs.
+  bool taskIsReady(TaskGraph::TaskId task_id) const {
+    if (outstanding_dependencies_.at((size_t)task_id) != 0) {
+      return false;
+    }
+    if (!has_aliasing_ || !task_has_aliased_input_.at((size_t)task_id)) {
+      return true;
+    }
+    // The rest of this function is the aliasing dependency check
+    for (const TaskGraph::DataId output_id : arange(graph_.numData())) {
+      const TaskGraph::Data& output_data = graph_.getData(output_id);
+      if (output_data.aliases_input.has_value()) {
+        TaskGraph::DataId input_id = output_data.aliases_input.value();
+        // Check for future uses (beyond the current one)
+        if (future_uses_.at((size_t)input_id) > 1) {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
   void sort() {
+    if (has_aliasing_) {
+      task_has_aliased_input_.reserve(graph_.numTasks());
+      for (const TaskGraph::DataId data_id : arange(graph_.numData())) {
+        const TaskGraph::Data& data = graph_.getData(data_id);
+        if (data.aliases_input.has_value()) {
+          NVF_ERROR(
+              data.definition.has_value(),
+              "Data that aliases input must have a definition");
+          task_has_aliased_input_.at(data.definition.value()) = true;
+          continue;
+        }
+      }
+    }
+
     // Set up outstanding_dependencies_, future_uses_, and ready_tasks_
     outstanding_dependencies_.reserve(graph_.numTasks());
     for (const TaskGraph::TaskId task_id : arange(graph_.numTasks())) {
@@ -243,7 +289,7 @@ class TaskSorter {
         }
       }
       outstanding_dependencies_.push_back(inputs_to_compute);
-      if (inputs_to_compute == 0) {
+      if (taskIsReady(task_id)) {
         ready_tasks_.insert(task_id);
       }
     }
@@ -316,8 +362,17 @@ class TaskSorter {
 
  private:
   const TaskGraph& graph_;
-  bool validate_;
-  int64_t max_iters_;
+  const bool validate_;
+  const int64_t max_iters_;
+
+  //! This allows us to skip aliasing checks in the common case where no inputs
+  //! are aliased by outputs
+  const bool has_aliasing_ = false;
+  //! This tells us which tasks overwrite one of their inputs. For these, we
+  //! will need to check that the aliased input has no future uses before
+  //! advancing to it.
+  std::vector<bool> task_has_aliased_input_;
+
   TaskGraph::SortResult result_;
   std::vector<TaskGraph::Step> steps_;
 
@@ -353,9 +408,9 @@ std::string TaskGraph::Data::toString() const {
      << (definition.has_value() ? std::to_string(definition.value()) : "none");
   ss << ", uses={" << uses << "}";
   ss << ", size=" << size;
-  ss << ", alias="
-     << (input_alias.has_value() ? std::to_string(input_alias.value())
-                                 : "none");
+  ss << ", aliases_input="
+     << (aliases_input.has_value() ? std::to_string(aliases_input.value())
+                                   : "none");
   ss << ", can_free=" << (can_free ? "yes" : "no");
   ss << "}";
   return ss.str();
diff --git a/csrc/graph/task_graph.h b/csrc/graph/task_graph.h
index 208a631952f..195851dcc30 100644
--- a/csrc/graph/task_graph.h
+++ b/csrc/graph/task_graph.h
@@ -59,7 +59,7 @@ class TaskGraph {
     //! Data's definition, instead we re-use the space from the specified input.
     //! Note that this implies an ordering constraint which we will check, since
     //! the definition must be the last use of the aliased input.
-    std::optional<DataId> input_alias;
+    std::optional<DataId> aliases_input;
     Size size;
 
     //! This indicates whether we are able to free this data after its last use.
@@ -135,8 +135,8 @@ class TaskGraph {
   std::string toString() const;
 
  private:
-  std::vector<Task> tasks_;
-  std::vector<Data> data_;
+  const std::vector<Task> tasks_;
+  const std::vector<Data> data_;
 
   //! How much data is allocated by data that has no definition, i.e. input data
   Size initial_allocation_ = 0;

From 91e942fa8ccdcf0d8cc81464b89fec21071f2b4f Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 15 Aug 2025 09:50:22 -0400
Subject: [PATCH 19/71] Fix lintrunner

---
 python/nvfuser/pytorch_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/nvfuser/pytorch_utils.py b/python/nvfuser/pytorch_utils.py
index bc4bcf5e782..c5bebf5ffb8 100644
--- a/python/nvfuser/pytorch_utils.py
+++ b/python/nvfuser/pytorch_utils.py
@@ -202,7 +202,7 @@ def __post_init__(self):
 
 
 @dataclass
-class RecordTorchMemory():
+class RecordTorchMemory:
     before: TorchMemorySnapshot | None = None
     after: TorchMemorySnapshot | None = None
 

From 253a7502937c1827d4c15f0bbbcf2d3d3110cf2d Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 15 Aug 2025 10:41:21 -0400
Subject: [PATCH 20/71] Use runtime_info to get correct sizes in
 FusionKernelRuntime

---
 csrc/fusion_segmenter.cpp              | 59 ++++++++++++++++++++++----
 csrc/fusion_segmenter.h                |  4 +-
 csrc/runtime/fusion_kernel_runtime.cpp |  2 +-
 3 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index 1d50b32603e..187a36176be 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -28,6 +28,7 @@
 #include <options.h>
 #include <scheduler/debug_utils.h>
 #include <scheduler/normalization_utils.h>
+#include <scheduler/runtime_info.h>
 #include <transform_iter.h>
 #include <transform_replay.h>
 
@@ -1988,8 +1989,10 @@ namespace {
 
 class SegmentedGroupTaskGraphConverter {
  public:
-  static TaskGraph convert(const std::vector<SegmentedGroup*>& groups) {
-    SegmentedGroupTaskGraphConverter conv;
+  static TaskGraph convert(
+      const std::vector<SegmentedGroup*>& groups,
+      SchedulerRuntimeInfo* runtime_info) {
+    SegmentedGroupTaskGraphConverter conv(runtime_info);
     for (SegmentedGroup* group : groups) {
       conv.processGroup(group);
     }
@@ -1997,6 +2000,9 @@ class SegmentedGroupTaskGraphConverter {
   }
 
  private:
+  SegmentedGroupTaskGraphConverter(SchedulerRuntimeInfo* runtime_info)
+      : runtime_info_(runtime_info) {}
+
   void processGroup(SegmentedGroup* group) {
     TaskGraph::TaskId task_id = (TaskGraph::TaskId)all_tasks_.size();
 
@@ -2058,9 +2064,39 @@ class SegmentedGroupTaskGraphConverter {
       TaskGraph::DataId new_id = (TaskGraph::DataId)all_data_.size();
       tv2dataid_[tv] = new_id;
 
-      // TODO: Pass runtime info so we can use actual sizes here, or at least
-      // use a better estimate
-      TaskGraph::Size size = 1;
+      // Assume all tensors the same shape if no runtime_info is given
+      int64_t numel = 1;
+      if (runtime_info_ != nullptr) {
+        // Get the actual size of the tensor allocation
+        if (tv->isFusionInput()) {
+          const std::vector<int64_t>& sizes =
+              runtime_info_->getInputAllocationSizes(tv);
+          const std::vector<int64_t>& strides =
+              runtime_info_->getInputAllocationStrides(tv);
+
+          numel = 1;
+          for (auto [size, stride] : zip(sizes, strides)) {
+            if (size == 0) {
+              // Check for empty tensors
+              numel = 0;
+              break;
+            }
+            numel += (size - 1) * stride;
+          }
+        } else {
+          // Use ExpressionEvaluator for computed tensors assuming they are
+          // contiguous
+          for (IterDomain* id : tv->getMaybeAllocationDomain()) {
+            if (id->isBroadcast() || id->isReduction()) {
+              continue;
+            }
+            numel *= runtime_info_->expressionEvaluator()
+                         .evaluate(id->extent())
+                         .as<int64_t>();
+          }
+        }
+      }
+      TaskGraph::Size size = numel * dataTypeSizeByte(tv->dtype());
 
       all_data_.emplace_back(
           /*definition=*/std::nullopt,
@@ -2075,13 +2111,15 @@ class SegmentedGroupTaskGraphConverter {
   }
 
  private:
+  SchedulerRuntimeInfo* runtime_info_;
   std::vector<TaskGraph::Data> all_data_;
   std::unordered_map<TensorView*, TaskGraph::DataId> tv2dataid_;
   std::vector<TaskGraph::Task> all_tasks_;
 };
 
 std::vector<SegmentedGroup*> optimalTopoSort(
-    const std::vector<SegmentedGroup*>& groups) {
+    const std::vector<SegmentedGroup*>& groups,
+    SchedulerRuntimeInfo* runtime_info) {
   FUSER_PERF_SCOPE("optimalTopoSort");
   if (groups.size() == 1) {
     // Skip setting up the graph and doing the whole analysis when there's just
@@ -2089,7 +2127,8 @@ std::vector<SegmentedGroup*> optimalTopoSort(
     return {groups.front()};
   }
 
-  TaskGraph graph = SegmentedGroupTaskGraphConverter::convert(groups);
+  TaskGraph graph =
+      SegmentedGroupTaskGraphConverter::convert(groups, runtime_info);
 
   TaskGraph::SortResult result = graph.findOptimalOrder();
 
@@ -5483,7 +5522,9 @@ void SegmentedFusion::annotateFP16IntermediateTensors() {
   }
 }
 
-RuntimeWorkSpace prepareRuntimeOrder(const SegmentedFusion& segmented_fusion) {
+RuntimeWorkSpace prepareRuntimeOrder(
+    const SegmentedFusion& segmented_fusion,
+    SchedulerRuntimeInfo* runtime_info) {
   FUSER_PERF_SCOPE("prepareRuntimeOrder");
   RuntimeWorkSpace runtime_workspace;
 
@@ -5500,7 +5541,7 @@ RuntimeWorkSpace prepareRuntimeOrder(const SegmentedFusion& segmented_fusion) {
   }
 
   runtime_workspace.group_run_order =
-      optimalTopoSort(segmented_fusion.groups());
+      optimalTopoSort(segmented_fusion.groups(), runtime_info);
 
   return runtime_workspace;
 }
diff --git a/csrc/fusion_segmenter.h b/csrc/fusion_segmenter.h
index 360f6180344..1a271e37712 100644
--- a/csrc/fusion_segmenter.h
+++ b/csrc/fusion_segmenter.h
@@ -477,7 +477,9 @@ struct RuntimeWorkSpace {
 
 // Perform a topological sort of different groups composiong the Segmented
 // Fusion
-RuntimeWorkSpace prepareRuntimeOrder(const SegmentedFusion& segmented_fusion);
+RuntimeWorkSpace prepareRuntimeOrder(
+    const SegmentedFusion& segmented_fusion,
+    SchedulerRuntimeInfo* runtime_info = nullptr);
 
 //! This is a base class for segmenter analysis
 //!  provides the minimal implementation on header so that
diff --git a/csrc/runtime/fusion_kernel_runtime.cpp b/csrc/runtime/fusion_kernel_runtime.cpp
index 39e138d85c7..9f5f120a531 100644
--- a/csrc/runtime/fusion_kernel_runtime.cpp
+++ b/csrc/runtime/fusion_kernel_runtime.cpp
@@ -128,7 +128,7 @@ FusionKernelRuntime::FusionKernelRuntime(
 
   // Pre-compute the executor order so that the run time path
   //  would go directly to kernel launch.
-  runtime_workspace_ = prepareRuntimeOrder(*segmented_fusion_);
+  runtime_workspace_ = prepareRuntimeOrder(*segmented_fusion_, &runtime_info);
 
   executors_.resize(segmented_fusion_->groups().size());
 

From cb77db56dd079eb2f8d67b4f65a7647b32aa1718 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 15 Aug 2025 20:12:22 -0400
Subject: [PATCH 21/71] Add test file

---
 CMakeLists.txt                |  1 +
 tests/cpp/test_task_graph.cpp | 75 +++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100644 tests/cpp/test_task_graph.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 67a69f8d3ab..f33349b1e4c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -879,6 +879,7 @@ list(APPEND JIT_TEST_SRCS
   ${NVFUSER_ROOT}/tests/cpp/test_smem_reuse.cpp
   ${NVFUSER_ROOT}/tests/cpp/test_statement_guard.cpp
   ${NVFUSER_ROOT}/tests/cpp/test_swizzle.cpp
+  ${NVFUSER_ROOT}/tests/cpp/test_task_graph.cpp
   ${NVFUSER_ROOT}/tests/cpp/test_tensor_factories.cpp
   ${NVFUSER_ROOT}/tests/cpp/test_tmem.cpp
   ${NVFUSER_ROOT}/tests/cpp/test_transpose.cpp
diff --git a/tests/cpp/test_task_graph.cpp b/tests/cpp/test_task_graph.cpp
new file mode 100644
index 00000000000..724e2341e15
--- /dev/null
+++ b/tests/cpp/test_task_graph.cpp
@@ -0,0 +1,75 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION &
+ * AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause
+ */
+// clang-format on
+#include <macros.h>
+
+#include <csrc/exceptions.h>
+#include <gtest/gtest.h>
+
+#include <graph/task_graph.h>
+#include <tests/cpp/utils.h>
+#include <tests/cpp/validator.h>
+
+namespace nvfuser {
+
+using Tasks = std::vector<TaskGraph::Task>;
+using TaskGraphTest = NVFuserTest;
+
+struct SimpleAlias {
+  TaskGraph::DataId output;
+  TaskGraph::DataId input;
+};
+
+std::vector<TaskGraph::Data> inferData(const Tasks& tasks) {
+  // Find number of data items so we can resize
+  TaskGraph::DataId max_data_id = 0;
+  for (const TaskGraph::Task& task : tasks) {
+    for (TaskGraph::DataId input_id : task.inputs) {
+      max_data_id = std::max(max_data_id, input_id);
+    }
+    for (TaskGraph::DataId output_id : task.outputs) {
+      max_data_id = std::max(max_data_id, output_id);
+    }
+  }
+  std::vector<TaskGraph::Data> all_data((size_t)max_data_id + 1);
+
+  for (const TaskGraph::Task& task : tasks) {
+    auto task_id = (TaskGraph::TaskId)tasks.size();
+    for (TaskGraph::DataId input_id : task.inputs) {
+      all_data.at(input_id).uses.push_back(task_id);
+    }
+    for (TaskGraph::DataId output_id : task.outputs) {
+      all_data.at(output_id).definition = task_id;
+    }
+  }
+
+  // Detect inputs and outputs and ensure they are not freed
+  for (TaskGraph::Data& data : all_data) {
+    data.can_free = data.definition.has_value() && !data.uses.empty();
+  }
+
+  return all_data;
+}
+
+std::vector<TaskGraph::TaskId> getTasks(const TaskGraph::SortResult& result) {
+  const std::vector<TaskGraph::Step>& steps = result.steps;
+  std::vector<TaskGraph::TaskId> tasks;
+  tasks.reserve(steps.size());
+  for (const TaskGraph::Step& step : steps) {
+    tasks.push_back(step.task);
+  }
+  return tasks;
+}
+
+TEST_F(TaskGraphTest, Basic) {
+  Tasks tasks{{{0, 1}, {2}}, {{0, 2}, {3}}};
+  auto data = inferData(tasks);
+  auto graph = TaskGraph(tasks, data);
+
+  std::vector<TaskGraph::TaskId> expected{0, 1};
+  EXPECT_EQ(getTasks(graph.findOptimalOrder()), expected);
+}
+
+} // namespace nvfuser

From ce7335d6dbe9dfaa22ec9ea434571b307543285e Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 20 Aug 2025 12:20:02 -0400
Subject: [PATCH 22/71] Fix bug in initializing task_has_aliased_input_

---
 csrc/graph/task_graph.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index 2a224bdd9a8..51eedd0a0ae 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -263,7 +263,7 @@ class TaskSorter {
 
   void sort() {
     if (has_aliasing_) {
-      task_has_aliased_input_.reserve(graph_.numTasks());
+      task_has_aliased_input_.resize(graph_.numTasks(), false);
       for (const TaskGraph::DataId data_id : arange(graph_.numData())) {
         const TaskGraph::Data& data = graph_.getData(data_id);
         if (data.aliases_input.has_value()) {

From f515cd46d2d95598255b91d42276553c5f16073f Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 20 Aug 2025 12:20:35 -0400
Subject: [PATCH 23/71] Pipe runtime_info to lowerSegmentedFusionToHostIr

---
 csrc/host_ir/lowering.cpp              | 5 +++--
 csrc/host_ir/lowering.h                | 3 ++-
 csrc/runtime/fusion_kernel_runtime.cpp | 7 ++++++-
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/csrc/host_ir/lowering.cpp b/csrc/host_ir/lowering.cpp
index 1cab2cc06fc..d3b9ac1dad9 100644
--- a/csrc/host_ir/lowering.cpp
+++ b/csrc/host_ir/lowering.cpp
@@ -40,7 +40,8 @@ void recomputeOutputTvs(Expr* e, IrCloner& ir_cloner) {
 std::unique_ptr<hir::HostIrContainer> lowerSegmentedFusionToHostIr(
     const SegmentedFusion& segmented_fusion,
     const std::vector<LaunchParams>& launch_params_per_segment,
-    std::vector<std::unique_ptr<ExecutorAbstract>>& executors) {
+    std::vector<std::unique_ptr<ExecutorAbstract>>& executors,
+    SchedulerRuntimeInfo& runtime_info) {
   auto hic = std::make_unique<hir::HostIrContainer>();
 
   IrCloner ir_cloner(hic.get());
@@ -55,7 +56,7 @@ std::unique_ptr<hir::HostIrContainer> lowerSegmentedFusionToHostIr(
   }
 
   for (SegmentedGroup* group :
-       prepareRuntimeOrder(segmented_fusion).group_run_order) {
+       prepareRuntimeOrder(segmented_fusion, &runtime_info).group_run_order) {
     switch (group->schedulerType()) {
       case SchedulerType::Communication: {
         auto deviceid = Communicator::getInstance().deviceId();
diff --git a/csrc/host_ir/lowering.h b/csrc/host_ir/lowering.h
index 91df58f20ab..5c68833f7e7 100644
--- a/csrc/host_ir/lowering.h
+++ b/csrc/host_ir/lowering.h
@@ -21,6 +21,7 @@ std::unique_ptr<hir::HostIrContainer> lowerSegmentedFusionToHostIr(
     // TODO(#4927): Launch parameters should be passed in at runtime, not
     // compile time.  They can change according to input sizes.
     const std::vector<LaunchParams>& launch_params_per_segment,
-    std::vector<std::unique_ptr<ExecutorAbstract>>& executors);
+    std::vector<std::unique_ptr<ExecutorAbstract>>& executors,
+    SchedulerRuntimeInfo& runtime_info);
 
 } // namespace nvfuser
diff --git a/csrc/runtime/fusion_kernel_runtime.cpp b/csrc/runtime/fusion_kernel_runtime.cpp
index 454e17399a1..cd6ac3f1489 100644
--- a/csrc/runtime/fusion_kernel_runtime.cpp
+++ b/csrc/runtime/fusion_kernel_runtime.cpp
@@ -472,8 +472,13 @@ void FusionKernelRuntime::compileFusionParallel(KernelArgumentHolder args) {
     for (const auto& heuristic_params : schedulers()) {
       launch_params_per_segment.push_back(heuristic_params->lparams);
     }
+    SchedulerRuntimeInfo runtime_info(
+        segmented_fusion_->completeFusion(), args);
     std::unique_ptr<hir::HostIrContainer> hic = lowerSegmentedFusionToHostIr(
-        *segmented_fusion_, launch_params_per_segment, executors_);
+        *segmented_fusion_,
+        launch_params_per_segment,
+        executors_,
+        runtime_info);
 #ifdef NVFUSER_HOST_IR_JIT
     hij_ = std::make_unique<HostIrJit>(std::move(hic));
 #else

From 589220643cb305dc013ceab68a69c816ad16bfbc Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 20 Aug 2025 12:22:50 -0400
Subject: [PATCH 24/71] Place TODO in csrc/host_ir/lower.cpp

---
 csrc/host_ir/lower.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/csrc/host_ir/lower.cpp b/csrc/host_ir/lower.cpp
index c8859f3c4fe..f36412eec5f 100644
--- a/csrc/host_ir/lower.cpp
+++ b/csrc/host_ir/lower.cpp
@@ -97,6 +97,8 @@ std::unique_ptr<hir::HostIrContainer> HostIrLower::lower(
           std::move(fusion), KernelArgumentHolder(), options, true);
   // Infer a topologically ordered traversal of the segmented fusion to
   // determine the order for launching the kernels/comms
+  // TODO: pass runtime info to prepareRuntimeOrder to optimize runtime order
+  // for memory use
   RuntimeWorkSpace workspace = prepareRuntimeOrder(*staged_fusion);
   // Create the HostIrContainer representing the host program. Each segment of
   // the segmented fusion will be translated to a HostIR

From 4e9ba4daaf68fe0ba1add0383aca610c2dbf9a68 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 20 Aug 2025 13:28:44 -0400
Subject: [PATCH 25/71] Fix bugs in initialization of graph. validate inputs

---
 csrc/graph/task_graph.cpp | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index 51eedd0a0ae..567f5050bc9 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -39,6 +39,13 @@ TaskGraph::TaskGraph(
         task.inputs.begin(), task.inputs.end(), [&](DataId data_id) {
           return getData(data_id).definition.has_value();
         }));
+    // Validate input
+    for (DataId input_id : task.inputs) {
+      NVF_ERROR(input_id >= 0 && (size_t)input_id < data_.size());
+    }
+    for (DataId output_id : task.outputs) {
+      NVF_ERROR(output_id >= 0 && (size_t)output_id < data_.size());
+    }
   }
   num_uses_.reserve(data_.size());
   for (const Data& data : data_) {
@@ -46,6 +53,18 @@ TaskGraph::TaskGraph(
     if (!data.definition.has_value()) {
       initial_allocation_ += (Size)data.size;
     }
+    // Validate that every id stored on this Data is in range for its kind
+    if (data.definition.has_value()) {
+      TaskId d = data.definition.value();
+      NVF_ERROR(d >= 0 && (size_t)d < tasks_.size());
+    }
+    if (data.aliases_input.has_value()) {
+      DataId a = data.aliases_input.value();
+      NVF_ERROR(a >= 0 && (size_t)a < data_.size());
+    }
+    for (TaskId use : data.uses) {
+      NVF_ERROR(use >= 0 && (size_t)use < tasks_.size());
+    }
   }
 }
 
@@ -277,7 +296,13 @@ class TaskSorter {
     }
 
     // Set up outstanding_dependencies_, future_uses_, and ready_tasks_
-    outstanding_dependencies_.reserve(graph_.numTasks());
+    future_uses_.resize(graph_.numData(), 0);
+    for (const TaskGraph::DataId data_id : arange(graph_.numData())) {
+      const TaskGraph::Data& data = graph_.getData(data_id);
+      future_uses_.at((size_t)data_id) = data.uses.size();
+    }
+
+    outstanding_dependencies_.resize(graph_.numTasks(), 0);
     for (const TaskGraph::TaskId task_id : arange(graph_.numTasks())) {
       const TaskGraph::Task& task = graph_.getTask(task_id);
       TaskGraph::DataId inputs_to_compute = 0;
@@ -288,18 +313,12 @@ class TaskSorter {
           inputs_to_compute++;
         }
       }
-      outstanding_dependencies_.push_back(inputs_to_compute);
+      outstanding_dependencies_.at((size_t)task_id) = inputs_to_compute;
       if (taskIsReady(task_id)) {
         ready_tasks_.insert(task_id);
       }
     }
 
-    future_uses_.reserve(graph_.numData());
-    for (const TaskGraph::DataId data_id : arange(graph_.numData())) {
-      const TaskGraph::Data& data = graph_.getData(data_id);
-      future_uses_.push_back(data.uses.size());
-    }
-
     // Initialize best_usage
     TaskGraph::Size best_usage = std::numeric_limits<TaskGraph::Size>::max();
     std::vector<TaskGraph::Step> best_steps;

From cc753c90018b4390570d69673f9e323c4cf20e46 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 20 Aug 2025 13:28:56 -0400
Subject: [PATCH 26/71] Add ImpossibleAlias test

---
 tests/cpp/test_task_graph.cpp | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/tests/cpp/test_task_graph.cpp b/tests/cpp/test_task_graph.cpp
index 724e2341e15..c19c1c217ae 100644
--- a/tests/cpp/test_task_graph.cpp
+++ b/tests/cpp/test_task_graph.cpp
@@ -35,8 +35,7 @@ std::vector<TaskGraph::Data> inferData(const Tasks& tasks) {
   }
   std::vector<TaskGraph::Data> all_data((size_t)max_data_id + 1);
 
-  for (const TaskGraph::Task& task : tasks) {
-    auto task_id = (TaskGraph::TaskId)tasks.size();
+  for (const auto& [task_id, task] : enumerate(tasks)) {
     for (TaskGraph::DataId input_id : task.inputs) {
       all_data.at(input_id).uses.push_back(task_id);
     }
@@ -72,4 +71,21 @@ TEST_F(TaskGraphTest, Basic) {
   EXPECT_EQ(getTasks(graph.findOptimalOrder()), expected);
 }
 
+// This example includes two segments, each of which aliases the other
+TEST_F(TaskGraphTest, ImpossibleAlias) {
+  // Two tasks, each takes the same two inputs
+  Tasks tasks{{{0, 1}, {2}}, {{0, 1}, {3}}};
+  auto data = inferData(tasks);
+  // Each of the segment outputs aliases a different input
+  data[2].aliases_input = 0;
+  data[3].aliases_input = 1;
+  // This graph can't be ordered without breaking the aliasing constraint
+  auto graph = TaskGraph(tasks, data);
+
+  EXPECT_THAT(
+      [&graph]() { getTasks(graph.findOptimalOrder()); },
+      ::testing::ThrowsMessage<nvfuser::nvfError>(::testing::HasSubstr(
+          "Ran out of ready tasks before completing ordering")));
+}
+
 } // namespace nvfuser

From 760b73b6ec3c5d7799afde520ffa58a3182a930d Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 20 Aug 2025 14:07:14 -0400
Subject: [PATCH 27/71] Add cycle tests

---
 tests/cpp/test_task_graph.cpp | 37 +++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/tests/cpp/test_task_graph.cpp b/tests/cpp/test_task_graph.cpp
index c19c1c217ae..02847ef3247 100644
--- a/tests/cpp/test_task_graph.cpp
+++ b/tests/cpp/test_task_graph.cpp
@@ -88,4 +88,41 @@ TEST_F(TaskGraphTest, ImpossibleAlias) {
           "Ran out of ready tasks before completing ordering")));
 }
 
+TEST_F(TaskGraphTest, SelfEdge) {
+  Tasks tasks{{{0}, {0}}};
+  auto data = inferData(tasks);
+  // This graph can't be ordered because it contains an edge from a Data node
+  // back to itself. A task can't be both producer and consumer to a Data.
+  auto graph = TaskGraph(tasks, data);
+
+  EXPECT_THAT(
+      [&graph]() { getTasks(graph.findOptimalOrder()); },
+      ::testing::ThrowsMessage<nvfuser::nvfError>(::testing::HasSubstr(
+          "Ran out of ready tasks before completing ordering")));
+}
+
+TEST_F(TaskGraphTest, TwoCycle) {
+  Tasks tasks{{{0}, {1}}, {{1}, {0}}};
+  auto data = inferData(tasks);
+  // This graph can't be ordered because it contains a cycle
+  auto graph = TaskGraph(tasks, data);
+
+  EXPECT_THAT(
+      [&graph]() { getTasks(graph.findOptimalOrder()); },
+      ::testing::ThrowsMessage<nvfuser::nvfError>(::testing::HasSubstr(
+          "Ran out of ready tasks before completing ordering")));
+}
+
+TEST_F(TaskGraphTest, ThreeCycle) {
+  Tasks tasks{{{0}, {1}}, {{1}, {2}}, {{2}, {0}}};
+  auto data = inferData(tasks);
+  // This graph can't be ordered because it contains a cycle
+  auto graph = TaskGraph(tasks, data);
+
+  EXPECT_THAT(
+      [&graph]() { getTasks(graph.findOptimalOrder()); },
+      ::testing::ThrowsMessage<nvfuser::nvfError>(::testing::HasSubstr(
+          "Ran out of ready tasks before completing ordering")));
+}
+
 } // namespace nvfuser

From bcad2a3f0d6f70a3c8d0813ff70166bb78a62eaa Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 20 Aug 2025 15:06:56 -0400
Subject: [PATCH 28/71] Start improving tests

---
 tests/cpp/test_task_graph.cpp | 72 +++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/tests/cpp/test_task_graph.cpp b/tests/cpp/test_task_graph.cpp
index 02847ef3247..da06681c7a8 100644
--- a/tests/cpp/test_task_graph.cpp
+++ b/tests/cpp/test_task_graph.cpp
@@ -63,6 +63,11 @@ std::vector<TaskGraph::TaskId> getTasks(const TaskGraph::SortResult& result) {
 }
 
 TEST_F(TaskGraphTest, Basic) {
+  //   0   1
+  //   |\ /
+  //   | 2
+  //   |/
+  //   3
   Tasks tasks{{{0, 1}, {2}}, {{0, 2}, {3}}};
   auto data = inferData(tasks);
   auto graph = TaskGraph(tasks, data);
@@ -73,6 +78,11 @@ TEST_F(TaskGraphTest, Basic) {
 
 // This example includes two segments, each of which aliases the other
 TEST_F(TaskGraphTest, ImpossibleAlias) {
+  //   0   1
+  //   |\ /|
+  //   | X |
+  //   |/ \|
+  //   2   3
   // Two tasks, each takes the same two inputs
   Tasks tasks{{{0, 1}, {2}}, {{0, 1}, {3}}};
   auto data = inferData(tasks);
@@ -125,4 +135,66 @@ TEST_F(TaskGraphTest, ThreeCycle) {
           "Ran out of ready tasks before completing ordering")));
 }
 
+TEST_F(TaskGraphTest, FreeableIntermediate) {
+  //   0
+  //  / \
+  // 1   2
+  //     |
+  //     3
+  Tasks tasks{
+      {{0}, {1}}, // Task 0
+      {{0}, {2}}, // Task 1
+      {{2}, {3}}, // Task 2
+  };
+  auto data = inferData(tasks);
+  auto graph = TaskGraph(tasks, data);
+
+  std::cout << graph << std::endl;
+
+  TaskGraph::SortResult result = graph.findOptimalOrder();
+
+  // Expect that we evaluate the branch with intermediates before the other,
+  // since those intermediates can take the space we'll need later for output 1
+  std::vector<TaskGraph::TaskId> expected{1, 2, 0};
+  EXPECT_EQ(getTasks(result), expected);
+
+  EXPECT_EQ(result.steps.back().high_water_mark, 2);
+}
+
+TEST_F(TaskGraphTest, DifferentSizes) {
+  //   0
+  //  / \
+  // 1   4
+  // |   |
+  // 2   5
+  // |   |
+  // 3   6
+  //  \ /
+  //   7
+  Tasks tasks{
+      {{0}, {1}}, // Task 0
+      {{1}, {2}}, // Task 1
+      {{2}, {3}}, // Task 2
+      {{0}, {4}}, // Task 3
+      {{4}, {5}}, // Task 4
+      {{5}, {6}}, // Task 5
+      {{3, 6}, {7}} // Task 6
+  };
+  auto data = inferData(tasks);
+  data[1].size = 8;
+  data[2].size = 12;
+  data[3].size = 8;
+  data[4].size = 10;
+  // Note that 4 is large but 5 is smaller than the others, so we should compute
+  // up to here then start on the 0-1-2-3 branch after freeing 4
+  data[5].size = 5;
+  data[6].size = 8;
+  auto graph = TaskGraph(tasks, data);
+
+  std::cout << graph << std::endl;
+
+  std::vector<TaskGraph::TaskId> expected{0, 3, 4, 1, 2, 5, 6};
+  EXPECT_EQ(getTasks(graph.findOptimalOrder()), expected);
+}
+
 } // namespace nvfuser

From 51c8c6c39bb244ed340eb7387f0ae105ed5f2a1a Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 20 Aug 2025 15:30:43 -0400
Subject: [PATCH 29/71] More improvements to tests

---
 tests/cpp/test_task_graph.cpp | 46 ++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/tests/cpp/test_task_graph.cpp b/tests/cpp/test_task_graph.cpp
index da06681c7a8..03bea1a1fa9 100644
--- a/tests/cpp/test_task_graph.cpp
+++ b/tests/cpp/test_task_graph.cpp
@@ -46,6 +46,7 @@ std::vector<TaskGraph::Data> inferData(const Tasks& tasks) {
 
   // Detect inputs and outputs and ensure they are not freed
   for (TaskGraph::Data& data : all_data) {
+    data.size = 1;
     data.can_free = data.definition.has_value() && !data.uses.empty();
   }
 
@@ -72,8 +73,12 @@ TEST_F(TaskGraphTest, Basic) {
   auto data = inferData(tasks);
   auto graph = TaskGraph(tasks, data);
 
+  const TaskGraph::SortResult result = graph.findOptimalOrder();
+
+  ASSERT_EQ(result.steps.size(), tasks.size());
   std::vector<TaskGraph::TaskId> expected{0, 1};
-  EXPECT_EQ(getTasks(graph.findOptimalOrder()), expected);
+  EXPECT_EQ(getTasks(result), expected);
+  EXPECT_EQ(result.steps.back().high_water_mark, 4);
 }
 
 // This example includes two segments, each of which aliases the other
@@ -93,7 +98,7 @@ TEST_F(TaskGraphTest, ImpossibleAlias) {
   auto graph = TaskGraph(tasks, data);
 
   EXPECT_THAT(
-      [&graph]() { getTasks(graph.findOptimalOrder()); },
+      [&graph]() { graph.findOptimalOrder(); },
       ::testing::ThrowsMessage<nvfuser::nvfError>(::testing::HasSubstr(
           "Ran out of ready tasks before completing ordering")));
 }
@@ -106,7 +111,7 @@ TEST_F(TaskGraphTest, SelfEdge) {
   auto graph = TaskGraph(tasks, data);
 
   EXPECT_THAT(
-      [&graph]() { getTasks(graph.findOptimalOrder()); },
+      [&graph]() { graph.findOptimalOrder(); },
       ::testing::ThrowsMessage<nvfuser::nvfError>(::testing::HasSubstr(
           "Ran out of ready tasks before completing ordering")));
 }
@@ -118,7 +123,7 @@ TEST_F(TaskGraphTest, TwoCycle) {
   auto graph = TaskGraph(tasks, data);
 
   EXPECT_THAT(
-      [&graph]() { getTasks(graph.findOptimalOrder()); },
+      [&graph]() { graph.findOptimalOrder(); },
       ::testing::ThrowsMessage<nvfuser::nvfError>(::testing::HasSubstr(
           "Ran out of ready tasks before completing ordering")));
 }
@@ -130,35 +135,34 @@ TEST_F(TaskGraphTest, ThreeCycle) {
   auto graph = TaskGraph(tasks, data);
 
   EXPECT_THAT(
-      [&graph]() { getTasks(graph.findOptimalOrder()); },
+      [&graph]() { graph.findOptimalOrder(); },
       ::testing::ThrowsMessage<nvfuser::nvfError>(::testing::HasSubstr(
           "Ran out of ready tasks before completing ordering")));
 }
 
 TEST_F(TaskGraphTest, FreeableIntermediate) {
   //   0
-  //  / \
-  // 1   2
+  //  /|\
+  // 1 2 3
   //     |
-  //     3
+  //     4
   Tasks tasks{
       {{0}, {1}}, // Task 0
       {{0}, {2}}, // Task 1
-      {{2}, {3}}, // Task 2
+      {{0}, {3}}, // Task 2
+      {{3}, {4}}, // Task 3
   };
   auto data = inferData(tasks);
   auto graph = TaskGraph(tasks, data);
 
-  std::cout << graph << std::endl;
-
-  TaskGraph::SortResult result = graph.findOptimalOrder();
+  const TaskGraph::SortResult result = graph.findOptimalOrder();
 
-  // Expect that we evaluate the branch with intermediates before the other,
-  // since those intermediates can take the space we'll need later for output 1
-  std::vector<TaskGraph::TaskId> expected{1, 2, 0};
-  EXPECT_EQ(getTasks(result), expected);
-
-  EXPECT_EQ(result.steps.back().high_water_mark, 2);
+  // Expect that we evaluate the branch with intermediate before the others,
+  // since that intermediate 3 can take the space we'll need later for output 1
+  // or 2
+  ASSERT_EQ(result.steps.size(), tasks.size());
+  EXPECT_NE(getTasks(result).back(), 3);
+  EXPECT_EQ(result.steps.back().high_water_mark, 4);
 }
 
 TEST_F(TaskGraphTest, DifferentSizes) {
@@ -193,8 +197,12 @@ TEST_F(TaskGraphTest, DifferentSizes) {
 
   std::cout << graph << std::endl;
 
+  const TaskGraph::SortResult result = graph.findOptimalOrder();
+
+  ASSERT_EQ(result.steps.size(), tasks.size());
   std::vector<TaskGraph::TaskId> expected{0, 3, 4, 1, 2, 5, 6};
-  EXPECT_EQ(getTasks(graph.findOptimalOrder()), expected);
+  EXPECT_EQ(getTasks(result), expected);
+  EXPECT_EQ(result.steps.back().high_water_mark, 2);
 }
 
 } // namespace nvfuser

From 2e85c6ff42e0f0f4282a2588ef5a14b26070e77b Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 20 Aug 2025 15:38:14 -0400
Subject: [PATCH 30/71] Minor

---
 tests/cpp/test_task_graph.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/cpp/test_task_graph.cpp b/tests/cpp/test_task_graph.cpp
index 03bea1a1fa9..2412b644797 100644
--- a/tests/cpp/test_task_graph.cpp
+++ b/tests/cpp/test_task_graph.cpp
@@ -70,7 +70,7 @@ TEST_F(TaskGraphTest, Basic) {
   //   |/
   //   3
   Tasks tasks{{{0, 1}, {2}}, {{0, 2}, {3}}};
-  auto data = inferData(tasks);
+  std::vector<TaskGraph::Data> data = inferData(tasks);
   auto graph = TaskGraph(tasks, data);
 
   const TaskGraph::SortResult result = graph.findOptimalOrder();
@@ -90,7 +90,7 @@ TEST_F(TaskGraphTest, ImpossibleAlias) {
   //   2   3
   // Two tasks, each takes the same two inputs
   Tasks tasks{{{0, 1}, {2}}, {{0, 1}, {3}}};
-  auto data = inferData(tasks);
+  std::vector<TaskGraph::Data> data = inferData(tasks);
   // Each of the segment outputs aliases a different input
   data[2].aliases_input = 0;
   data[3].aliases_input = 1;
@@ -105,7 +105,7 @@ TEST_F(TaskGraphTest, ImpossibleAlias) {
 
 TEST_F(TaskGraphTest, SelfEdge) {
   Tasks tasks{{{0}, {0}}};
-  auto data = inferData(tasks);
+  std::vector<TaskGraph::Data> data = inferData(tasks);
   // This graph can't be ordered because it contains an edge from a Data node
   // back to itself. A task can't be both producer and consumer to a Data.
   auto graph = TaskGraph(tasks, data);
@@ -118,7 +118,7 @@ TEST_F(TaskGraphTest, SelfEdge) {
 
 TEST_F(TaskGraphTest, TwoCycle) {
   Tasks tasks{{{0}, {1}}, {{1}, {0}}};
-  auto data = inferData(tasks);
+  std::vector<TaskGraph::Data> data = inferData(tasks);
   // This graph can't be ordered because it contains a cycle
   auto graph = TaskGraph(tasks, data);
 
@@ -130,7 +130,7 @@ TEST_F(TaskGraphTest, TwoCycle) {
 
 TEST_F(TaskGraphTest, ThreeCycle) {
   Tasks tasks{{{0}, {1}}, {{1}, {2}}, {{2}, {0}}};
-  auto data = inferData(tasks);
+  std::vector<TaskGraph::Data> data = inferData(tasks);
   // This graph can't be ordered because it contains a cycle
   auto graph = TaskGraph(tasks, data);
 
@@ -152,7 +152,7 @@ TEST_F(TaskGraphTest, FreeableIntermediate) {
       {{0}, {3}}, // Task 2
       {{3}, {4}}, // Task 3
   };
-  auto data = inferData(tasks);
+  std::vector<TaskGraph::Data> data = inferData(tasks);
   auto graph = TaskGraph(tasks, data);
 
   const TaskGraph::SortResult result = graph.findOptimalOrder();
@@ -184,7 +184,7 @@ TEST_F(TaskGraphTest, DifferentSizes) {
       {{5}, {6}}, // Task 5
       {{3, 6}, {7}} // Task 6
   };
-  auto data = inferData(tasks);
+  std::vector<TaskGraph::Data> data = inferData(tasks);
   data[1].size = 8;
   data[2].size = 12;
   data[3].size = 8;

From ef65076091984b9a886daf1f45a1b8559044b377 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 20 Aug 2025 16:21:25 -0400
Subject: [PATCH 31/71] Fix backtracking bug

---
 csrc/graph/task_graph.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index 567f5050bc9..ae13198eecb 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -319,8 +319,7 @@ class TaskSorter {
       }
     }
 
-    // Initialize best_usage
-    TaskGraph::Size best_usage = std::numeric_limits<TaskGraph::Size>::max();
+    // Initialize best steps found so far
     std::vector<TaskGraph::Step> best_steps;
 
     // This is the main optimization loop
@@ -340,6 +339,9 @@ class TaskSorter {
         }
       }
 
+      // Reset backtracked_task_id
+      backtracked_task_id = -1;
+
       if (next_task_id == -1) {
         // There are no ready tasks with ID above the backtracked_task_id. This
         // means it is time to backtrack
@@ -358,7 +360,8 @@ class TaskSorter {
 
       // If our high water mark is above best_usage, terminate early and
       // backtrack
-      if (steps_.back().high_water_mark > best_usage) {
+      if (!best_steps.empty() &&
+          steps_.back().high_water_mark > best_steps.back().high_water_mark) {
         backtracked_task_id = backtrack();
         continue;
       }

From 0f4126c84f822aebde3002e5c6794fbe0c91a523 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 20 Aug 2025 16:38:51 -0400
Subject: [PATCH 32/71] Fix up DifferentSizes test

---
 tests/cpp/test_task_graph.cpp | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/tests/cpp/test_task_graph.cpp b/tests/cpp/test_task_graph.cpp
index 2412b644797..2588e785634 100644
--- a/tests/cpp/test_task_graph.cpp
+++ b/tests/cpp/test_task_graph.cpp
@@ -185,14 +185,20 @@ TEST_F(TaskGraphTest, DifferentSizes) {
       {{3, 6}, {7}} // Task 6
   };
   std::vector<TaskGraph::Data> data = inferData(tasks);
-  data[1].size = 8;
-  data[2].size = 12;
-  data[3].size = 8;
-  data[4].size = 10;
-  // Note that 4 is large but 5 is smaller than the others, so we should compute
-  // up to here then start on the 0-1-2-3 branch after freeing 4
-  data[5].size = 5;
-  data[6].size = 8;
+  data[0].size = 1;
+
+  data[1].size = 12;
+  data[2].size = 10;
+  data[3].size = 10;
+
+  // Note that 4 and 5 are large but that 6 is smaller than the others, so we
+  // should compute up to here then start on the 0-1-2-3 branch after freeing 4
+  // and 5. Otherwise we would hold the larger 3 while computing 4 and 5.
+  data[4].size = 11;
+  data[5].size = 11;
+  data[6].size = 7;
+
+  data[7].size = 1;
   auto graph = TaskGraph(tasks, data);
 
   std::cout << graph << std::endl;
@@ -200,9 +206,9 @@ TEST_F(TaskGraphTest, DifferentSizes) {
   const TaskGraph::SortResult result = graph.findOptimalOrder();
 
   ASSERT_EQ(result.steps.size(), tasks.size());
-  std::vector<TaskGraph::TaskId> expected{0, 3, 4, 1, 2, 5, 6};
+  std::vector<TaskGraph::TaskId> expected{3, 4, 5, 0, 1, 2, 6};
   EXPECT_EQ(getTasks(result), expected);
-  EXPECT_EQ(result.steps.back().high_water_mark, 2);
+  EXPECT_EQ(result.steps.back().high_water_mark, 30);
 }
 
 } // namespace nvfuser

From 620cf3aa16b16fe32017805feeefef50af0f229c Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 20 Aug 2025 19:10:12 -0400
Subject: [PATCH 33/71] Drafted new failing test

---
 tests/cpp/test_task_graph.cpp | 47 +++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/tests/cpp/test_task_graph.cpp b/tests/cpp/test_task_graph.cpp
index 2588e785634..67888331f34 100644
--- a/tests/cpp/test_task_graph.cpp
+++ b/tests/cpp/test_task_graph.cpp
@@ -211,4 +211,51 @@ TEST_F(TaskGraphTest, DifferentSizes) {
   EXPECT_EQ(result.steps.back().high_water_mark, 30);
 }
 
+TEST_F(TaskGraphTest, DifferentSizesRestartBranch) {
+  //   0
+  //  / \
+  // 1   6
+  // |   |
+  // 2*  7
+  // |   |
+  // 3   8*
+  // |   |
+  // 4*  9
+  // |   |
+  // 5  10
+  //  \ /
+  //  11
+  //
+  // The starred nodes are smaller than the others
+  Tasks tasks{
+      {{0}, {1}}, // Task 0
+      {{1}, {2}}, // Task 1
+      {{2}, {3}}, // Task 2
+      {{3}, {4}}, // Task 3
+      {{4}, {5}}, // Task 4
+      {{0}, {6}}, // Task 5
+      {{6}, {7}}, // Task 6
+      {{7}, {8}}, // Task 7
+      {{8}, {9}}, // Task 8
+      {{9}, {10}}, // Task 9
+      {{5, 10}, {11}} // Task 10
+  };
+  std::vector<TaskGraph::Data> data = inferData(tasks);
+  for (auto& di : data) {
+    di.size = 10;
+  }
+  data[2].size = 5;
+  data[4].size = 5;
+  data[8].size = 5;
+
+  auto graph = TaskGraph(tasks, data);
+
+  const TaskGraph::SortResult result = graph.findOptimalOrder();
+
+  ASSERT_EQ(result.steps.size(), tasks.size());
+  std::vector<TaskGraph::TaskId> expected{5, 6, 0, 1, 7, 8, 9, 2, 3, 4, 10};
+  EXPECT_EQ(getTasks(result), expected);
+  EXPECT_EQ(result.steps.back().high_water_mark, 30);
+}
+
 } // namespace nvfuser

From ed4585f05682f9cc59b3eeedfeffa454665a0f63 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 20 Aug 2025 20:01:44 -0400
Subject: [PATCH 34/71] Add test from Kayaaslan 2018

---
 tests/cpp/test_task_graph.cpp | 82 +++++++++++++++++++----------------
 1 file changed, 45 insertions(+), 37 deletions(-)

diff --git a/tests/cpp/test_task_graph.cpp b/tests/cpp/test_task_graph.cpp
index 67888331f34..2dc4332595c 100644
--- a/tests/cpp/test_task_graph.cpp
+++ b/tests/cpp/test_task_graph.cpp
@@ -201,8 +201,6 @@ TEST_F(TaskGraphTest, DifferentSizes) {
   data[7].size = 1;
   auto graph = TaskGraph(tasks, data);
 
-  std::cout << graph << std::endl;
-
   const TaskGraph::SortResult result = graph.findOptimalOrder();
 
   ASSERT_EQ(result.steps.size(), tasks.size());
@@ -211,51 +209,61 @@ TEST_F(TaskGraphTest, DifferentSizes) {
   EXPECT_EQ(result.steps.back().high_water_mark, 30);
 }
 
-TEST_F(TaskGraphTest, DifferentSizesRestartBranch) {
-  //   0
-  //  / \
-  // 1   6
-  // |   |
-  // 2*  7
-  // |   |
-  // 3   8*
-  // |   |
-  // 4*  9
-  // |   |
-  // 5  10
-  //  \ /
-  //  11
-  //
+// This is the example from Figure 1 of Kayaaslan et al. 2018
+// It includes temporary space needed for each task.
+// This is a candidate for the Liu algorithm instead of brute force search.
+// https://doi.org/10.1016/j.tcs.2017.09.037
+TEST_F(TaskGraphTest, InTree) {
+  // 0 3
+  // | |
+  // 1 4 7
+  // | | |
+  // 2 5 8
+  //  \| |
+  //   6 9
+  //    \|
+  //    10
   // The starred nodes are smaller than the others
   Tasks tasks{
-      {{0}, {1}}, // Task 0
-      {{1}, {2}}, // Task 1
-      {{2}, {3}}, // Task 2
-      {{3}, {4}}, // Task 3
-      {{4}, {5}}, // Task 4
-      {{0}, {6}}, // Task 5
-      {{6}, {7}}, // Task 6
-      {{7}, {8}}, // Task 7
-      {{8}, {9}}, // Task 8
-      {{9}, {10}}, // Task 9
-      {{5, 10}, {11}} // Task 10
+      {{0}, {1}},     // Task 0
+      {{1}, {2}},     // Task 1
+      {{3}, {4}},     // Task 2
+      {{4}, {5}},     // Task 3
+      {{2, 5}, {6}},  // Task 4
+      {{7}, {8}},     // Task 5
+      {{8}, {9}},     // Task 6
+      {{6, 9}, {10}}, // Task 7
   };
   std::vector<TaskGraph::Data> data = inferData(tasks);
-  for (auto& di : data) {
-    di.size = 10;
-  }
-  data[2].size = 5;
-  data[4].size = 5;
-  data[8].size = 5;
+  data[0].size = 1; // input
+  data[1].size = 4;
+  data[2].size = 1;
+  data[3].size = 1; // input
+  data[4].size = 2;
+  data[5].size = 2;
+  data[6].size = 2;
+  data[7].size = 1; // input
+  data[8].size = 1;
+  data[9].size = 5;
+  data[10].size = 1;
+  tasks[0].temp_space = 4; // A
+  tasks[1].temp_space = 3; // B
+  tasks[2].temp_space = 1; // C
+  tasks[3].temp_space = 2; // D
+  tasks[4].temp_space = 2; // E
+  tasks[5].temp_space = 8; // F
+  tasks[6].temp_space = 2; // G
+  tasks[7].temp_space = 1; // H
 
   auto graph = TaskGraph(tasks, data);
 
   const TaskGraph::SortResult result = graph.findOptimalOrder();
 
   ASSERT_EQ(result.steps.size(), tasks.size());
-  std::vector<TaskGraph::TaskId> expected{5, 6, 0, 1, 7, 8, 9, 2, 3, 4, 10};
-  EXPECT_EQ(getTasks(result), expected);
-  EXPECT_EQ(result.steps.back().high_water_mark, 30);
+  // By Kayaaslan et al. 2018, Sn 3.1,
+  // one optimal order is F A B C D E G H which has cost 34
+  // There are others with the same cost such as F C D A B E G H
+  EXPECT_EQ(result.steps.back().high_water_mark, 34);
 }
 
 } // namespace nvfuser

From cdf85988639afd678cd7d57e66f2763434309a3b Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 20 Aug 2025 20:40:51 -0400
Subject: [PATCH 35/71] Introduce time limit for sorting

---
 csrc/graph/task_graph.cpp | 29 +++++++++++++++++++----------
 csrc/graph/task_graph.h   |  2 +-
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index ae13198eecb..c9a5c7d6660 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -10,6 +10,7 @@
 #include <utils.h>
 
 #include <algorithm>
+#include <chrono>
 #include <limits>
 #include <set>
 #include <sstream>
@@ -148,10 +149,10 @@ namespace {
 //! c.f. https://en.wikipedia.org/wiki/Topological_sorting#Kahn's_algorithm
 class TaskSorter {
  public:
-  TaskSorter(const TaskGraph& graph, bool validate, int64_t max_iters)
+  TaskSorter(const TaskGraph& graph, bool validate, int64_t max_time_us)
       : graph_(graph),
         validate_(validate),
-        max_iters_(max_iters),
+        max_time_us_(max_time_us),
         has_aliasing_(std::ranges::any_of(
             arange(graph.numData()),
             [&graph](TaskGraph::DataId data_id) {
@@ -324,9 +325,20 @@ class TaskSorter {
 
     // This is the main optimization loop
     TaskGraph::TaskId backtracked_task_id = -1;
-    int64_t iter = 0;
-    while (iter < max_iters_) {
-      iter++;
+
+    using Clock = std::chrono::steady_clock; // monotonic: safe for timeouts
+    Clock::time_point start = Clock::now();
+
+    for (int64_t iter : arange(10000000)) {
+      if (iter % 64 == 0) {
+        Clock::time_point end = Clock::now();
+        if (std::chrono::duration_cast<std::chrono::microseconds>(end - start)
+                .count() > max_time_us_) {
+          result_.iterations = iter;
+          break;
+        }
+      }
+
       NVF_ERROR(
           !ready_tasks_.empty() || steps_.size() == (size_t)graph_.numTasks(),
           "Ran out of ready tasks before completing ordering");
@@ -372,7 +384,6 @@ class TaskSorter {
         best_steps = steps_;
       }
     }
-    result_.iterations = iter;
 
     // Record our best found steps
     result_.steps = best_steps;
@@ -385,7 +396,7 @@ class TaskSorter {
  private:
   const TaskGraph& graph_;
   const bool validate_;
-  const int64_t max_iters_;
+  const int64_t max_time_us_;
 
   //! This allows us to skip aliasing checks in the common case where no inputs
   //! are aliased by outputs
@@ -474,9 +485,7 @@ std::string TaskGraph::toString() const {
 }
 
 TaskGraph::SortResult TaskGraph::findOptimalOrder() const {
-  // TODO: Find a reasonable default number of iterations. Note that one
-  // iteration equals one task, not one ordering
-  TaskSorter sorter(*this, /*validate=*/true, /*max_iters=*/2000);
+  TaskSorter sorter(*this, /*validate=*/true, /*max_time_us=*/100000);
   return sorter.result();
 }
 
diff --git a/csrc/graph/task_graph.h b/csrc/graph/task_graph.h
index 195851dcc30..060716451f4 100644
--- a/csrc/graph/task_graph.h
+++ b/csrc/graph/task_graph.h
@@ -122,7 +122,7 @@ class TaskGraph {
 
     //! Whether the search was exhaustive. If not, then it was likely cut off
     //! early because of an iteration limit.
-    bool exhaustive;
+    bool exhaustive = false;
 
     std::string toString() const;
   };

From d5951f3cabd35914cccaed7f785513e68298fda3 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 20 Aug 2025 20:44:35 -0400
Subject: [PATCH 36/71] lintrunner tests

---
 tests/cpp/test_task_graph.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/cpp/test_task_graph.cpp b/tests/cpp/test_task_graph.cpp
index 2dc4332595c..17dfceb902a 100644
--- a/tests/cpp/test_task_graph.cpp
+++ b/tests/cpp/test_task_graph.cpp
@@ -225,13 +225,13 @@ TEST_F(TaskGraphTest, InTree) {
   //    10
   // The starred nodes are smaller than the others
   Tasks tasks{
-      {{0}, {1}},     // Task 0
-      {{1}, {2}},     // Task 1
-      {{3}, {4}},     // Task 2
-      {{4}, {5}},     // Task 3
-      {{2, 5}, {6}},  // Task 4
-      {{7}, {8}},     // Task 5
-      {{8}, {9}},     // Task 6
+      {{0}, {1}}, // Task 0
+      {{1}, {2}}, // Task 1
+      {{3}, {4}}, // Task 2
+      {{4}, {5}}, // Task 3
+      {{2, 5}, {6}}, // Task 4
+      {{7}, {8}}, // Task 5
+      {{8}, {9}}, // Task 6
       {{6, 9}, {10}}, // Task 7
   };
   std::vector<TaskGraph::Data> data = inferData(tasks);

From 648053071db84227b74c49341c8da996237c37cc Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Thu, 21 Aug 2025 07:05:23 -0400
Subject: [PATCH 37/71] Change diagrams to proper multiline comments

---
 tests/cpp/test_task_graph.cpp | 80 ++++++++++++++++++++---------------
 1 file changed, 45 insertions(+), 35 deletions(-)

diff --git a/tests/cpp/test_task_graph.cpp b/tests/cpp/test_task_graph.cpp
index 17dfceb902a..015a929ac5f 100644
--- a/tests/cpp/test_task_graph.cpp
+++ b/tests/cpp/test_task_graph.cpp
@@ -64,11 +64,13 @@ std::vector<TaskGraph::TaskId> getTasks(const TaskGraph::SortResult& result) {
 }
 
 TEST_F(TaskGraphTest, Basic) {
-  //   0   1
-  //   |\ /
-  //   | 2
-  //   |/
-  //   3
+  /*
+   *   0   1
+   *   |\ /
+   *   | 2
+   *   |/
+   *   3
+   */
   Tasks tasks{{{0, 1}, {2}}, {{0, 2}, {3}}};
   std::vector<TaskGraph::Data> data = inferData(tasks);
   auto graph = TaskGraph(tasks, data);
@@ -83,12 +85,15 @@ TEST_F(TaskGraphTest, Basic) {
 
 // This example includes two segments, each of which aliases the other
 TEST_F(TaskGraphTest, ImpossibleAlias) {
-  //   0   1
-  //   |\ /|
-  //   | X |
-  //   |/ \|
-  //   2   3
-  // Two tasks, each takes the same two inputs
+  /*
+   *   0   1
+   *   |\ /|
+   *   | X |
+   *   |/ \|
+   *   2   3
+   *
+   * Two tasks, each takes the same two inputs
+   */
   Tasks tasks{{{0, 1}, {2}}, {{0, 1}, {3}}};
   std::vector<TaskGraph::Data> data = inferData(tasks);
   // Each of the segment outputs aliases a different input
@@ -141,11 +146,13 @@ TEST_F(TaskGraphTest, ThreeCycle) {
 }
 
 TEST_F(TaskGraphTest, FreeableIntermediate) {
-  //   0
-  //  /|\
-  // 1 2 3
-  //     |
-  //     4
+  /*
+   *     0
+   *    /|\
+   *   1 2 3
+   *       |
+   *       4
+   */
   Tasks tasks{
       {{0}, {1}}, // Task 0
       {{0}, {2}}, // Task 1
@@ -166,15 +173,17 @@ TEST_F(TaskGraphTest, FreeableIntermediate) {
 }
 
 TEST_F(TaskGraphTest, DifferentSizes) {
-  //   0
-  //  / \
-  // 1   4
-  // |   |
-  // 2   5
-  // |   |
-  // 3   6
-  //  \ /
-  //   7
+  /*
+   *     0
+   *    / \
+   *   1   4
+   *   |   |
+   *   2   5
+   *   |   |
+   *   3   6
+   *    \ /
+   *     7
+   */
   Tasks tasks{
       {{0}, {1}}, // Task 0
       {{1}, {2}}, // Task 1
@@ -214,16 +223,17 @@ TEST_F(TaskGraphTest, DifferentSizes) {
 // This is a candidate for the Liu algorithm instead of brute force search.
 // https://doi.org/10.1016/j.tcs.2017.09.037
 TEST_F(TaskGraphTest, InTree) {
-  // 0 3
-  // | |
-  // 1 4 7
-  // | | |
-  // 2 5 8
-  //  \| |
-  //   6 9
-  //    \|
-  //    10
-  // The starred nodes are smaller than the others
+  /*
+   *   0 3
+   *   | |
+   *   1 4 7
+   *   | | |
+   *   2 5 8
+   *    \| |
+   *     6 9
+   *      \|
+   *      10
+   */
   Tasks tasks{
       {{0}, {1}}, // Task 0
       {{1}, {2}}, // Task 1

From 42fcc64bb630d329b299fb08880c5c8e363b438c Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Thu, 21 Aug 2025 08:54:22 -0400
Subject: [PATCH 38/71] Add NVFUSER_DUMP=task_graph

---
 csrc/options.cpp | 1 +
 csrc/options.h   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/csrc/options.cpp b/csrc/options.cpp
index 5918444feba..470fcc6e1cd 100644
--- a/csrc/options.cpp
+++ b/csrc/options.cpp
@@ -145,6 +145,7 @@ std::unordered_map<DebugDumpOption, std::vector<std::string>> Options<
       {"segmented_fusion", DebugDumpOption::FusionSegments},
       {"segmenter_logging", DebugDumpOption::FusionSegmenterLog},
       {"scheduler_params", DebugDumpOption::SchedulerDebug},
+      {"task_graph", DebugDumpOption::TaskGraph},
       {"dynamic_shared_memory", DebugDumpOption::DynamicSharedMemory},
       {"scheduler_verbose", DebugDumpOption::SchedulerVerbose},
       {"sync_map", DebugDumpOption::SyncMap},
diff --git a/csrc/options.h b/csrc/options.h
index 1f6d6d80249..f5bb2296f5b 100644
--- a/csrc/options.h
+++ b/csrc/options.h
@@ -73,6 +73,7 @@ enum class DebugDumpOption {
   PythonFrontendDebug, //! Python Frontend debug information.
   TransformPropagator, //! When running TransformPropagator, print propagation
                        //! path and replay result
+  TaskGraph, //! Print information about TaskGraph used in segmentation ordering
   Cubin, //! Dump compiled CUBIN
   Sass, //! Dump disassembled SASS
   SassToFile, //!< Dump disassembled SASS to File

From 41aa1544b3bf689acf0e10521b1f9a01a5660972 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Thu, 21 Aug 2025 08:54:46 -0400
Subject: [PATCH 39/71] Add mermaid printing when NVFUSER_DUMP=task_graph is
 given

---
 csrc/.cursor/rules        |   0
 csrc/graph/task_graph.cpp | 103 +++++++++++++++++++++++++++++++++++++-
 csrc/graph/task_graph.h   |   3 ++
 3 files changed, 104 insertions(+), 2 deletions(-)
 create mode 100644 csrc/.cursor/rules

diff --git a/csrc/.cursor/rules b/csrc/.cursor/rules
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index c9a5c7d6660..036c9427b15 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -5,8 +5,10 @@
  * SPDX-License-Identifier: BSD-3-Clause
  */
 // clang-format on
+#include <debug.h>
 #include <exceptions.h>
 #include <graph/task_graph.h>
+#include <options.h>
 #include <utils.h>
 
 #include <algorithm>
@@ -15,6 +17,7 @@
 #include <set>
 #include <sstream>
 #include <string>
+#include "options.h"
 
 namespace nvfuser {
 
@@ -69,6 +72,88 @@ TaskGraph::TaskGraph(
   }
 }
 
+std::string TaskGraph::toMermaid() const {
+  std::stringstream ss;
+
+  ss << "flowchart TD\n";
+
+  bool print_data_size = false;
+  if (numData() > 0) {
+    Size sz = data_.front().size;
+    for (const Data& data : data_) {
+      if (data.size != sz) {
+        print_data_size = true;
+        break;
+      }
+    }
+  }
+
+  std::vector<bool> is_aliased_input(numData(), false);
+
+  // Declare nodes with shapes and labels
+  for (const auto& [data_id, data] : enumerate(data_)) {
+    if (data.aliases_input.has_value()) {
+      is_aliased_input.at(data.aliases_input.value()) = true;
+    }
+    ss << "    d" << data_id << "([\"d" << data_id;
+    if (print_data_size) {
+      ss << " [" << data.size << "]";
+    }
+    ss << "\"]);\n";
+  }
+  for (const auto& [task_id, task] : enumerate(tasks_)) {
+    if (task.temp_space != 0) {
+      ss << "    t" << task_id << "[\"t" << task_id << " [" << task.temp_space
+         << "]\"];\n";
+    }
+  }
+
+  for (const auto& [task_id, task] : enumerate(tasks_)) {
+    for (const DataId& input_id : task.inputs) {
+      ss << "    d" << input_id << " --> t" << task_id << "\n";
+    }
+    for (const DataId& output_id : task.outputs) {
+      ss << "    t" << task_id << " --> d" << output_id << "\n";
+    }
+  }
+
+  ss << "\n";
+  ss << "    classDef task fill:orange;\n";
+  ss << "    classDef data fill:lightblue;\n";
+  ss << "    classDef dataInput fill:lightgreen;\n";
+  ss << "    classDef dataOutput fill:pink;\n";
+  ss << "    classDef aliasedInput fill:yellow;\n";
+  ss << "    classDef aliasEdge stroke-dasharray:3,stroke:blue;\n";
+
+  ss << "\n";
+  for (const TaskId task_id : arange(numTasks())) {
+    ss << "    class t" << task_id << " task;\n";
+  }
+  ss << "\n";
+  for (const auto& [data_id, data] : enumerate(data_)) {
+    // Create edges for aliases
+    if (data.aliases_input.has_value()) {
+      ss << "    d" << data_id << " alias" << data_id << "@--> d"
+         << data.aliases_input.value() << ";\n";
+      ss << "    class alias" << data_id << " aliasEdge;\n";
+    }
+
+    std::string class_name = "data";
+    if (!data.definition.has_value()) {
+      if (is_aliased_input.at(data_id)) {
+        class_name = "aliasedInput";
+      } else {
+        class_name = "dataInput";
+      }
+    } else if (!data.can_free) {
+      class_name = "dataOutput";
+    }
+    ss << "    class d" << data_id << " " << class_name << ";\n";
+  }
+
+  return ss.str();
+}
+
 void TaskGraph::validateSteps(const std::vector<Step>& steps) const {
   // First find any Data in the graph that has no definition. This must be
   // preallocated before running the program, so we initialize allocated and
@@ -149,8 +234,13 @@ namespace {
 //! c.f. https://en.wikipedia.org/wiki/Topological_sorting#Kahn's_algorithm
 class TaskSorter {
  public:
-  TaskSorter(const TaskGraph& graph, bool validate, int64_t max_time_us)
+  TaskSorter(
+      const TaskGraph& graph,
+      bool validate,
+      int64_t max_time_us,
+      bool print_debug)
       : graph_(graph),
+        debug_(print_debug),
         validate_(validate),
         max_time_us_(max_time_us),
         has_aliasing_(std::ranges::any_of(
@@ -158,6 +248,10 @@ class TaskSorter {
             [&graph](TaskGraph::DataId data_id) {
               return graph.getData(data_id).aliases_input.has_value();
             })) {
+    if (debug_) {
+      debug() << graph.toString() << "\n\n";
+      debug() << "Mermaid graph:\n" << graph.toMermaid() << std::endl;
+    }
     sort();
   }
 
@@ -395,6 +489,7 @@ class TaskSorter {
 
  private:
   const TaskGraph& graph_;
+  const bool debug_;
   const bool validate_;
   const int64_t max_time_us_;
 
@@ -485,7 +580,11 @@ std::string TaskGraph::toString() const {
 }
 
 TaskGraph::SortResult TaskGraph::findOptimalOrder() const {
-  TaskSorter sorter(*this, /*validate=*/true, /*max_time_us=*/100000);
+  TaskSorter sorter(
+      *this,
+      /*validate=*/true,
+      /*max_time_us=*/100000,
+      /*debug=*/isDebugDumpEnabled(DebugDumpOption::TaskGraph));
   return sorter.result();
 }
 
diff --git a/csrc/graph/task_graph.h b/csrc/graph/task_graph.h
index 060716451f4..98506319ce5 100644
--- a/csrc/graph/task_graph.h
+++ b/csrc/graph/task_graph.h
@@ -134,6 +134,9 @@ class TaskGraph {
 
   std::string toString() const;
 
+  //! Generates a string in the mermaid language for rendering online
+  std::string toMermaid() const;
+
  private:
   const std::vector<Task> tasks_;
   const std::vector<Data> data_;

From 9493cf059d6ebe7105c36b557a05bb11af55dd99 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Thu, 21 Aug 2025 08:55:01 -0400
Subject: [PATCH 40/71] Add more tests. SharedIntermediateWithAlias is failing
 to sort

---
 tests/cpp/test_task_graph.cpp | 54 +++++++++++++++++++++++++++++++----
 1 file changed, 49 insertions(+), 5 deletions(-)

diff --git a/tests/cpp/test_task_graph.cpp b/tests/cpp/test_task_graph.cpp
index 015a929ac5f..28fc9630aae 100644
--- a/tests/cpp/test_task_graph.cpp
+++ b/tests/cpp/test_task_graph.cpp
@@ -17,11 +17,6 @@ namespace nvfuser {
 using Tasks = std::vector<TaskGraph::Task>;
 using TaskGraphTest = NVFuserTest;
 
-struct SimpleAlias {
-  TaskGraph::DataId output;
-  TaskGraph::DataId input;
-};
-
 std::vector<TaskGraph::Data> inferData(const Tasks& tasks) {
   // Find number of data items so we can resize
   TaskGraph::DataId max_data_id = 0;
@@ -83,6 +78,55 @@ TEST_F(TaskGraphTest, Basic) {
   EXPECT_EQ(result.steps.back().high_water_mark, 4);
 }
 
+TEST_F(TaskGraphTest, SharedIntermediate) {
+  /*
+   *     0
+   *    /|\
+   *   | 1 |
+   *   |/ \|
+   *   2   3
+   */
+  Tasks tasks{
+      {{0}, {1}},
+      {{0, 1}, {2}},
+      {{0, 1}, {3}},
+  };
+  std::vector<TaskGraph::Data> data = inferData(tasks);
+  auto graph = TaskGraph(tasks, data);
+
+  const TaskGraph::SortResult result = graph.findOptimalOrder();
+
+  ASSERT_EQ(result.steps.size(), tasks.size());
+  // Either 0 1 2 or 0 2 1 are acceptable orders
+  EXPECT_EQ(result.steps.back().high_water_mark, 4);
+}
+
+TEST_F(TaskGraphTest, SharedIntermediateWithAlias) {
+  /*
+   *     0
+   *    /|\
+   *   | 1 |
+   *   |/ \|
+   *   2   3
+   */
+  Tasks tasks{
+      {{0}, {1}}, // Task 0
+      {{0, 1}, {2}}, // Task 1
+      {{0, 1}, {3}}, // Task 2
+  };
+  std::vector<TaskGraph::Data> data = inferData(tasks);
+  data.at(3).aliases_input = 0;
+  auto graph = TaskGraph(tasks, data);
+
+  const TaskGraph::SortResult result = graph.findOptimalOrder();
+
+  ASSERT_EQ(result.steps.size(), tasks.size());
+  // Due to the alias 0 1 2 is the only acceptable order
+  std::vector<TaskGraph::TaskId> expected{0, 2, 1};
+  EXPECT_EQ(getTasks(result), expected);
+  EXPECT_EQ(result.steps.back().high_water_mark, 4);
+}
+
 // This example includes two segments, each of which aliases the other
 TEST_F(TaskGraphTest, ImpossibleAlias) {
   /*

From b23274d07319b1acb288a301d194cc34b20502ca Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 22 Aug 2025 10:29:08 -0400
Subject: [PATCH 41/71] Convert graphs with aliases

---
 csrc/graph/task_graph.cpp     | 126 +++++++++++++++++++++++++++++-----
 csrc/graph/task_graph.h       |  11 +++
 tests/cpp/test_task_graph.cpp |  32 ++++++---
 3 files changed, 142 insertions(+), 27 deletions(-)

diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index 036c9427b15..0ec28260bb0 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -79,9 +79,16 @@ std::string TaskGraph::toMermaid() const {
 
   bool print_data_size = false;
   if (numData() > 0) {
-    Size sz = data_.front().size;
+    Size sz = -1;
     for (const Data& data : data_) {
-      if (data.size != sz) {
+      if (data.size == 0) {
+        continue;
+      }
+      if (sz == -1) {
+        sz = data.size;
+        continue;
+      }
+      if (data.size != 0 && data.size != sz) {
         print_data_size = true;
         break;
       }
@@ -96,7 +103,9 @@ std::string TaskGraph::toMermaid() const {
       is_aliased_input.at(data.aliases_input.value()) = true;
     }
     ss << "    d" << data_id << "([\"d" << data_id;
-    if (print_data_size) {
+    if (print_data_size || data.size == 0) {
+      // Print data size if there are different sized data elements. Always
+      // print [0] for empty data (these will be shown in gray)
       ss << " [" << data.size << "]";
     }
     ss << "\"]);\n";
@@ -122,6 +131,7 @@ std::string TaskGraph::toMermaid() const {
   ss << "    classDef data fill:lightblue;\n";
   ss << "    classDef dataInput fill:lightgreen;\n";
   ss << "    classDef dataOutput fill:pink;\n";
+  ss << "    classDef dataEmpty fill:#EEE,stroke:#DDD,color:#999;\n";
   ss << "    classDef aliasedInput fill:yellow;\n";
   ss << "    classDef aliasEdge stroke-dasharray:3,stroke:blue;\n";
 
@@ -147,6 +157,8 @@ std::string TaskGraph::toMermaid() const {
       }
     } else if (!data.can_free) {
       class_name = "dataOutput";
+    } else if (data.size == 0) {
+      class_name = "dataEmpty";
     }
     ss << "    class d" << data_id << " " << class_name << ";\n";
   }
@@ -176,17 +188,17 @@ void TaskGraph::validateSteps(const std::vector<Step>& steps) const {
 
     // Allocate outputs
     for (const DataId output_id : task.outputs) {
-      const Data& data = getData(output_id);
-      if (data.aliases_input.has_value()) {
+      const Data& output = getData(output_id);
+      if (output.aliases_input.has_value()) {
         // Check that the aliased input has no further uses
         // Note that we will decrement this use count later in this function
         NVF_ERROR(
-            num_uses_.at((size_t)data.aliases_input.value()) == 1,
+            future_uses.at((size_t)output.aliases_input.value()) == 1,
             "Tried to execute segment that would overwrite input alias before "
             "some of its uses");
       } else {
         // Don't allocate outputs if they are reusing input memory
-        allocated += data.size;
+        allocated += output.size;
       }
     }
 
@@ -195,6 +207,7 @@ void TaskGraph::validateSteps(const std::vector<Step>& steps) const {
 
     // This is the most space we will use, so update high water mark here
     high_water_mark = std::max(high_water_mark, allocated);
+
     NVF_ERROR(
         step.high_water_mark == high_water_mark,
         "Mismatch in high water mark during validation");
@@ -224,6 +237,63 @@ void TaskGraph::validateSteps(const std::vector<Step>& steps) const {
   }
 }
 
+TaskGraph TaskGraph::convertAliasesToDependencies() const {
+  // Begin with a copy of the tasks and data
+  std::vector<Task> tasks{tasks_};
+  std::vector<Data> data{data_};
+
+  // This is used to ensure we don't have multiple aliases of the same input
+  std::unordered_set<DataId> aliased_inputs;
+
+  // NOTE: data grows below; references into it may dangle after emplace_back
+
+  for (TaskId task_id : arange((TaskId)tasks.size())) {
+    Task& task = tasks.at(task_id);
+    for (DataId output_id : task.outputs) {
+      Data& output = data.at((size_t)output_id);
+      if (output.aliases_input.has_value()) {
+        const DataId alias_id = output.aliases_input.value();
+        // Reset the aliases_input flag before modifying the data vector
+        output.aliases_input = std::nullopt;
+        Data& alias = data.at((size_t)alias_id);
+        NVF_ERROR_EQ(
+            output.size,
+            alias.size,
+            "Expected output to have same size as its aliased input");
+        // Reset to unaliased and set size to zero
+        output.size = 0;
+
+        NVF_ERROR(
+            !aliased_inputs.contains(alias_id),
+            "Found multiple outputs aliasing the same input");
+        aliased_inputs.insert(alias_id);
+
+        // Add a zero-size output to every other use of the aliased input and
+        // feed it to this task. Loop over a copy: emplace_back may move alias.
+        for (TaskId use_id : std::vector<TaskId>(alias.uses)) {
+          if (use_id == task_id) {
+            continue;
+          }
+          Task& use = tasks.at((size_t)use_id);
+
+          auto dummy_data_id = (DataId)data.size();
+          data.emplace_back(
+              /*definition=*/std::optional<TaskId>{use_id},
+              /*uses=*/std::vector<TaskId>{task_id},
+              /*aliases_input=*/std::nullopt,
+              /*size=*/0,
+              /*can_free=*/true);
+
+          use.outputs.push_back(dummy_data_id);
+          task.inputs.push_back(dummy_data_id);
+        }
+      }
+    }
+  }
+
+  return {tasks, data};
+}
+
 namespace {
 
 //! [Backtracking algorithm to find optimal topological ordering]
@@ -239,18 +309,35 @@ class TaskSorter {
       bool validate,
       int64_t max_time_us,
       bool print_debug)
-      : graph_(graph),
+      : orig_graph_(graph),
+        graph_(graph.convertAliasesToDependencies()),
         debug_(print_debug),
         validate_(validate),
-        max_time_us_(max_time_us),
-        has_aliasing_(std::ranges::any_of(
-            arange(graph.numData()),
-            [&graph](TaskGraph::DataId data_id) {
-              return graph.getData(data_id).aliases_input.has_value();
-            })) {
+        max_time_us_(max_time_us) {
     if (debug_) {
-      debug() << graph.toString() << "\n\n";
-      debug() << "Mermaid graph:\n" << graph.toMermaid() << std::endl;
+      has_aliasing_ = std::ranges::any_of(
+          arange(orig_graph_.numData()), [&](TaskGraph::DataId data_id) {
+            return orig_graph_.getData(data_id).aliases_input.has_value();
+          });
+      if (has_aliasing_) {
+        debug() << "Aliasing detected in task graph. Original graph:\n";
+        debug() << orig_graph_.toString() << "\n\n";
+        if (hasDebugDumpArgument(DebugDumpOption::TaskGraph, "mermaid")) {
+          debug() << "Original graph (mermaid):\n"
+                  << orig_graph_.toMermaid() << std::endl;
+        }
+        debug() << "Modified graph without aliasing:\n";
+        debug() << graph_.toString() << "\n\n";
+        if (hasDebugDumpArgument(DebugDumpOption::TaskGraph, "mermaid")) {
+          debug() << "Modified graph (mermaid):\n"
+                  << graph_.toMermaid() << std::endl;
+        }
+      } else {
+        debug() << graph_.toString() << "\n\n";
+        if (hasDebugDumpArgument(DebugDumpOption::TaskGraph, "mermaid")) {
+          debug() << "Mermaid graph:\n" << graph_.toMermaid() << std::endl;
+        }
+      }
     }
     sort();
   }
@@ -262,7 +349,7 @@ class TaskSorter {
  private:
   inline void validate() const {
     if (validate_) {
-      graph_.validateSteps(steps_);
+      orig_graph_.validateSteps(steps_);
     }
   }
 
@@ -488,14 +575,15 @@ class TaskSorter {
   }
 
  private:
-  const TaskGraph& graph_;
+  const TaskGraph& orig_graph_;
+  const TaskGraph graph_;
   const bool debug_;
   const bool validate_;
   const int64_t max_time_us_;
 
   //! This allows us to skip aliasing checks in the common case where no inputs
   //! are aliased by outputs
-  const bool has_aliasing_ = false;
+  bool has_aliasing_ = false;
   //! This tells us which tasks overwrite one of their inputs. For these, we
   //! will need to check that the aliased input has no future uses before
   //! advancing to it.
diff --git a/csrc/graph/task_graph.h b/csrc/graph/task_graph.h
index 98506319ce5..8cf79468d95 100644
--- a/csrc/graph/task_graph.h
+++ b/csrc/graph/task_graph.h
@@ -127,6 +127,17 @@ class TaskGraph {
     std::string toString() const;
   };
 
+  //! This converts a graph that has aliases into one that has no aliases but
+  //! has the same task order dependencies and the same memory requirements for
+  //! any execution order. This is done by adding new Data nodes that have zero
+  //! size in order to enforce the constraint that the last use of an aliased
+  //! input must be the one that overwrites that input.
+  //!
+  //! This conversion is mainly used to simplify algorithms so that they can
+  //! guarantee the aliasing condition without needing to explicitly handle
+  //! aliasing.
+  TaskGraph convertAliasesToDependencies() const;
+
   //! This does an exhaustive search of all possible orderings using a modified
   //! Kahn's algorithm to efficiently traverse the set of possible topological
   //! orderings.
diff --git a/tests/cpp/test_task_graph.cpp b/tests/cpp/test_task_graph.cpp
index 28fc9630aae..21f453e7506 100644
--- a/tests/cpp/test_task_graph.cpp
+++ b/tests/cpp/test_task_graph.cpp
@@ -115,16 +115,32 @@ TEST_F(TaskGraphTest, SharedIntermediateWithAlias) {
       {{0, 1}, {3}}, // Task 2
   };
   std::vector<TaskGraph::Data> data = inferData(tasks);
-  data.at(3).aliases_input = 0;
-  auto graph = TaskGraph(tasks, data);
 
-  const TaskGraph::SortResult result = graph.findOptimalOrder();
+  {
+    data.at(2).aliases_input = std::nullopt;
+    data.at(3).aliases_input = 0;
+    auto graph = TaskGraph(tasks, data);
+    const TaskGraph::SortResult result = graph.findOptimalOrder();
+
+    ASSERT_EQ(result.steps.size(), tasks.size());
+    // Due to the alias 0 1 2 is the only acceptable order
+    std::vector<TaskGraph::TaskId> expected{0, 1, 2};
+    EXPECT_EQ(getTasks(result), expected);
+    EXPECT_EQ(result.steps.back().high_water_mark, 3);
+  }
 
-  ASSERT_EQ(result.steps.size(), tasks.size());
-  // Due to the alias 0 1 2 is the only acceptable order
-  std::vector<TaskGraph::TaskId> expected{0, 2, 1};
-  EXPECT_EQ(getTasks(result), expected);
-  EXPECT_EQ(result.steps.back().high_water_mark, 4);
+  { // When 2 aliases the input instead, we should switch the order
+    data.at(2).aliases_input = 0;
+    data.at(3).aliases_input = std::nullopt;
+    auto graph = TaskGraph(tasks, data);
+    const TaskGraph::SortResult result = graph.findOptimalOrder();
+
+    ASSERT_EQ(result.steps.size(), tasks.size());
+    // Now 0 2 1 is the only acceptable order
+    std::vector<TaskGraph::TaskId> expected{0, 2, 1};
+    EXPECT_EQ(getTasks(result), expected);
+    EXPECT_EQ(result.steps.back().high_water_mark, 3);
+  }
 }
 
 // This example includes two segments, each of which aliases the other

From 95dcf4234a70c3a3bf4e3b96c3a6aaf8d1cf7b9c Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 22 Aug 2025 10:36:52 -0400
Subject: [PATCH 42/71] Remove code related to aliasing from TaskSorter

---
 csrc/graph/task_graph.cpp | 61 +++++++++------------------------------
 1 file changed, 13 insertions(+), 48 deletions(-)

diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index 0ec28260bb0..f5fd3c02898 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -298,6 +298,10 @@ namespace {
 
 //! [Backtracking algorithm to find optimal topological ordering]
 //!
+//! Note that if the input graph has aliases, we first convert to a graph
+//! without aliases but with more data links. This allows us to ignore aliasing
+//! when sorting the tasks, while maintaining the same memory usage analysis.
+//!
 //! If validate==true, then we will validate the steps vector after every
 //! backtracking step.
 //!
@@ -315,11 +319,11 @@ class TaskSorter {
         validate_(validate),
         max_time_us_(max_time_us) {
     if (debug_) {
-      has_aliasing_ = std::ranges::any_of(
+      const bool has_aliasing = std::ranges::any_of(
           arange(orig_graph_.numData()), [&](TaskGraph::DataId data_id) {
             return orig_graph_.getData(data_id).aliases_input.has_value();
           });
-      if (has_aliasing_) {
+      if (has_aliasing) {
         debug() << "Aliasing detected in task graph. Original graph:\n";
         debug() << orig_graph_.toString() << "\n\n";
         if (hasDebugDumpArgument(DebugDumpOption::TaskGraph, "mermaid")) {
@@ -375,10 +379,8 @@ class TaskSorter {
 
     for (const TaskGraph::DataId output_id : task.outputs) {
       const TaskGraph::Data& output = graph_.getData(output_id);
-      // Allocate outputs if not aliased
-      if (!output.aliases_input.has_value()) {
-        allocated += output.size;
-      }
+      // Allocate outputs
+      allocated += output.size;
 
       // Update outstanding_dependencies_ and ready_tasks_ for each use
       for (const TaskGraph::TaskId use_id : output.uses) {
@@ -439,44 +441,15 @@ class TaskSorter {
     return last_task_id;
   }
 
-  //! A task is ready if it has no outstanding_dependencies _and_ it is the last
-  //! use for all of its aliased inputs.
+  //! A task is ready if it has no outstanding_dependencies and it is the last
+  //! use for all of its aliased inputs. Note that since we convert to a graph
+  //! with no aliases in the constructor of this class, it is safe to assume
+  //! that there are no alias conflicts.
   bool taskIsReady(TaskGraph::TaskId task_id) const {
-    if (outstanding_dependencies_.at((size_t)task_id) != 0) {
-      return false;
-    }
-    if (!has_aliasing_ || !task_has_aliased_input_.at((size_t)task_id)) {
-      return true;
-    }
-    // The rest of this function is the aliasing dependency check
-    for (const TaskGraph::DataId output_id : arange(graph_.numData())) {
-      const TaskGraph::Data& output_data = graph_.getData(output_id);
-      if (output_data.aliases_input.has_value()) {
-        TaskGraph::DataId input_id = output_data.aliases_input.value();
-        // Check for future uses (beyond the current one)
-        if (future_uses_.at((size_t)input_id) > 1) {
-          return false;
-        }
-      }
-    }
-    return true;
+    return outstanding_dependencies_.at((size_t)task_id) == 0;
   }
 
   void sort() {
-    if (has_aliasing_) {
-      task_has_aliased_input_.resize(graph_.numTasks(), false);
-      for (const TaskGraph::DataId data_id : arange(graph_.numData())) {
-        const TaskGraph::Data& data = graph_.getData(data_id);
-        if (data.aliases_input.has_value()) {
-          NVF_ERROR(
-              data.definition.has_value(),
-              "Data that aliases input must have a definition");
-          task_has_aliased_input_.at(data.definition.value()) = true;
-          continue;
-        }
-      }
-    }
-
     // Set up outstanding_dependencies_, future_uses_, and ready_tasks_
     future_uses_.resize(graph_.numData(), 0);
     for (const TaskGraph::DataId data_id : arange(graph_.numData())) {
@@ -581,14 +554,6 @@ class TaskSorter {
   const bool validate_;
   const int64_t max_time_us_;
 
-  //! This allows us to skip aliasing checks in the common case where no inputs
-  //! are aliased by outputs
-  bool has_aliasing_ = false;
-  //! This tells us which tasks overwrite one of their inputs. For these, we
-  //! will need to check that the aliased input has no future uses before
-  //! advancing to it.
-  std::vector<bool> task_has_aliased_input_;
-
   TaskGraph::SortResult result_;
   std::vector<TaskGraph::Step> steps_;
 

From 3c5f0915ecf141dd8cf8559b9a920dba7dd9ba09 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 22 Aug 2025 10:37:54 -0400
Subject: [PATCH 43/71] Remove mistakenly added empty file

---
 csrc/.cursor/rules | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 csrc/.cursor/rules

diff --git a/csrc/.cursor/rules b/csrc/.cursor/rules
deleted file mode 100644
index e69de29bb2d..00000000000

From a52875fdfd2dc53d1c5db360e4498e3d7b5e199d Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 22 Aug 2025 11:52:36 -0400
Subject: [PATCH 44/71] Validate during backtracking only in testing.

---
 csrc/fusion_segmenter.cpp | 2 +-
 csrc/graph/task_graph.cpp | 6 +++---
 csrc/graph/task_graph.h   | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index b0b6bbdcab8..d0b85ab33ba 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -2120,7 +2120,7 @@ std::vector<SegmentedGroup*> optimalTopoSort(
   TaskGraph graph =
       SegmentedGroupTaskGraphConverter::convert(groups, runtime_info);
 
-  TaskGraph::SortResult result = graph.findOptimalOrder();
+  TaskGraph::SortResult result = graph.findOptimalOrder(/*validate=*/false);
 
   std::vector<SegmentedGroup*> order;
   order.reserve(groups.size());
diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index f5fd3c02898..6e6631940e2 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -328,7 +328,7 @@ class TaskSorter {
         debug() << orig_graph_.toString() << "\n\n";
         if (hasDebugDumpArgument(DebugDumpOption::TaskGraph, "mermaid")) {
           debug() << "Original graph (mermaid):\n"
-                  << graph_.toMermaid() << std::endl;
+                  << orig_graph_.toMermaid() << std::endl;
         }
         debug() << "Modified graph without aliasing:\n";
         debug() << graph_.toString() << "\n\n";
@@ -632,10 +632,10 @@ std::string TaskGraph::toString() const {
   return ss.str();
 }
 
-TaskGraph::SortResult TaskGraph::findOptimalOrder() const {
+TaskGraph::SortResult TaskGraph::findOptimalOrder(bool validate) const {
   TaskSorter sorter(
       *this,
-      /*validate=*/true,
+      validate,
       /*max_time_us=*/100000,
       /*debug=*/isDebugDumpEnabled(DebugDumpOption::TaskGraph));
   return sorter.result();
diff --git a/csrc/graph/task_graph.h b/csrc/graph/task_graph.h
index 8cf79468d95..9323fd55f42 100644
--- a/csrc/graph/task_graph.h
+++ b/csrc/graph/task_graph.h
@@ -141,7 +141,7 @@ class TaskGraph {
   //! This does an exhaustive search of all possible orderings using a modified
   //! Kahn's algorithm to efficiently traverse the set of possible topological
   //! orderings.
-  SortResult findOptimalOrder() const;
+  SortResult findOptimalOrder(bool validate = true) const;
 
   std::string toString() const;
 

From 589b99726e09683fb30fb6bf5de91c478e6de6f2 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 22 Aug 2025 15:50:54 -0400
Subject: [PATCH 45/71] Update stroke colors on mermaid plots

---
 csrc/graph/task_graph.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index 6e6631940e2..c65a553b2d0 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -127,12 +127,12 @@ std::string TaskGraph::toMermaid() const {
   }
 
   ss << "\n";
-  ss << "    classDef task fill:orange;\n";
-  ss << "    classDef data fill:lightblue;\n";
-  ss << "    classDef dataInput fill:lightgreen;\n";
-  ss << "    classDef dataOutput fill:pink;\n";
+  ss << "    classDef task fill:orange,stroke:darkorange;\n";
+  ss << "    classDef data fill:lightblue,stroke:blue;\n";
+  ss << "    classDef dataInput fill:lightgreen,stroke:green;\n";
+  ss << "    classDef dataOutput fill:pink,stroke:red;\n";
   ss << "    classDef dataEmpty fill:#EEE,stroke:#DDD,color:#999;\n";
-  ss << "    classDef aliasedInput fill:yellow;\n";
+  ss << "    classDef aliasedInput fill:yellow,stroke:yellow;\n";
   ss << "    classDef aliasEdge stroke-dasharray:3,stroke:blue;\n";
 
   ss << "\n";

From 55bbd7a9758fdd9b7aa3fc9df04cc9788ad836e7 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 3 Sep 2025 07:21:54 -0400
Subject: [PATCH 46/71] Only update best_steps if hwm is improved

---
 csrc/graph/task_graph.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index c65a553b2d0..9213969ff85 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -526,15 +526,19 @@ class TaskSorter {
 
       // If our high water mark is above best_usage, terminate early and
       // backtrack
-      if (!best_steps.empty() &&
-          steps_.back().high_water_mark > best_steps.back().high_water_mark) {
+      TaskGraph::Size hwm = steps_.back().high_water_mark;
+      TaskGraph::Size best_hwm = best_steps.empty()
+          ? std::numeric_limits<TaskGraph::Size>::max()
+          : best_steps.back().high_water_mark;
+
+      if (hwm > best_hwm) {
         backtracked_task_id = backtrack();
         continue;
       }
 
-      // Our usage is at or below best_usage. Have we completed an ordering? If
+      // Our usage is at or below best. Have we completed an ordering? If
       // so, update best_steps
-      if (steps_.size() == (size_t)graph_.numTasks()) {
+      if (steps_.size() == (size_t)graph_.numTasks() && hwm < best_hwm) {
         best_steps = steps_;
       }
     }

From 549124e85f7ca65f9cff59de2cf204d351b0a914 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 3 Sep 2025 09:02:24 -0400
Subject: [PATCH 47/71] Fix typo

---
 csrc/graph/task_graph.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index 9213969ff85..2f53e4ce1f8 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -64,7 +64,7 @@ TaskGraph::TaskGraph(
     }
     if (data.aliases_input.has_value()) {
       DataId a = data.aliases_input.value();
-      NVF_ERROR(a >= 0 && (size_t)a < tasks_.size());
+      NVF_ERROR(a >= 0 && (size_t)a < data_.size());
     }
     for (TaskId use : data.uses) {
       NVF_ERROR(use >= 0 && (size_t)use < tasks_.size());

From 595ce7998ce27cb55a72ae97238a6df6a75c15b5 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 3 Sep 2025 10:09:10 -0400
Subject: [PATCH 48/71] Skip looking up sizes for sharded inputs

---
 csrc/fusion_segmenter.cpp | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index ff368982ad6..a20301c748f 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -2058,7 +2058,7 @@ class SegmentedGroupTaskGraphConverter {
       int64_t numel = 1;
       if (runtime_info_ != nullptr) {
         // Get the actual size of the tensor allocation
-        if (tv->isFusionInput()) {
+        if (tv->isFusionInput() && !isSharded(tv)) {
           const std::vector<int64_t>& sizes =
               runtime_info_->getInputAllocationSizes(tv);
           const std::vector<int64_t>& strides =
@@ -2077,12 +2077,16 @@ class SegmentedGroupTaskGraphConverter {
           // Use ExpressionEvaluator for computed tensors assuming they are
           // contiguous
           for (IterDomain* id : tv->getMaybeAllocationDomain()) {
-            if (id->isBroadcast() || id->isReduction()) {
+            if (id->isBroadcast() || id->isReduction() || id->isDeviceDim()) {
               continue;
             }
-            numel *= runtime_info_->expressionEvaluator()
-                         .evaluate(id->extent())
-                         .as<int64_t>();
+            PolymorphicValue pv =
+                runtime_info_->expressionEvaluator().evaluate(id->extent());
+            // If we can't determine the size of this dimension, just assume
+            // it's 2. This way we will give precedence to tensors with
+            // allocation domains that have more concrete IDs.
+            int64_t dim_size = pv.is<int64_t>() ? pv.as<int64_t>() : 2;
+            numel *= dim_size;
           }
         }
       }

From d95e0bbd9b1d3743efa9a6d42143db2e04f6de48 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 5 Sep 2025 11:08:11 -0400
Subject: [PATCH 49/71] Handle CPU scalars properly

---
 csrc/fusion_segmenter.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index a20301c748f..d7dbe286b20 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -2056,7 +2056,11 @@ class SegmentedGroupTaskGraphConverter {
 
       // Assume all tensors the same shape if no runtime_info is given
       int64_t numel = 1;
-      if (runtime_info_ != nullptr) {
+      if (tv->isCpuScalar()) {
+        // runtime_info_ will not include sizes of GPU scalars and sine they do
+        // not result in any GPU allocation we count them as empty.
+        numel = 0;
+      } else if (runtime_info_ != nullptr) {
         // Get the actual size of the tensor allocation
         if (tv->isFusionInput() && !isSharded(tv)) {
           const std::vector<int64_t>& sizes =

From 9dc649f95c3c2ff4d56242dcd42f11cae4d27c13 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 5 Sep 2025 14:32:45 -0400
Subject: [PATCH 50/71] Finish DifferentSizes test

---
 tests/cpp/test_task_graph.cpp | 37 +++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/tests/cpp/test_task_graph.cpp b/tests/cpp/test_task_graph.cpp
index 21f453e7506..844a78a0b1a 100644
--- a/tests/cpp/test_task_graph.cpp
+++ b/tests/cpp/test_task_graph.cpp
@@ -232,6 +232,11 @@ TEST_F(TaskGraphTest, FreeableIntermediate) {
   EXPECT_EQ(result.steps.back().high_water_mark, 4);
 }
 
+// This is a parallel chains graph, the optimal schedule should cut this into an
+// out-tree and an in-tree with the cut placed at the local minimum of the
+// hill-valley representation of each chain.
+// See Kayaaslan et al. 2018
+// https://doi.org/10.1016/j.tcs.2017.09.037
 TEST_F(TaskGraphTest, DifferentSizes) {
   /*
    *     0
@@ -241,8 +246,10 @@ TEST_F(TaskGraphTest, DifferentSizes) {
    *   2   5
    *   |   |
    *   3   6
+   *   |   |
+   *   |   7
    *    \ /
-   *     7
+   *     8
    */
   Tasks tasks{
       {{0}, {1}}, // Task 0
@@ -251,31 +258,35 @@ TEST_F(TaskGraphTest, DifferentSizes) {
       {{0}, {4}}, // Task 3
       {{4}, {5}}, // Task 4
       {{5}, {6}}, // Task 5
-      {{3, 6}, {7}} // Task 6
+      {{6}, {7}}, // Task 6
+      {{3, 7}, {8}} // Task 7
   };
   std::vector<TaskGraph::Data> data = inferData(tasks);
   data[0].size = 1;
 
-  data[1].size = 12;
-  data[2].size = 10;
-  data[3].size = 10;
+  data[1].size = 15;
+  data[2].size = 7; // hill-valley = 8
+  data[3].size = 11;
 
-  // Note that 4 and 5  are large but that 6 is smaller than the others, so we
-  // should compute up to here then start on the 0-1-2-3 branch after freeing 4
-  // and 5. Otherwise we would need to hold
-  data[4].size = 11;
+  data[4].size = 10;
   data[5].size = 11;
-  data[6].size = 7;
+  data[6].size = 7; // hill-valley = 4
+  data[7].size = 8;
 
-  data[7].size = 1;
+  data[8].size = 1;
   auto graph = TaskGraph(tasks, data);
 
   const TaskGraph::SortResult result = graph.findOptimalOrder();
 
+  // The local minima are at data 2 and 6, so we should compute up to each first
+  // then compute the end parts afterward.
+
   ASSERT_EQ(result.steps.size(), tasks.size());
-  std::vector<TaskGraph::TaskId> expected{3, 4, 5, 0, 1, 2, 6};
+  std::vector<TaskGraph::TaskId> expected{0, 1, 3, 4, 5, 2, 6, 7};
   EXPECT_EQ(getTasks(result), expected);
-  EXPECT_EQ(result.steps.back().high_water_mark, 30);
+  // Note that the suboptimal straightforward ordering in this case is {0, 1,
+  // 2, 3, 4, 5, 6, 7} which has a high_water_mark of 33
+  EXPECT_EQ(result.steps.back().high_water_mark, 29);
 }
 
 // This is the example from Figure 1 of Kayaaslan et al. 2018

From ed2395e665900963bec2ded5defb680ab1004438 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Mon, 15 Sep 2025 11:06:23 -0400
Subject: [PATCH 51/71] Address some reviewer comments

---
 csrc/fusion_segmenter.cpp | 114 ++++++++++++++++++++------------------
 1 file changed, 59 insertions(+), 55 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index d7dbe286b20..ca411a6fae7 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -2047,65 +2047,69 @@ class SegmentedGroupTaskGraphConverter {
     all_tasks_.emplace_back(inputs, outputs, temp_space);
   }
 
-  TaskGraph::DataId maybeRegisterTv(TensorView* tv) {
-    auto it = tv2dataid_.find(tv);
-    if (it == tv2dataid_.end()) {
-      // Register this TV
-      TaskGraph::DataId new_id = (TaskGraph::DataId)all_data_.size();
-      tv2dataid_[tv] = new_id;
-
-      // Assume all tensors the same shape if no runtime_info is given
-      int64_t numel = 1;
-      if (tv->isCpuScalar()) {
-        // runtime_info_ will not include sizes of GPU scalars and sine they do
-        // not result in any GPU allocation we count them as empty.
-        numel = 0;
-      } else if (runtime_info_ != nullptr) {
-        // Get the actual size of the tensor allocation
-        if (tv->isFusionInput() && !isSharded(tv)) {
-          const std::vector<int64_t>& sizes =
-              runtime_info_->getInputAllocationSizes(tv);
-          const std::vector<int64_t>& strides =
-              runtime_info_->getInputAllocationStrides(tv);
-
-          numel = 1;
-          for (auto [size, stride] : zip(sizes, strides)) {
-            if (size == 0) {
-              // Check for empty tensors
-              numel = 0;
-              break;
-            }
-            numel += (size - 1) * stride;
-          }
-        } else {
-          // Use ExpressionEvaluator for computed tensors assuming they are
-          // contiguous
-          for (IterDomain* id : tv->getMaybeAllocationDomain()) {
-            if (id->isBroadcast() || id->isReduction() || id->isDeviceDim()) {
-              continue;
-            }
-            PolymorphicValue pv =
-                runtime_info_->expressionEvaluator().evaluate(id->extent());
-            // If we can't determine the size of this dimension, just assume
-            // it's 2. This way we will give precedence to tensors with
-            // allocation domains that have more concrete IDs.
-            int64_t dim_size = pv.is<int64_t>() ? pv.as<int64_t>() : 2;
-            numel *= dim_size;
-          }
+  int64_t getNumElements(TensorView* tv) {
+    // Assume all tensors the same shape if no runtime_info is given
+    int64_t numel = 1;
+    if (tv->isCpuScalar()) {
+      // runtime_info_ will not include sizes of GPU scalars and sine they do
+      // not result in any GPU allocation we count them as empty.
+      return 0;
+    } else if (
+        runtime_info_ == nullptr || !tv->isFusionInput() || isShareded(tv)) {
+      // Use ExpressionEvaluator for computed tensors assuming they are
+      // contiguous
+      for (IterDomain* id : tv->getMaybeAllocationDomain()) {
+        if (id->isBroadcast() || id->isReduction() || id->isDeviceDim()) {
+          continue;
         }
+        PolymorphicValue pv =
+            runtime_info_->expressionEvaluator().evaluate(id->extent());
+        // If we can't determine the size of this dimension, just assume
+        // it's 2. This way we will give precedence to tensors with
+        // allocation domains that have more concrete IDs.
+        int64_t dim_size = pv.is<int64_t>() ? pv.as<int64_t>() : 2;
+        numel *= dim_size;
       }
-      TaskGraph::Size size = numel * dataTypeSizeByte(tv->dtype());
-
-      all_data_.emplace_back(
-          /*definition=*/std::nullopt,
-          /*uses=*/std::vector<TaskGraph::TaskId>{},
-          /*aliases_input=*/std::nullopt,
-          size,
-          /*can_free=*/true);
-      return new_id;
-    } else {
+    }
+    // Get the actual size of the tensor allocation
+    const std::vector<int64_t>& sizes =
+        runtime_info_->getInputAllocationSizes(tv);
+    const std::vector<int64_t>& strides =
+        runtime_info_->getInputAllocationStrides(tv);
+    NVF_ERROR_EQ(sizes.size(), strides.size());
+
+    numel = 1;
+    for (auto [size, stride] : zip(sizes, strides)) {
+      if (size == 0) {
+        // Check for empty tensors
+        numel = 0;
+        break;
+      }
+      numel += (size - 1) * stride;
+    }
+    return numel;
+  }
+
+  TaskGraph::DataId maybeRegisterTv(TensorView* tv) {
+    auto it = tv2dataid_.find(tv);
+    if (it != tv2dataid_.end()) {
+      // tv is already registered
       return it->second;
     }
+
+    // Register this TV
+    auto new_id = static_cast<TaskGraph::DataId>(std::ssize_t(all_data_));
+    tv2dataid_[tv] = new_id;
+
+    TaskGraph::Size size = getNumElements(tv) * dataTypeSizeByte(tv->dtype());
+
+    all_data_.emplace_back(
+        /*definition=*/std::nullopt,
+        /*uses=*/std::vector<TaskGraph::TaskId>{},
+        /*aliases_input=*/std::nullopt,
+        size,
+        /*can_free=*/true);
+    return new_id;
   }
 
  private:

From 3546fa337c2c793a8650e625db2f004fc235d4a7 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 17 Sep 2025 09:53:10 -0400
Subject: [PATCH 52/71] Fix typos, use ExpressionEvaluator to compute numel

---
 csrc/fusion_segmenter.cpp | 58 ++++++++++++++-------------------------
 1 file changed, 21 insertions(+), 37 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index ca411a6fae7..d6656526336 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -2047,45 +2047,28 @@ class SegmentedGroupTaskGraphConverter {
     all_tasks_.emplace_back(inputs, outputs, temp_space);
   }
 
-  int64_t getNumElements(TensorView* tv) {
-    // Assume all tensors the same shape if no runtime_info is given
-    int64_t numel = 1;
+  int64_t getNumAllocatedElements(TensorView* tv) {
     if (tv->isCpuScalar()) {
-      // runtime_info_ will not include sizes of GPU scalars and sine they do
-      // not result in any GPU allocation we count them as empty.
+      // Since CPU scalars do not result in any GPU allocation we count them as
+      // empty.
       return 0;
-    } else if (
-        runtime_info_ == nullptr || !tv->isFusionInput() || isShareded(tv)) {
-      // Use ExpressionEvaluator for computed tensors assuming they are
-      // contiguous
-      for (IterDomain* id : tv->getMaybeAllocationDomain()) {
-        if (id->isBroadcast() || id->isReduction() || id->isDeviceDim()) {
-          continue;
-        }
-        PolymorphicValue pv =
-            runtime_info_->expressionEvaluator().evaluate(id->extent());
-        // If we can't determine the size of this dimension, just assume
-        // it's 2. This way we will give precedence to tensors with
-        // allocation domains that have more concrete IDs.
-        int64_t dim_size = pv.is<int64_t>() ? pv.as<int64_t>() : 2;
-        numel *= dim_size;
-      }
     }
-    // Get the actual size of the tensor allocation
-    const std::vector<int64_t>& sizes =
-        runtime_info_->getInputAllocationSizes(tv);
-    const std::vector<int64_t>& strides =
-        runtime_info_->getInputAllocationStrides(tv);
-    NVF_ERROR_EQ(sizes.size(), strides.size());
-
-    numel = 1;
-    for (auto [size, stride] : zip(sizes, strides)) {
-      if (size == 0) {
-        // Check for empty tensors
-        numel = 0;
-        break;
+    int64_t numel = 1;
+    // Use ExpressionEvaluator for computed tensors assuming they are
+    // contiguous
+    for (IterDomain* id : tv->getMaybeAllocationDomain()) {
+      if (id->isBroadcast() || id->isReduction() || id->isDeviceDim()) {
+        continue;
+      }
+      PolymorphicValue pv = std::monostate{};
+      if (runtime_info_ != nullptr) {
+        pv = runtime_info_->expressionEvaluator().evaluate(id->extent());
       }
-      numel += (size - 1) * stride;
+      // If we can't determine the size of this dimension, just assume
+      // it's 2. This way we will give precedence to tensors with
+      // allocation domains that have more concrete IDs.
+      int64_t dim_size = pv.is<int64_t>() ? pv.as<int64_t>() : 2;
+      numel *= dim_size;
     }
     return numel;
   }
@@ -2098,10 +2081,11 @@ class SegmentedGroupTaskGraphConverter {
     }
 
     // Register this TV
-    auto new_id = static_cast<TaskGraph::DataId>(std::ssize_t(all_data_));
+    auto new_id = static_cast<TaskGraph::DataId>(std::ssize(all_data_));
     tv2dataid_[tv] = new_id;
 
-    TaskGraph::Size size = getNumElements(tv) * dataTypeSizeByte(tv->dtype());
+    TaskGraph::Size size =
+        getNumAllocatedElements(tv) * dataTypeSizeByte(tv->dtype());
 
     all_data_.emplace_back(
         /*definition=*/std::nullopt,

From 239a80d65c02478f0063e2ebca4e2895985294b7 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 17 Sep 2025 10:59:39 -0400
Subject: [PATCH 53/71] Add comment about assumptions in inferData in tests

---
 tests/cpp/test_task_graph.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/cpp/test_task_graph.cpp b/tests/cpp/test_task_graph.cpp
index 844a78a0b1a..a9fc169bbd3 100644
--- a/tests/cpp/test_task_graph.cpp
+++ b/tests/cpp/test_task_graph.cpp
@@ -40,6 +40,12 @@ std::vector<TaskGraph::Data> inferData(const Tasks& tasks) {
   }
 
   // Detect inputs and outputs and ensure they are not freed
+  //
+  // Note that any Data without a definition is treated as an input.
+  // Additionally, only Data that has no uses is considered an output. In the
+  // general case we could have outputs that are used in other Tasks, but these
+  // will need to be handled manually, since the intention of this tool is to
+  // enable common graph patterns to be built quickly.
   for (TaskGraph::Data& data : all_data) {
     data.size = 1;
     data.can_free = data.definition.has_value() && !data.uses.empty();

From 51ee59491e501925573a84c8fb90597e5af50ce2 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 17 Sep 2025 11:00:07 -0400
Subject: [PATCH 54/71] Fill uses and definitions, and validate

---
 csrc/graph/task_graph.cpp | 50 +++++++++++++++++++++++++++++++++++++++
 csrc/graph/task_graph.h   |  9 ++++++-
 2 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index 2f53e4ce1f8..37817352f71 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -32,6 +32,23 @@ TaskGraph::TaskGraph(
       data.size() <= std::numeric_limits<TaskGraph::DataId>::max(),
       "There are too many data objects to represent with TaskGraph::DataId");
 
+  // Fill in Data uses and definitions
+  for (TaskId task_id : arange(numTasks())) {
+    const Task& task = tasks_.at((size_t)task_id);
+    for (DataId output_id : task.outputs) {
+      data_.at((size_t)output_id).definition = task_id;
+    }
+    for (DataId input_id : task.inputs) {
+      Data& input = data_.at(input_id);
+      if (std::find(input.uses.begin(), input.uses.end(), task_id) ==
+          input.uses.end()) {
+        input.uses.push_back(task_id);
+      }
+    }
+  }
+
+  validateGraph();
+
   // Initialize the counts of future uses of data and unmet dependencies of
   // tasks. These are the out-degrees of Data and in-degrees of Tasks,
   // respectively.
@@ -72,6 +89,39 @@ TaskGraph::TaskGraph(
   }
 }
 
+void TaskGraph::validateGraph() const {
+  for (TaskId task_id : arange(numTasks())) {
+    const Task& task = tasks_.at((size_t)task_id);
+    for (DataId output_id : task.outputs) {
+      const Data& output = getData(output_id);
+      NVF_ERROR(
+          output.definition.has_value() &&
+          output.definition.value() == task_id);
+    }
+    for (DataId input_id : task.inputs) {
+      const Data& input = getData(input_id);
+      NVF_ERROR(
+          std::find(input.uses.begin(), input.uses.end(), task_id) !=
+          input.uses.end());
+    }
+  }
+
+  for (const auto& [data_id, data] : enumerate(data_)) {
+    if (data.definition.has_value()) {
+      const Task& def = getTask(data.definition.value());
+      NVF_ERROR(
+          std::find(def.outputs.begin(), def.outputs.end(), data_id) !=
+          def.outputs.end());
+    }
+    for (const TaskId use_id : data.uses) {
+      const Task& use = getTask(use_id);
+      NVF_ERROR(
+          std::find(use.inputs.begin(), use.inputs.end(), data_id) !=
+          use.inputs.end());
+    }
+  }
+}
+
 std::string TaskGraph::toMermaid() const {
   std::stringstream ss;
 
diff --git a/csrc/graph/task_graph.h b/csrc/graph/task_graph.h
index 9323fd55f42..8471e14c0a6 100644
--- a/csrc/graph/task_graph.h
+++ b/csrc/graph/task_graph.h
@@ -71,6 +71,11 @@ class TaskGraph {
     std::string toString() const;
   };
 
+  //! Note that the Tasks provided here must have accurate inputs, outputs, and
+  //! temporary space. The Data must have accurate aliases_input, size, and
+  //! can_free fields. However, uses and definitions can be empty in which case
+  //! they will be filled in automatically. Any pre-existing definitions or uses
+  //! will be checked for consistency.
   TaskGraph(const std::vector<Task>& tasks, const std::vector<Data>& data);
 
   //! This represents the execution of a single Task in a given ordering. It
@@ -109,6 +114,8 @@ class TaskGraph {
     return initial_allocation_;
   }
 
+  void validateGraph() const;
+
   //! Given a list of steps, recompute the active space and high water mark.
   //! This is useful for validating that our backtracking algorithm does not
   //! corrupt this data. Raises an exception if corruption is detected.
@@ -150,7 +157,7 @@ class TaskGraph {
 
  private:
   const std::vector<Task> tasks_;
-  const std::vector<Data> data_;
+  std::vector<Data> data_;
 
   //! How much data is allocated by data that has no definition, i.e. input data
   Size initial_allocation_ = 0;

From 2de536d76fd45443ddc41de971b2ea2c35882cc5 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 17 Sep 2025 11:03:33 -0400
Subject: [PATCH 55/71] Skip manually setting definition and uses in
 fusion_segmenter.cpp

---
 csrc/fusion_segmenter.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index d6656526336..9f655ba0836 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -1994,8 +1994,6 @@ class SegmentedGroupTaskGraphConverter {
       : runtime_info_(runtime_info) {}
 
   void processGroup(SegmentedGroup* group) {
-    TaskGraph::TaskId task_id = (TaskGraph::TaskId)all_tasks_.size();
-
     // When there are aliased inputs, they will appear as _outputs_ of the
     // SegmentedGroup. To avoid actually adding those as outputs, we record them
     // here first
@@ -2016,7 +2014,6 @@ class SegmentedGroupTaskGraphConverter {
         // Ignore scalar inputs
         TaskGraph::DataId data_id = maybeRegisterTv(tv);
         TaskGraph::Data& data = all_data_.at((size_t)data_id);
-        data.uses.push_back(task_id);
         data.can_free = !tv->isFusionInput();
         inputs.push_back(data_id);
       }
@@ -2031,7 +2028,6 @@ class SegmentedGroupTaskGraphConverter {
         }
         TaskGraph::DataId data_id = maybeRegisterTv(tv);
         TaskGraph::Data& data = all_data_.at((size_t)data_id);
-        data.definition = task_id;
         if (auto* aliased_input_tv = dynamic_cast<TensorView*>(
                 tv->fusion()->getOutputAlias(tv).aliased_io)) {
           data.aliases_input = maybeRegisterTv(aliased_input_tv);

From 3a0f9e278c700e8d64efc4cb7318925b26093c67 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 17 Sep 2025 11:06:58 -0400
Subject: [PATCH 56/71] Remove early exit for unsegmented fusions

---
 csrc/fusion_segmenter.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index 9f655ba0836..d8ebac5ee2a 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -2103,11 +2103,6 @@ std::vector<SegmentedGroup*> optimalTopoSort(
     const std::vector<SegmentedGroup*>& groups,
     SchedulerRuntimeInfo* runtime_info) {
   FUSER_PERF_SCOPE("optimalTopoSort");
-  if (groups.size() == 1) {
-    // Skip setting up the graph and doing the whole analysis when there's just
-    // a single group
-    return {groups.front()};
-  }
 
   TaskGraph graph =
       SegmentedGroupTaskGraphConverter::convert(groups, runtime_info);

From f313e1ed0010b83bc8950e5d2e8cf3b136782413 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 17 Sep 2025 14:01:23 -0400
Subject: [PATCH 57/71] Simplify alias check

---
 csrc/fusion_segmenter.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index d8ebac5ee2a..bdd4a6df1dc 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -1999,11 +1999,9 @@ class SegmentedGroupTaskGraphConverter {
     // here first
     std::unordered_set<TensorView*> aliased_input_tvs;
     for (Val* v : group->outputs()) {
-      if (auto* tv = dynamic_cast<TensorView*>(v)) {
-        if (auto* aliased_input_tv = dynamic_cast<TensorView*>(
-                tv->fusion()->getOutputAlias(tv).aliased_io)) {
-          aliased_input_tvs.insert(aliased_input_tv);
-        }
+      if (auto* aliased_input_tv = dynamic_cast<TensorView*>(
+              v->fusion()->getOutputAlias(v).aliased_io)) {
+        aliased_input_tvs.insert(aliased_input_tv);
       }
     }
 

From fb36a15040670dd4f86591807d9b029fea574a48 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 17 Sep 2025 14:32:34 -0400
Subject: [PATCH 58/71] aliases_input optional<DataId> -> DataId

---
 csrc/fusion_segmenter.cpp     |  2 +-
 csrc/graph/task_graph.cpp     | 30 ++++++++++++++----------------
 csrc/graph/task_graph.h       | 15 +++++++++------
 tests/cpp/test_task_graph.cpp |  4 ++--
 4 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index bdd4a6df1dc..e02a51e1b54 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -2084,7 +2084,7 @@ class SegmentedGroupTaskGraphConverter {
     all_data_.emplace_back(
         /*definition=*/std::nullopt,
         /*uses=*/std::vector<TaskGraph::TaskId>{},
-        /*aliases_input=*/std::nullopt,
+        /*aliases_input=*/-1,
         size,
         /*can_free=*/true);
     return new_id;
diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index 37817352f71..ad52d1e7e48 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -79,8 +79,8 @@ TaskGraph::TaskGraph(
       DataId d = data.definition.value();
       NVF_ERROR(d >= 0 && (size_t)d < tasks_.size());
     }
-    if (data.aliases_input.has_value()) {
-      DataId a = data.aliases_input.value();
+    if (data.aliases_input != -1) {
+      DataId a = data.aliases_input;
       NVF_ERROR(a >= 0 && (size_t)a < data_.size());
     }
     for (TaskId use : data.uses) {
@@ -149,8 +149,8 @@ std::string TaskGraph::toMermaid() const {
 
   // Declare nodes with shapes and labels
   for (const auto& [data_id, data] : enumerate(data_)) {
-    if (data.aliases_input.has_value()) {
-      is_aliased_input.at(data.aliases_input.value()) = true;
+    if (data.aliases_input != -1) {
+      is_aliased_input.at(data.aliases_input) = true;
     }
     ss << "    d" << data_id << "([\"d" << data_id;
     if (print_data_size || data.size == 0) {
@@ -192,9 +192,9 @@ std::string TaskGraph::toMermaid() const {
   ss << "\n";
   for (const auto& [data_id, data] : enumerate(data_)) {
     // Create edges for aliases
-    if (data.aliases_input.has_value()) {
+    if (data.aliases_input != -1) {
       ss << "    d" << data_id << " alias" << data_id << "@--> d"
-         << data.aliases_input.value() << ";\n";
+         << data.aliases_input << ";\n";
       ss << "    class alias" << data_id << " aliasEdge;\n";
     }
 
@@ -239,11 +239,11 @@ void TaskGraph::validateSteps(const std::vector<Step>& steps) const {
     // Allocate outputs
     for (const DataId output_id : task.outputs) {
       const Data& output = getData(output_id);
-      if (output.aliases_input.has_value()) {
+      if (output.aliases_input != -1) {
         // Check that the aliased input has no further uses
         // Note that we will decrement this use count later in this function
         NVF_ERROR(
-            future_uses.at((size_t)output.aliases_input.value()) == 1,
+            future_uses.at((size_t)output.aliases_input) == 1,
             "Tried to execute segment that would overwrite input alias before "
             "some of its uses");
       } else {
@@ -301,10 +301,10 @@ TaskGraph TaskGraph::convertAliasesToDependencies() const {
     Task& task = tasks.at(task_id);
     for (DataId output_id : task.outputs) {
       Data& output = data.at((size_t)output_id);
-      if (output.aliases_input.has_value()) {
-        DataId& alias_id = output.aliases_input.value();
+      if (output.aliases_input != -1) {
+        DataId alias_id = output.aliases_input;
         // Reset the aliases_input flag before modifying the data vector
-        output.aliases_input = std::nullopt;
+        output.aliases_input = -1;
         Data& alias = data.at((size_t)alias_id);
         NVF_ERROR_EQ(
             output.size,
@@ -330,7 +330,7 @@ TaskGraph TaskGraph::convertAliasesToDependencies() const {
           data.emplace_back(
               /*definition=*/std::optional<TaskId>{use_id},
               /*uses=*/std::vector<TaskId>{task_id},
-              /*aliases_input=*/std::nullopt,
+              /*aliases_input=*/-1,
               /*size=*/0,
               /*can_free=*/true);
 
@@ -371,7 +371,7 @@ class TaskSorter {
     if (debug_) {
       const bool has_aliasing = std::ranges::any_of(
           arange(orig_graph_.numData()), [&](TaskGraph::DataId data_id) {
-            return orig_graph_.getData(data_id).aliases_input.has_value();
+            return orig_graph_.getData(data_id).aliases_input != -1;
           });
       if (has_aliasing) {
         debug() << "Aliasing detected in task graph. Original graph:\n";
@@ -643,9 +643,7 @@ std::string TaskGraph::Data::toString() const {
      << (definition.has_value() ? std::to_string(definition.value()) : "none");
   ss << ", uses={" << uses << "}";
   ss << ", size=" << size;
-  ss << ", aliases_input="
-     << (aliases_input.has_value() ? std::to_string(aliases_input.value())
-                                   : "none");
+  ss << ", aliases_input=" << aliases_input;
   ss << ", can_free=" << (can_free ? "yes" : "no");
   ss << "}";
   return ss.str();
diff --git a/csrc/graph/task_graph.h b/csrc/graph/task_graph.h
index 8471e14c0a6..309dc55cc2b 100644
--- a/csrc/graph/task_graph.h
+++ b/csrc/graph/task_graph.h
@@ -55,12 +55,15 @@ class TaskGraph {
   struct Data {
     std::optional<TaskId> definition;
     std::vector<TaskId> uses;
-    //! If set, this means we do not allocate a new output when executing this
-    //! Data's definition, instead we re-use the space from the specified input.
-    //! Note that this implies an ordering constraint which we will check, since
-    //! the definition must be the last use of the aliased input.
-    std::optional<DataId> aliases_input;
-    Size size;
+
+    //! If set to something other than -1, this is the DataId of an input
+    //! whose space is re-used for this Data: no new output is allocated when
+    //! executing this Data's definition. Note that this implies an ordering
+    //! constraint which we will check, since the definition must be the last
+    //! use of the aliased input.
+    DataId aliases_input = -1;
+
+    Size size = 1;
 
     //! This indicates whether we are able to free this data after its last use.
     //! For a segmented fusion, unsegmented fusion inputs and outputs cannot be
diff --git a/tests/cpp/test_task_graph.cpp b/tests/cpp/test_task_graph.cpp
index a9fc169bbd3..2a6772d7899 100644
--- a/tests/cpp/test_task_graph.cpp
+++ b/tests/cpp/test_task_graph.cpp
@@ -123,7 +123,7 @@ TEST_F(TaskGraphTest, SharedIntermediateWithAlias) {
   std::vector<TaskGraph::Data> data = inferData(tasks);
 
   {
-    data.at(2).aliases_input = std::nullopt;
+    data.at(2).aliases_input = -1;
     data.at(3).aliases_input = 0;
     auto graph = TaskGraph(tasks, data);
     const TaskGraph::SortResult result = graph.findOptimalOrder();
@@ -137,7 +137,7 @@ TEST_F(TaskGraphTest, SharedIntermediateWithAlias) {
 
   { // When 2 aliases the input instead, we should switch the order
     data.at(2).aliases_input = 0;
-    data.at(3).aliases_input = std::nullopt;
+    data.at(3).aliases_input = -1;
     auto graph = TaskGraph(tasks, data);
     const TaskGraph::SortResult result = graph.findOptimalOrder();
 

From 5cd1e66fd0807cd34ccb88c84cbaf9db1f57f688 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 17 Sep 2025 14:53:44 -0400
Subject: [PATCH 59/71] Remove unused include

---
 csrc/fusion_segmenter.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index e02a51e1b54..6c9a90e8db9 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -6,7 +6,6 @@
  */
 // clang-format on
 #include <algorithm>
-#include <limits>
 #include <sstream>
 
 #include <debug.h>

From f1260a149ef351384eca426ce5d6d57c47467219 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 17 Sep 2025 14:54:07 -0400
Subject: [PATCH 60/71] Print result of findOptimalOrder in debug dump

---
 csrc/graph/task_graph.cpp | 20 +++++++++++++++++++-
 csrc/graph/task_graph.h   |  2 +-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index ad52d1e7e48..76ddc5b5bd6 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -534,11 +534,11 @@ class TaskSorter {
     Clock::time_point start = Clock::now();
 
     for (int64_t iter : arange(10000000)) {
+      result_.iterations = iter;
       if (iter % 64 == 0) {
         Clock::time_point end = Clock::now();
         if (std::chrono::duration_cast<std::chrono::microseconds>(end - start)
                 .count() > max_time_us_) {
-          result_.iterations = iter;
           break;
         }
       }
@@ -596,6 +596,24 @@ class TaskSorter {
     // Record our best found steps
     result_.steps = best_steps;
 
+    if (isDebugDumpEnabled(DebugDumpOption::TaskGraph)) {
+      Clock::time_point stop = Clock::now();
+      debug() << "Found these steps in "
+              << (std::chrono::duration_cast<std::chrono::milliseconds>(
+                      stop - start)
+                      .count())
+              << " ms:\n";
+      for (const TaskGraph::Step& step : result_.steps) {
+        debug() << "  " << step << "\n";
+      }
+      debug() << "The search contained " << result_.iterations
+              << " iterations and was ";
+      if (!result_.exhaustive) {
+        debug() << "NOT ";
+      }
+      debug() << "exhaustive" << std::endl;
+    }
+
     // Validate final result
     NVF_ERROR(result_.steps.size() == (size_t)graph_.numTasks());
     validate();
diff --git a/csrc/graph/task_graph.h b/csrc/graph/task_graph.h
index 309dc55cc2b..790bb738526 100644
--- a/csrc/graph/task_graph.h
+++ b/csrc/graph/task_graph.h
@@ -128,7 +128,7 @@ class TaskGraph {
     std::vector<Step> steps;
 
     //! Number of iterations computed
-    int64_t iterations;
+    int64_t iterations = 0;
 
     //! Whether the search was exhaustive. If not, then it was likely cut off
     //! early because of an iteration limit.

From 60aa579ec1c3e30c879b0c93a7c9cdf5d3e3c17f Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 19 Sep 2025 11:03:09 -0400
Subject: [PATCH 61/71] Fix AliasTest.TrivialInputForwarding

---
 csrc/fusion_segmenter.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index 6c9a90e8db9..593b8826ab9 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -2018,9 +2018,12 @@ class SegmentedGroupTaskGraphConverter {
     std::vector<TaskGraph::DataId> outputs;
     for (Val* v : group->outputs()) {
       if (auto* tv = dynamic_cast<TensorView*>(v)) {
-        if (aliased_input_tvs.count(tv)) {
+        if (aliased_input_tvs.count(tv) || tv->isFusionInput()) {
           // These are counted as outputs but are actually _inputs_ to this
           // group
+          // Note that fusion inputs forwarded unchanged to the outputs are
+          // also skipped, so no alias link is set in the graph for them.
+          // See AliasTest.TrivialInputForwarding for an example of this.
           continue;
         }
         TaskGraph::DataId data_id = maybeRegisterTv(tv);

From d08bd533610a506b6bf7d50096110b97f620f590 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Fri, 19 Sep 2025 11:12:33 -0400
Subject: [PATCH 62/71] Handle cases where tv->dtype() is Index

---
 csrc/fusion_segmenter.cpp | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index 593b8826ab9..f5a755b4cd3 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -1985,6 +1985,8 @@ class SegmentedGroupTaskGraphConverter {
     for (SegmentedGroup* group : groups) {
       conv.processGroup(group);
     }
+    std::cout << conv.all_tasks_ << std::endl;
+    std::cout << conv.all_data_ << std::endl;
     return TaskGraph(conv.all_tasks_, conv.all_data_);
   }
 
@@ -2080,8 +2082,16 @@ class SegmentedGroupTaskGraphConverter {
     auto new_id = static_cast<TaskGraph::DataId>(std::ssize(all_data_));
     tv2dataid_[tv] = new_id;
 
+    // If the TV is of type Index, we don't know whether it will be 8 bytes
+    // or 4 bytes until runtime info is available.
+    DataType dtype = tv->dtype();
+    if (dtype == DataType::Index) {
+      // If we don't have runtime info, assume it is 64-bit
+      dtype = runtime_info_ != nullptr ? runtime_info_->getIndexType()
+                                       : DataType::Int;
+    }
     TaskGraph::Size size =
-        getNumAllocatedElements(tv) * dataTypeSizeByte(tv->dtype());
+        getNumAllocatedElements(tv) * dataTypeSizeByte(dtype);
 
     all_data_.emplace_back(
         /*definition=*/std::nullopt,

From ab3857565b962d90997a09aedd91b0008abb9721 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <1454944+jacobhinkle@users.noreply.github.com>
Date: Tue, 23 Sep 2025 14:02:48 -0400
Subject: [PATCH 63/71] Update csrc/graph/task_graph.cpp

Co-authored-by: Jingyue Wu <wujingyue@gmail.com>
---
 csrc/graph/task_graph.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index 76ddc5b5bd6..1952a487101 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -707,7 +707,7 @@ TaskGraph::SortResult TaskGraph::findOptimalOrder(bool validate) const {
       *this,
       validate,
       /*max_time_us=*/100000,
-      /*debug=*/isDebugDumpEnabled(DebugDumpOption::TaskGraph));
+      isDebugDumpEnabled(DebugDumpOption::TaskGraph));
   return sorter.result();
 }
 

From a444097a02ccac33c7c6e6694873b037f60355e4 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <1454944+jacobhinkle@users.noreply.github.com>
Date: Wed, 24 Sep 2025 06:58:24 -0400
Subject: [PATCH 64/71] Update csrc/fusion_segmenter.cpp

Co-authored-by: Jingyue Wu <wujingyue@gmail.com>
---
 csrc/fusion_segmenter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index f5a755b4cd3..e644f27291e 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -2012,7 +2012,7 @@ class SegmentedGroupTaskGraphConverter {
       if (auto* tv = dynamic_cast<TensorView*>(v)) {
         // Ignore scalar inputs
         TaskGraph::DataId data_id = maybeRegisterTv(tv);
-        TaskGraph::Data& data = all_data_.at((size_t)data_id);
+        TaskGraph::Data& data = all_data_.at(data_id);
         data.can_free = !tv->isFusionInput();
         inputs.push_back(data_id);
       }

From fa63660957b87f6a7c31cba8cd10fc8449b25b13 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <1454944+jacobhinkle@users.noreply.github.com>
Date: Wed, 24 Sep 2025 06:58:40 -0400
Subject: [PATCH 65/71] Update csrc/graph/task_graph.cpp

Co-authored-by: Jingyue Wu <wujingyue@gmail.com>
---
 csrc/graph/task_graph.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index 1952a487101..28e35d22661 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -591,7 +591,7 @@ class TaskSorter {
       if (steps_.size() == (size_t)graph_.numTasks() && hwm < best_hwm) {
         best_steps = steps_;
       }
-    }
+    } // for iter
 
     // Record our best found steps
     result_.steps = best_steps;

From c7cec9b793947d23a00de3a6af9418ca3fe2c7a6 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 1 Oct 2025 10:46:53 -0400
Subject: [PATCH 66/71] Use std::lower_bound

---
 csrc/graph/task_graph.cpp | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index 28e35d22661..db5e10a34d2 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -547,13 +547,9 @@ class TaskSorter {
           !ready_tasks_.empty() || steps_.size() == (size_t)graph_.numTasks(),
           "Ran out of ready tasks before completing ordering");
 
-      TaskGraph::TaskId next_task_id = -1;
-      for (const TaskGraph::TaskId ready_id : ready_tasks_) {
-        if (ready_id > backtracked_task_id) {
-          next_task_id = ready_id;
-          break;
-        }
-      }
+      const auto it = std::lower_bound(
+          ready_tasks_.begin(), ready_tasks_.end(), backtracked_task_id + 1);
+      TaskGraph::TaskId next_task_id = it == ready_tasks_.end() ? -1 : *it;
 
       // Reset backtracked_task_id
       backtracked_task_id = -1;

From 2ae3a3139d323a49b2846c31f81e22ef2150b75f Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <1454944+jacobhinkle@users.noreply.github.com>
Date: Wed, 1 Oct 2025 10:52:37 -0400
Subject: [PATCH 67/71] Update csrc/graph/task_graph.cpp

Co-authored-by: Jingyue Wu <wujingyue@gmail.com>
---
 csrc/graph/task_graph.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index db5e10a34d2..78ef8e1ff55 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -632,7 +632,7 @@ class TaskSorter {
 
   //! There is one entry here for each Data and indicating how many uses there
   //! are remaining. When it reaches zero, the Data can be freed if allowed.
-  std::vector<TaskGraph::TaskId> future_uses_;
+  std::vector<int64_t> future_uses_;
 
   //! This holds all candidates for the next step, sorted by ID
   std::set<TaskGraph::TaskId> ready_tasks_;

From d80f63b676da069ed66379a2d1420981d4d3f5fc Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Wed, 1 Oct 2025 14:02:57 -0400
Subject: [PATCH 68/71] Remove iteration limit

---
 csrc/graph/task_graph.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index 78ef8e1ff55..258d18e107d 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -533,9 +533,10 @@ class TaskSorter {
     using Clock = std::chrono::high_resolution_clock;
     Clock::time_point start = Clock::now();
 
-    for (int64_t iter : arange(10000000)) {
-      result_.iterations = iter;
-      if (iter % 64 == 0) {
+    result_.iterations = 0;
+    while (true) {
+      result_.iterations++;
+      if (result_.iterations % 64 == 0) {
         Clock::time_point end = Clock::now();
         if (std::chrono::duration_cast<std::chrono::microseconds>(end - start)
                 .count() > max_time_us_) {
@@ -587,7 +588,7 @@ class TaskSorter {
       if (steps_.size() == (size_t)graph_.numTasks() && hwm < best_hwm) {
         best_steps = steps_;
       }
-    } // for iter
+    } // while
 
     // Record our best found steps
     result_.steps = best_steps;

From 064d9393fb8dc5a8d2fc4d7888a04bfa2652cab9 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <jhinkle@nvidia.com>
Date: Tue, 21 Oct 2025 11:58:55 -0400
Subject: [PATCH 69/71] Remove debug prints

---
 csrc/fusion_segmenter.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index 7185a13f76d..4e0b6b68722 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -2014,8 +2014,6 @@ class SegmentedGroupTaskGraphConverter {
     for (SegmentedGroup* group : groups) {
       conv.processGroup(group);
     }
-    std::cout << conv.all_tasks_ << std::endl;
-    std::cout << conv.all_data_ << std::endl;
     return TaskGraph(conv.all_tasks_, conv.all_data_);
   }
 

From 6011cac0319fd1cae3295b49c6441b68a5219eb8 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <1454944+jacobhinkle@users.noreply.github.com>
Date: Tue, 28 Oct 2025 19:30:25 -0400
Subject: [PATCH 70/71] Update csrc/graph/task_graph.cpp

Co-authored-by: Jingyue Wu <wujingyue@gmail.com>
---
 csrc/graph/task_graph.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index 258d18e107d..5d87dc08233 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -545,7 +545,7 @@ class TaskSorter {
       }
 
       NVF_ERROR(
-          !ready_tasks_.empty() || steps_.size() == (size_t)graph_.numTasks(),
+          !ready_tasks_.empty() || std::ssize(steps_) == graph_.numTasks(),
           "Ran out of ready tasks before completing ordering");
 
       const auto it = std::lower_bound(

From e92731e1caae29537ad14d73136c1afa26a030c9 Mon Sep 17 00:00:00 2001
From: Jacob Hinkle <1454944+jacobhinkle@users.noreply.github.com>
Date: Tue, 28 Oct 2025 19:31:04 -0400
Subject: [PATCH 71/71] Update csrc/graph/task_graph.cpp

Co-authored-by: Jingyue Wu <wujingyue@gmail.com>
---
 csrc/graph/task_graph.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/graph/task_graph.cpp b/csrc/graph/task_graph.cpp
index 5d87dc08233..9ece0deffb3 100644
--- a/csrc/graph/task_graph.cpp
+++ b/csrc/graph/task_graph.cpp
@@ -585,7 +585,7 @@ class TaskSorter {
 
       // Our usage is at or below best. Have we completed an ordering? If
       // so, update best_steps
-      if (steps_.size() == (size_t)graph_.numTasks() && hwm < best_hwm) {
+      if (std::ssize(steps_) == graph_.numTasks() && hwm < best_hwm) {
         best_steps = steps_;
       }
     } // while