-
Notifications
You must be signed in to change notification settings - Fork 80
Search for optimal segment execution order #4973
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
8e294ba
7b279c2
6012846
da53605
daeb547
8076f0a
b29ad5e
6488c24
e7cf966
589653f
9e7c2ba
647fdb5
1d9b750
29b684c
05b42c5
082d379
5de4683
1ec54f0
3998406
be8332a
91e942f
253a750
a53650b
cb77db5
ce7335d
f515cd4
5892206
4e9ba4d
cc753c9
760b73b
bcad2a3
51c8c6c
2e85c6f
ef65076
0f4126c
620cf3a
290b652
ed4585f
cdf8598
d5951f3
a67c6e2
6480530
42fcc64
41aa154
9493cf0
020c333
5aafe11
b23274d
95dcf42
3c5f091
a52875f
589b997
5e9e899
55bbd7a
532ddcc
549124e
595ce79
d95e0bb
7ad6e97
9dc649f
957dc67
ed2395e
965436b
3546fa3
239a80d
51ee594
2de536d
3a0f9e2
c270fec
f313e1e
fb36a15
5cd1e66
f1260a1
337921a
60aa579
d08bd53
ab38575
a444097
fa63660
efbe8ba
8e4f508
c7cec9b
2ae3a31
d80f63b
64d1fa9
064d939
6c1a6f7
6011cac
e92731e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,7 +14,9 @@ | |
| #include <debug.h> | ||
| #include <device_lower/utils.h> | ||
| #include <disjoint_set.h> | ||
| #include <exceptions.h> | ||
| #include <fusion.h> | ||
| #include <graph/task_graph.h> | ||
| #include <instrumentation.h> | ||
| #include <ir/all_nodes.h> | ||
| #include <ir/cloner.h> | ||
|
|
@@ -28,6 +30,7 @@ | |
| #include <options.h> | ||
| #include <scheduler/debug_utils.h> | ||
| #include <scheduler/normalization_utils.h> | ||
| #include <scheduler/runtime_info.h> | ||
| #include <transform_iter.h> | ||
| #include <transform_replay.h> | ||
|
|
||
|
|
@@ -2001,8 +2004,159 @@ bool SegmentCandidateFinder::hasSegmentHints(Fusion* fusion) { | |
| } | ||
|
|
||
| namespace { | ||
|
|
||
| class SegmentedGroupTaskGraphConverter { | ||
| public: | ||
| static TaskGraph convert( | ||
| const std::vector<SegmentedGroup*>& groups, | ||
| SchedulerRuntimeInfo* runtime_info) { | ||
| SegmentedGroupTaskGraphConverter conv(runtime_info); | ||
| for (SegmentedGroup* group : groups) { | ||
| conv.processGroup(group); | ||
| } | ||
| return TaskGraph(conv.all_tasks_, conv.all_data_); | ||
| } | ||
|
|
||
| private: | ||
| SegmentedGroupTaskGraphConverter(SchedulerRuntimeInfo* runtime_info) | ||
| : runtime_info_(runtime_info) {} | ||
|
|
||
| void processGroup(SegmentedGroup* group) { | ||
| // When there are aliased inputs, they will appear as _outputs_ of the | ||
| // SegmentedGroup. To avoid actually adding those as outputs, we record them | ||
| // here first | ||
| std::unordered_set<TensorView*> aliased_input_tvs; | ||
| for (Val* v : group->outputs()) { | ||
| if (auto* aliased_input_tv = dynamic_cast<TensorView*>( | ||
| v->fusion()->getOutputAlias(v).aliased_io)) { | ||
| aliased_input_tvs.insert(aliased_input_tv); | ||
| } | ||
| } | ||
|
|
||
| std::vector<TaskGraph::DataId> inputs; | ||
| // These are fusion inputs, so they are not edges between segments | ||
| for (Val* v : group->inputs()) { | ||
| if (auto* tv = dynamic_cast<TensorView*>(v)) { | ||
| // Ignore scalar inputs | ||
| TaskGraph::DataId data_id = maybeRegisterTv(tv); | ||
| TaskGraph::Data& data = all_data_.at(data_id); | ||
| data.can_free = !tv->isFusionInput(); | ||
| inputs.push_back(data_id); | ||
| } | ||
| } | ||
| std::vector<TaskGraph::DataId> outputs; | ||
| for (Val* v : group->outputs()) { | ||
| if (auto* tv = dynamic_cast<TensorView*>(v)) { | ||
| if (aliased_input_tvs.count(tv) || tv->isFusionInput()) { | ||
| // These are counted as outputs but are actually _inputs_ to this | ||
| // group | ||
| // Note that we skip setting alias links in the graph when the input | ||
| // is simply forwarded to the outputs unchanged. | ||
| // See AliasTest.TrivialInputForwarding for an example of this | ||
| continue; | ||
| } | ||
| TaskGraph::DataId data_id = maybeRegisterTv(tv); | ||
| TaskGraph::Data& data = all_data_.at((size_t)data_id); | ||
| if (auto* aliased_input_tv = dynamic_cast<TensorView*>( | ||
| tv->fusion()->getOutputAlias(tv).aliased_io)) { | ||
| data.aliases_input = maybeRegisterTv(aliased_input_tv); | ||
| } | ||
| data.can_free = !tv->isFusionOutput(); | ||
| outputs.push_back(data_id); | ||
| } | ||
| } | ||
|
|
||
| // TODO: inspect compiled segment executors to determine temp gmem needed | ||
| TaskGraph::Size temp_space = 0; | ||
|
Comment on lines
+2069
to
+2070
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We currently prepare the runtime order before compilation of segments. However, if we did it afterward we would have access to the executor telling us how much temp space to use. |
||
|
|
||
| all_tasks_.emplace_back(inputs, outputs, temp_space); | ||
| } | ||
|
|
||
| int64_t getNumAllocatedElements(TensorView* tv) { | ||
| if (tv->isCpuScalar()) { | ||
| // Since CPU scalars do not result in any GPU allocation we count them as | ||
| // empty. | ||
| return 0; | ||
| } | ||
| int64_t numel = 1; | ||
| // Use ExpressionEvaluator for computed tensors assuming they are | ||
| // contiguous | ||
| for (IterDomain* id : tv->getMaybeAllocationDomain()) { | ||
| if (id->isBroadcast() || id->isReduction() || id->isDeviceDim()) { | ||
| continue; | ||
| } | ||
| PolymorphicValue pv = std::monostate{}; | ||
| if (runtime_info_ != nullptr) { | ||
| pv = runtime_info_->expressionEvaluator().evaluate(id->extent()); | ||
| } | ||
| // If we can't determine the size of this dimension, just assume | ||
| // it's 2. This way we will give precedence to tensors with | ||
| // allocation domains that have more concrete IDs. | ||
| int64_t dim_size = pv.is<int64_t>() ? pv.as<int64_t>() : 2; | ||
| numel *= dim_size; | ||
| } | ||
| return numel; | ||
| } | ||
|
|
||
| TaskGraph::DataId maybeRegisterTv(TensorView* tv) { | ||
| auto it = tv2dataid_.find(tv); | ||
| if (it != tv2dataid_.end()) { | ||
| // tv is already registered | ||
| return it->second; | ||
| } | ||
|
|
||
| // Register this TV | ||
| auto new_id = static_cast<TaskGraph::DataId>(std::ssize(all_data_)); | ||
| tv2dataid_[tv] = new_id; | ||
|
|
||
| // If the TV is of type Index, we don't know if it will be 8 bytes or 4 | ||
| // bytes until we are given input | ||
| DataType dtype = tv->dtype(); | ||
| if (dtype == DataType::Index) { | ||
| // If we don't have runtime info, assume it is 64-bit | ||
| dtype = runtime_info_ != nullptr ? runtime_info_->getIndexType() | ||
| : DataType::Int; | ||
| } | ||
| TaskGraph::Size size = | ||
| getNumAllocatedElements(tv) * dataTypeSizeByte(dtype); | ||
|
|
||
| all_data_.emplace_back( | ||
| /*definition=*/std::nullopt, | ||
| /*uses=*/std::vector<TaskGraph::TaskId>{}, | ||
| /*aliases_input=*/-1, | ||
| size, | ||
| /*can_free=*/true); | ||
| return new_id; | ||
| } | ||
|
|
||
| private: | ||
| SchedulerRuntimeInfo* runtime_info_; | ||
| std::vector<TaskGraph::Data> all_data_; | ||
| std::unordered_map<TensorView*, TaskGraph::DataId> tv2dataid_; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Did you find this indirection at all useful? I once created such an indirection thinking it would be faster. It turned out to speed things up very little and made the implementation unnecessarily more complicated.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I am not sure whether it's faster in practice, but I can try it and check. I did originally try and keep things small and simple for perf reasons because I knew we'd need to do some expensive iterative optimizations. There are cases in our tests where exhaustive search runs out of time even if I increase the time limit to 100 seconds. Of course, it's unclear whether we get any benefit from more iterations even in those cases and an algorithmic improvement would probably be preferable. I agree that usually the indirection is slightly annoying. I need to use
||
| std::vector<TaskGraph::Task> all_tasks_; | ||
| }; | ||
|
|
||
| std::vector<SegmentedGroup*> optimalTopoSort( | ||
| const std::vector<SegmentedGroup*>& groups, | ||
| SchedulerRuntimeInfo* runtime_info) { | ||
| FUSER_PERF_SCOPE("optimalTopoSort"); | ||
|
|
||
| TaskGraph graph = | ||
| SegmentedGroupTaskGraphConverter::convert(groups, runtime_info); | ||
|
|
||
| TaskGraph::SortResult result = graph.findOptimalOrder(/*validate=*/false); | ||
|
|
||
| std::vector<SegmentedGroup*> order; | ||
| order.reserve(groups.size()); | ||
| for (const TaskGraph::Step& step : result.steps) { | ||
| order.push_back(groups.at((size_t)step.task)); | ||
| } | ||
| return order; | ||
| } | ||
|
|
||
| std::vector<SegmentedGroup*> toposort( | ||
| const std::vector<SegmentedGroup*>& groups) { | ||
| FUSER_PERF_SCOPE("toposort"); | ||
| std::deque<SegmentedGroup*> to_visit; | ||
| std::unordered_map<SegmentedGroup*, int64_t> num_producer_edges; | ||
| for (SegmentedGroup* group : groups) { | ||
|
|
@@ -5383,7 +5537,10 @@ void SegmentedFusion::annotateFP16IntermediateTensors() { | |
| } | ||
| } | ||
|
|
||
| RuntimeWorkSpace prepareRuntimeOrder(const SegmentedFusion& segmented_fusion) { | ||
| RuntimeWorkSpace prepareRuntimeOrder( | ||
| const SegmentedFusion& segmented_fusion, | ||
| SchedulerRuntimeInfo* runtime_info) { | ||
| FUSER_PERF_SCOPE("prepareRuntimeOrder"); | ||
| RuntimeWorkSpace runtime_workspace; | ||
|
|
||
| // setup the order tensor dimensions are bound | ||
|
|
@@ -5398,7 +5555,8 @@ RuntimeWorkSpace prepareRuntimeOrder(const SegmentedFusion& segmented_fusion) { | |
| } | ||
| } | ||
|
|
||
| runtime_workspace.group_run_order = toposort(segmented_fusion.groups()); | ||
| runtime_workspace.group_run_order = | ||
| optimalTopoSort(segmented_fusion.groups(), runtime_info); | ||
|
|
||
| return runtime_workspace; | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.