-
Notifications
You must be signed in to change notification settings - Fork 80
Search for optimal segment execution order #4973
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
8e294ba
7b279c2
6012846
da53605
daeb547
8076f0a
b29ad5e
6488c24
e7cf966
589653f
9e7c2ba
647fdb5
1d9b750
29b684c
05b42c5
082d379
5de4683
1ec54f0
3998406
be8332a
91e942f
253a750
a53650b
cb77db5
ce7335d
f515cd4
5892206
4e9ba4d
cc753c9
760b73b
bcad2a3
51c8c6c
2e85c6f
ef65076
0f4126c
620cf3a
290b652
ed4585f
cdf8598
d5951f3
a67c6e2
6480530
42fcc64
41aa154
9493cf0
020c333
5aafe11
b23274d
95dcf42
3c5f091
a52875f
589b997
5e9e899
55bbd7a
532ddcc
549124e
595ce79
d95e0bb
7ad6e97
9dc649f
957dc67
ed2395e
965436b
3546fa3
239a80d
51ee594
2de536d
3a0f9e2
c270fec
f313e1e
fb36a15
5cd1e66
f1260a1
337921a
60aa579
d08bd53
ab38575
a444097
fa63660
efbe8ba
8e4f508
c7cec9b
2ae3a31
d80f63b
64d1fa9
064d939
6c1a6f7
6011cac
e92731e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,7 +14,9 @@ | |
| #include <debug.h> | ||
| #include <device_lower/utils.h> | ||
| #include <disjoint_set.h> | ||
| #include <exceptions.h> | ||
| #include <fusion.h> | ||
| #include <graph/task_graph.h> | ||
| #include <instrumentation.h> | ||
| #include <ir/all_nodes.h> | ||
| #include <ir/cloner.h> | ||
|
|
@@ -28,6 +30,7 @@ | |
| #include <options.h> | ||
| #include <scheduler/debug_utils.h> | ||
| #include <scheduler/normalization_utils.h> | ||
| #include <scheduler/runtime_info.h> | ||
| #include <transform_iter.h> | ||
| #include <transform_replay.h> | ||
|
|
||
|
|
@@ -2001,8 +2004,159 @@ bool SegmentCandidateFinder::hasSegmentHints(Fusion* fusion) { | |
| } | ||
|
|
||
| namespace { | ||
|
|
||
| class SegmentedGroupTaskGraphConverter { | ||
| public: | ||
| static TaskGraph convert( | ||
| const std::vector<SegmentedGroup*>& groups, | ||
| SchedulerRuntimeInfo* runtime_info) { | ||
| SegmentedGroupTaskGraphConverter conv(runtime_info); | ||
| for (SegmentedGroup* group : groups) { | ||
| conv.processGroup(group); | ||
| } | ||
| return TaskGraph(conv.all_tasks_, conv.all_data_); | ||
| } | ||
|
|
||
| private: | ||
| SegmentedGroupTaskGraphConverter(SchedulerRuntimeInfo* runtime_info) | ||
| : runtime_info_(runtime_info) {} | ||
|
|
||
| void processGroup(SegmentedGroup* group) { | ||
| // When there are aliased inputs, they will appear as _outputs_ of the | ||
| // SegmentedGroup. To avoid actually adding those as outputs, we record them | ||
| // here first | ||
| std::unordered_set<TensorView*> aliased_input_tvs; | ||
| for (Val* v : group->outputs()) { | ||
| if (auto* aliased_input_tv = dynamic_cast<TensorView*>( | ||
| v->fusion()->getOutputAlias(v).aliased_io)) { | ||
| aliased_input_tvs.insert(aliased_input_tv); | ||
| } | ||
| } | ||
|
|
||
| std::vector<TaskGraph::DataId> inputs; | ||
| // These are fusion inputs, so they are not edges between segments | ||
| for (Val* v : group->inputs()) { | ||
| if (auto* tv = dynamic_cast<TensorView*>(v)) { | ||
| // Ignore scalar inputs | ||
| TaskGraph::DataId data_id = maybeRegisterTv(tv); | ||
| TaskGraph::Data& data = all_data_.at(data_id); | ||
| data.can_free = !tv->isFusionInput(); | ||
| inputs.push_back(data_id); | ||
| } | ||
| } | ||
| std::vector<TaskGraph::DataId> outputs; | ||
| for (Val* v : group->outputs()) { | ||
| if (auto* tv = dynamic_cast<TensorView*>(v)) { | ||
| if (aliased_input_tvs.count(tv) || tv->isFusionInput()) { | ||
| // These are counted as outputs but are actually _inputs_ to this | ||
| // group | ||
| // Note that we skip setting alias links in the graph when the input | ||
| // is simply forwarded to the outputs unchanged. | ||
| // See AliasTest.TrivialInputForwarding for an example of this | ||
| continue; | ||
| } | ||
| TaskGraph::DataId data_id = maybeRegisterTv(tv); | ||
| TaskGraph::Data& data = all_data_.at((size_t)data_id); | ||
| if (auto* aliased_input_tv = dynamic_cast<TensorView*>( | ||
| tv->fusion()->getOutputAlias(tv).aliased_io)) { | ||
| data.aliases_input = maybeRegisterTv(aliased_input_tv); | ||
| } | ||
| data.can_free = !tv->isFusionOutput(); | ||
| outputs.push_back(data_id); | ||
| } | ||
| } | ||
|
|
||
| // TODO: inspect compiled segment executors to determine temp gmem needed | ||
| TaskGraph::Size temp_space = 0; | ||
|
Comment on lines
+2069
to
+2070
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We currently prepare the runtime order before compilation of segments. However, if we did it afterward we would have access to the executor telling us how much temp space to use. |
||
|
|
||
| all_tasks_.emplace_back(inputs, outputs, temp_space); | ||
| } | ||
|
|
||
| int64_t getNumAllocatedElements(TensorView* tv) { | ||
| if (tv->isCpuScalar()) { | ||
| // Since CPU scalars do not result in any GPU allocation we count them as | ||
| // empty. | ||
| return 0; | ||
| } | ||
| int64_t numel = 1; | ||
| // Use ExpressionEvaluator for computed tensors assuming they are | ||
| // contiguous | ||
| for (IterDomain* id : tv->getMaybeAllocationDomain()) { | ||
| if (id->isBroadcast() || id->isReduction() || id->isDeviceDim()) { | ||
| continue; | ||
| } | ||
| PolymorphicValue pv = std::monostate{}; | ||
| if (runtime_info_ != nullptr) { | ||
| pv = runtime_info_->expressionEvaluator().evaluate(id->extent()); | ||
| } | ||
| // If we can't determine the size of this dimension, just assume | ||
| // it's 2. This way we will give precedence to tensors with | ||
| // allocation domains that have more concrete IDs. | ||
| int64_t dim_size = pv.is<int64_t>() ? pv.as<int64_t>() : 2; | ||
| numel *= dim_size; | ||
| } | ||
| return numel; | ||
| } | ||
|
|
||
| TaskGraph::DataId maybeRegisterTv(TensorView* tv) { | ||
| auto it = tv2dataid_.find(tv); | ||
| if (it != tv2dataid_.end()) { | ||
| // tv is already registered | ||
| return it->second; | ||
| } | ||
|
|
||
| // Register this TV | ||
| auto new_id = static_cast<TaskGraph::DataId>(std::ssize(all_data_)); | ||
| tv2dataid_[tv] = new_id; | ||
|
|
||
| // If the TV is of type Index, we don't know if it will be 8 bytes or 4 | ||
| // bytes until we are given input | ||
| DataType dtype = tv->dtype(); | ||
| if (dtype == DataType::Index) { | ||
| // If we don't have runtime info, assume it is 64-bit | ||
| dtype = runtime_info_ != nullptr ? runtime_info_->getIndexType() | ||
| : DataType::Int; | ||
| } | ||
| TaskGraph::Size size = | ||
| getNumAllocatedElements(tv) * dataTypeSizeByte(dtype); | ||
|
|
||
| all_data_.emplace_back( | ||
| /*definition=*/std::nullopt, | ||
| /*uses=*/std::vector<TaskGraph::TaskId>{}, | ||
| /*aliases_input=*/-1, | ||
| size, | ||
| /*can_free=*/true); | ||
| return new_id; | ||
| } | ||
|
|
||
| private: | ||
| SchedulerRuntimeInfo* runtime_info_; | ||
| std::vector<TaskGraph::Data> all_data_; | ||
| std::unordered_map<TensorView*, TaskGraph::DataId> tv2dataid_; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Did you find this indirection at all useful? I once created such an indirection thinking it would be faster. It turned out to speed things up very little and made the implementation unnecessarily more complicated.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I am not sure whether it's faster in practice, but I can try it and check. I did originally try and keep things small and simple for perf reasons because I knew we'd need to do some expensive iterative optimizations. There are cases in our tests where exhaustive search runs out of time even if I increase the time limit to 100 seconds. Of course, it's unclear whether we get any benefit from more iterations even in those cases and an algorithmic improvement would probably be preferable. I agree that usually the indirection is slightly annoying. I need to use
||
| std::vector<TaskGraph::Task> all_tasks_; | ||
| }; | ||
|
|
||
| std::vector<SegmentedGroup*> optimalTopoSort( | ||
| const std::vector<SegmentedGroup*>& groups, | ||
| SchedulerRuntimeInfo* runtime_info) { | ||
| FUSER_PERF_SCOPE("optimalTopoSort"); | ||
|
|
||
| TaskGraph graph = | ||
| SegmentedGroupTaskGraphConverter::convert(groups, runtime_info); | ||
|
|
||
| TaskGraph::SortResult result = graph.findOptimalOrder(/*validate=*/false); | ||
|
|
||
| std::vector<SegmentedGroup*> order; | ||
| order.reserve(groups.size()); | ||
| for (const TaskGraph::Step& step : result.steps) { | ||
| order.push_back(groups.at((size_t)step.task)); | ||
| } | ||
| return order; | ||
| } | ||
|
|
||
| std::vector<SegmentedGroup*> toposort( | ||
| const std::vector<SegmentedGroup*>& groups) { | ||
| FUSER_PERF_SCOPE("toposort"); | ||
| std::deque<SegmentedGroup*> to_visit; | ||
| std::unordered_map<SegmentedGroup*, int64_t> num_producer_edges; | ||
| for (SegmentedGroup* group : groups) { | ||
|
|
@@ -5383,7 +5537,10 @@ void SegmentedFusion::annotateFP16IntermediateTensors() { | |
| } | ||
| } | ||
|
|
||
| RuntimeWorkSpace prepareRuntimeOrder(const SegmentedFusion& segmented_fusion) { | ||
| RuntimeWorkSpace prepareRuntimeOrder( | ||
| const SegmentedFusion& segmented_fusion, | ||
| SchedulerRuntimeInfo* runtime_info) { | ||
| FUSER_PERF_SCOPE("prepareRuntimeOrder"); | ||
| RuntimeWorkSpace runtime_workspace; | ||
|
|
||
| // setup the order tensor dimensions are bound | ||
|
|
@@ -5398,7 +5555,8 @@ RuntimeWorkSpace prepareRuntimeOrder(const SegmentedFusion& segmented_fusion) { | |
| } | ||
| } | ||
|
|
||
| runtime_workspace.group_run_order = toposort(segmented_fusion.groups()); | ||
| runtime_workspace.group_run_order = | ||
| optimalTopoSort(segmented_fusion.groups(), runtime_info); | ||
|
|
||
| return runtime_workspace; | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.