You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
The commented-out code in MatmulOp::evaluate might be useful for future development or debugging. Consider keeping it or documenting why it was removed.
// if (const auto rfactor_did_idx = getRFactorDeviceDimensionIndex(out());
//     rfactor_did_idx != -1) {
//   matmul_out = matmul_out.unsqueeze(rfactor_did_idx);
// }
// const auto& [sizes, strides] = inferShapeOfOutput(out(), ee);
// auto meta_out = at::detail::empty_strided_meta(sizes, strides, a.dtype());
// if (meta_out.is_contiguous()) {
//   return {matmul_out};
// }
// auto strided_matmul_out = at::empty_strided(sizes, strides, a.options());
// strided_matmul_out = strided_matmul_out.copy_(matmul_out);
// return {strided_matmul_out};
The performance test in tests/cpp/test_matmul_perf.cpp should be evaluated against a baseline to ensure that the changes do not introduce performance regressions.
os << "\n";
}
}
os << std::endl;
}
namespace {
// Returns the output shardings of the given fusion. As a short cut, if none of
// the outputs have a device mesh, returns an empty vector indicating
// single-GPU execution.
std::vector<Sharding> getOutputShardings(Fusion* fusion) {
  std::vector<Sharding> output_shardings;
  // Short cut: if no output carries a device mesh, this is single-GPU
  // execution and an empty vector is returned.
  if (std::none_of(
          fusion->outputs().begin(), fusion->outputs().end(), [](Val* v) {
            if (auto* tv = dynamic_cast<TensorView*>(v)) {
              return tv->hasDeviceMesh();
            }
            // Non-tensor outputs never have a mesh.
            return false;
          })) {
    return output_shardings;
  }

  output_shardings.reserve(fusion->outputs().size());
  for (Val* out_val : fusion->outputs()) {
    if (auto* out_tv = dynamic_cast<TensorView*>(out_val)) {
      // Hidden outputs are not returned to the user, so they get no
      // sharding entry.
      if (fusion->getOutputAlias(out_tv).hide_output) {
        continue;
      }
      const DeviceMesh& mesh = out_tv->getDeviceMesh();
      Sharding& output_sharding = output_shardings.emplace_back(mesh);
      if (mesh.size() > 0) {
        // Record, for each DID parallel type, which logical axis (if any)
        // of the output is sharded on it. -1 means "not sharded".
        for (const ParallelType parallel_type : kParallelTypeDIDs) {
          if (const auto axis = getShardedLogicalAxis(out_tv, parallel_type);
              axis != -1) {
            output_sharding.setAxisIsShardedOn(axis, parallel_type);
          }
        }
      }
    } else {
      // Non-tensor outputs are represented with an empty mesh.
      output_shardings.emplace_back(DeviceMesh());
    }
  }
  return output_shardings;
}
} // namespace
std::pair<KernelArgumentHolder, std::vector<Sharding>> FusionDefinition::
execute(
KernelArgumentHolder args,
std::optional<int8_t> selected_device,
bool override_user_schedule,
bool capture_debug_output,
bool profile,
std::vector<std::string> _enable_options,
std::vector<std::string> _disable_options) const {
debug_output_ = std::nullopt;
std::stringstream debug_ss;
DebugStreamGuard dsg(capture_debug_output ? debug_ss : std::cout);
args.setDeviceIndex(selected_device);
NVF_CHECK(id().has_value(), "Valid fusion schedule is not available!");
auto scheds = fusionCache()->queryFusionSchedules(id().value());
if (profile) {
ProfilerOptionsGuard::getCurOptions().set(ProfilerOption::Enable);
}
EnableOptionsGuard enable_opt_guard;
for (constauto& _enable_option : _enable_options) {
std::optional<EnableOption> opt = stringToEnableOption(_enable_option);
NVF_CHECK(opt.has_value(), "Unrecognized enable_option: ", _enable_option);
EnableOptionsGuard::getCurOptions().set(opt.value());
}
DisableOptionsGuard disable_opt_guard;
for (constauto& _disable_option : _disable_options) {
std::optional<DisableOption> opt = stringToDisableOption(_disable_option);
NVF_CHECK(
opt.has_value(), "Unrecognized disable_option: ", _disable_option);
DisableOptionsGuard::getCurOptions().set(opt.value());
}
auto find_user_schedule = [&]() -> const UserSchedule* {
if (override_user_schedule) {
returnnullptr;
}
auto user_sched_id = fusionCache()->queryUserScheduleId(scheds, args);
if (!user_sched_id.has_value()) {
returnnullptr;
}
NVF_CHECK(
args.empty() || args.getDeviceIndex() > -1,
"Inputs are not all on the same device or don't match selection!");
const UserSchedule& user_sched = fusionCache()->queryUserSchedule(
scheds, user_sched_id.value(), args.getDeviceIndex());
return &user_sched;
};
constauto* user_sched = find_user_schedule();
KernelArgumentHolder outputs;
if (user_sched == nullptr) {
scheds->createExecutorIfNotExists();
outputs = scheds->auto_gen_schedules->runFusionWithInputs(
args, std::nullopt, args.getDeviceIndex());
} else {
if (isProfilerEnabledWithCupti()) {
FusionProfiler::start();
FusionProfiler::createSegments(1);
}
scheds->last_user_def_scheduled_ir = user_sched->scheduled_fusion.get();
scheds->last_user_def_executor = user_sched->executor.get();
if (user_sched->heuristic_params == nullptr) {
// Manual scheduleif (!user_sched->executor->isCompiled()) {
user_sched->executor->compile(user_sched->scheduled_fusion.get(), args);
}
outputs = user_sched->executor->run(args);
} else {
// Automatic scheduler was used for UserSchedule.// Pass launch and compile params to compileFusion and runFusion.if (!user_sched->executor->isCompiled()) {
user_sched->executor->compile(
user_sched->scheduled_fusion.get(),
args,
user_sched->heuristic_params->lparams,
user_sched->heuristic_params->cparams,
user_sched->heuristic_params->scheduler_type);
}
outputs = user_sched->executor->run(
args,
{},
user_sched->heuristic_params->lparams,
user_sched->heuristic_params->cparams);
}
if (isProfilerEnabledWithCupti()) {
FusionProfiler::segment(0).scheduler("user");
FusionProfiler::stop();
if (isProfilerPrintingEnabled()) {
debug() << FusionProfiler::profile();
}
}
}
if (profile) {
ProfilerOptionsGuard::getCurOptions().unset(ProfilerOption::Enable);
}
The beginEvent and endEvent methods in Trace class are now empty, which might remove important tracing functionality. Ensure that this is intentional and that tracing is handled elsewhere if necessary.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.This suggestion is invalid because no changes were made to the code.Suggestions cannot be applied while the pull request is closed.Suggestions cannot be applied while viewing a subset of changes.Only one suggestion per line can be applied in a batch.Add this suggestion to a batch that can be applied as a single commit.Applying suggestions on deleted lines is not supported.You must change the existing code in this line in order to create a valid suggestion.Outdated suggestions cannot be applied.This suggestion has been applied or marked resolved.Suggestions cannot be applied from pending reviews.Suggestions cannot be applied on multi-line comments.Suggestions cannot be applied while the pull request is queued to merge.Suggestion cannot be applied right now. Please check back later.
No description provided.