-
Notifications
You must be signed in to change notification settings - Fork 75
Open
Description
Repro:
// Minimal repro: build a small fusion in which the 1-D input tv0 is used both
// before and after a full reduction, then run the InnerPersistent scheduler on
// it. The test has no assertion after scheduling; it only needs the
// heuristics/scheduling calls at the bottom to complete without crashing.
TEST_F(PersistentBufferTest, BroadcastDivByZero) {
auto fusion_ptr = std::make_unique<Fusion>();
auto& fusion = *fusion_ptr;
// Make this fusion the active one while the IR below is being built.
FusionGuard fg(fusion_ptr.get());
// Two BFloat16 inputs; at runtime tv0 is fed a {64} tensor and tv1 a
// {64, 16} tensor (see t0/t1 below). The first argument is presumably the
// number of dimensions -- confirm against makeSymbolicTensor.
auto tv0 = makeSymbolicTensor(1, DataType::BFloat16);
fusion.addInput(tv0);
auto tv1 = makeSymbolicTensor(2, DataType::BFloat16);
fusion.addInput(tv1);
// Broadcast tv0 so it can be added to the 2-D tv1. The flags presumably mark
// the second axis as the new broadcast axis -- confirm against broadcast().
auto tv2 = broadcast(tv0, {false, true});
// Cast both operands to Float before the add and the reduction.
auto tv3 = castOp(DataType::Float, tv2);
auto tv4 = castOp(DataType::Float, tv1);
auto tv5 = add(tv3, tv4);
// Reduce over both axes (0 and 1) of tv5.
auto tv6 = sum(tv5, {0, 1});
// Broadcast the reduction result so it can be combined with tv0 again.
auto tv7 = broadcast(tv6, {true});
// Second use of tv0, this time after the reduction: tv0 is read both on the
// way into the reduction (tv2) and here.
auto tv8 = castOp(DataType::Float, tv0);
auto tv9 = add(tv8, tv7);
// Cast the result back to BFloat16 for the output.
auto tv10 = castOp(DataType::BFloat16, tv9);
fusion.addOutput(tv10);
fusion.printMath();
// Concrete inputs: t0 has 64 elements, t1 has 64 * 16 elements.
auto options = at::TensorOptions().dtype(at::kBFloat16).device(at::kCUDA, 0);
auto t0 = at::randn({64}, options);
auto t1 = at::randn({64, 16}, options);
SchedulerRuntimeInfo runtime_info(fusion_ptr.get(), {t0, t1});
// Precondition for the repro: the InnerPersistent scheduler must accept this
// fusion with these inputs.
ASSERT_TRUE(Schedule::canSchedule(
SchedulerType::InnerPersistent, fusion_ptr.get(), runtime_info));
auto scheduler =
SchedulerEntry::makeSchedulerInstance(SchedulerType::InnerPersistent);
// Compute the heuristic parameters, then apply them to the fusion.
auto heuristic_params =
scheduler->computeHeuristics(fusion_ptr.get(), runtime_info);
scheduler->schedule(fusion_ptr.get(), heuristic_params.get());
}
This results in a division-by-zero error:
0x0000555556a951d6 in nvfuser::scheduler_utils::safeDiv (x=224, y=0) at /home/nmaruyama/nvfuser/debug1/csrc/scheduler/utils.h:104
104 return std::max(x / y, (int64_t)1);
(gdb) bt
#0 0x0000555556a951d6 in nvfuser::scheduler_utils::safeDiv (x=224, y=0) at /home/nmaruyama/nvfuser/debug1/csrc/scheduler/utils.h:104
#1 0x0000555556a8f485 in nvfuser::(anonymous namespace)::getMaxPersistentBatch (buffer_bytes_per_batch=0, target_threads_per_sm=896, register_overhead=16, is_high_bandwidth_flops_ratio=false)
at /home/nmaruyama/nvfuser/debug1/csrc/scheduler/normalization_inner.cpp:147
#2 0x0000555556a8dde8 in nvfuser::(anonymous namespace)::innerPersistentHeuristic2D (properties=..., rparams=0x55555b573f90) at /home/nmaruyama/nvfuser/debug1/csrc/scheduler/normalization_inner.cpp:414
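This is because the persistent buffer, t0, has fewer elements (64) than the number of reduced elements (16 * 64). As a result, buffer_bytes_per_batch is 0 in getMaxPersistentBatch (frame #1), which presumably is what leads to the zero divisor passed to safeDiv.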