You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Multiple tests related to MisalignedVectorize have been removed. Ensure that the functionality they tested is covered by other tests or is no longer relevant.
ke.compile(&fusion, {t0, t1});
auto cg_outputs = ke.run({t0, t1});
testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__);
}
// Vectorized elementwise add on contiguous 2D inputs: both input caches and
// the output are split by 4 and parallelized with ParallelType::Vectorize.
TEST_F(NVFuserTest, FusionVectorization1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeContigTensor(2);
  auto tv1 = makeContigTensor(2);
  fusion.addInput(tv0);
  fusion.addInput(tv1);

  auto tv2 = add(tv0, tv1);
  fusion.addOutput(tv2);

  tv2->split(1, 16);
  tv2->split(1, 64);

  tv2->axis(0)->parallelize(ParallelType::BIDx);
  tv2->axis(2)->parallelize(ParallelType::TIDx);

  // Cache inputs after load and output before store so vectorization is
  // applied on the cache tensors as well as the output.
  auto c0 = tv0->cacheAfter();
  auto c1 = tv1->cacheAfter();
  tv2->cacheBefore();

  c0->computeAt(tv2, -2);
  c1->computeAt(tv2, -2);

  // Vectorize the innermost dimension (split by 4) of each cached tensor.
  std::vector<TensorView*> vectorized_tvs = {c0, c1, tv2};
  for (auto tv : vectorized_tvs) {
    tv->split(-1, 4);
    tv->axis(-1)->parallelize(ParallelType::Vectorize);
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  // by = 2048 is divisible by the vectorization factor of 4.
  const int bx = 128;
  const int by = 2048;
  at::Tensor t0 = at::randn({bx, by}, options);
  at::Tensor t1 = at::randn({bx, by}, options);

  KernelExecutor ke;
  ke.compile(&fusion, {t0, t1});
  auto cg_outputs = ke.run({t0, t1});

  testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__);
}
// Negative test: applying ParallelType::Vectorize to a non-innermost
// dimension must be rejected at compile time.
TEST_F(NVFuserTest, FusionVectorization2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  auto tv1 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  fusion.addInput(tv1);

  auto tv2 = add(tv0, tv1);
  fusion.addOutput(tv2);

  tv2->split(1, 16);
  tv2->split(1, 64);

  tv2->axis(0)->parallelize(ParallelType::BIDx);
  tv2->axis(2)->parallelize(ParallelType::TIDx);

  auto c0 = tv0->cacheAfter();
  auto c1 = tv1->cacheAfter();
  tv2->cacheBefore();

  c0->computeAt(tv2, -2);
  c1->computeAt(tv2, -2);

  std::vector<TensorView*> vectorized_tvs = {c0, c1, tv2};
  for (auto tv : vectorized_tvs) {
    tv->split(-1, 4);
    // Vectorize the wrong dimension
    tv->axis(-2)->parallelize(ParallelType::Vectorize);
  }

  KernelExecutor ke;
  // Make sure compilation fails
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
  ASSERT_ANY_THROW(ke.compile(&fusion));
}
// Runtime vectorization validation: a kernel compiled with vectorization
// should reject misaligned or non-divisible inputs at run time, while still
// accepting inputs whose alignment/extent are compatible.
// TODO: Re-enable once vectorization validation is fixed
TEST_F(NVFuserTest, FusionVectorization3_CUDA) {
  GTEST_SKIP();
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  auto tv1 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  fusion.addInput(tv1);

  auto tv2 = add(tv0, tv1);
  fusion.addOutput(tv2);

  tv2->split(1, 16);
  tv2->split(1, 64);

  tv2->axis(0)->parallelize(ParallelType::BIDx);
  tv2->axis(2)->parallelize(ParallelType::TIDx);

  auto c0 = tv0->cacheAfter();
  auto c1 = tv1->cacheAfter();
  tv2->cacheBefore();

  c0->computeAt(tv2, -2);
  c1->computeAt(tv2, -2);

  std::vector<TensorView*> vectorized_tvs = {c0, c1, tv2};
  for (auto tv : vectorized_tvs) {
    tv->split(-1, 4);
    tv->axis(-1)->parallelize(ParallelType::Vectorize);
  }

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  // by = 2049 is NOT divisible by the vectorization factor of 4, so running
  // with these inputs must throw.
  const int bx = 128;
  const int by = 2049;
  at::Tensor t0 = at::randn({bx, by}, options);
  at::Tensor t1 = at::randn({bx, by}, options);

  KernelExecutor ke;
  ke.compile(&fusion, {t0, t1});
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
  ASSERT_ANY_THROW(ke.run({t0, t1}));
  // Slicing off one element breaks the base-pointer alignment requirement.
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
  ASSERT_ANY_THROW(ke.run(
      {t0.index({"...", at::indexing::Slice(1)}),
       t1.index({"...", at::indexing::Slice(1)})}));

  // An offset of 4 keeps both alignment and divisibility, so this must run.
  t0 = at::randn({bx, 2048}, options).index({"...", at::indexing::Slice(4)});
  t1 = at::randn({bx, 2048}, options).index({"...", at::indexing::Slice(4)});
  auto cg_outputs = ke.run({t0, t1});
  testValidate(&fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__);
}
// Vectorized loads feeding an rFactor-ed reduction: inputs are cached and
// vectorized while the reduction is split across threads via rFactor.
TEST_F(NVFuserTest, FusionVectorizationRFactor_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeContigTensor(2);
  auto tv1 = makeContigTensor(2);
  fusion.addInput(tv0);
  fusion.addInput(tv1);

  auto tv2 = add(tv0, tv1);
  auto tv3 = sum(tv2, {-1});
  fusion.addOutput(tv3);

  tv3->split(-1, 128 * 4);
  tv3->split(-1, 4);

  // Reduce outer dim first
  auto tv4 = tv3->rFactor({-3, -1});

  // Tv3 will reduce threads
  auto tv6 = tv0->cacheAfter();
  auto tv7 = tv1->cacheAfter();

  tv0->computeAt(tv3, 1);
  tv1->computeAt(tv3, 1);

  tv3->axis(0)->parallelize(ParallelType::BIDx);

  tv0->computeAt(tv4, -2);
  tv1->computeAt(tv4, -2);

  // Vectorize the innermost dimension of the cached input loads.
  tv6->axis(-1)->parallelize(ParallelType::Vectorize);
  tv7->axis(-1)->parallelize(ParallelType::Vectorize);

  tv4->axis(-2)->parallelize(ParallelType::TIDx);
  tv3->axis(1)->parallelize(ParallelType::TIDx);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  const int bx = 128;
  const int by = 2048;
  at::Tensor t0 = at::randn({bx, by}, options);
  at::Tensor t1 = at::randn({bx, by}, options);

  KernelExecutor ke;
  ke.compile(&fusion, {t0, t1});
  auto cg_outputs = ke.run({t0, t1});

  auto aten_output = t0.add(t1).sum(1);
  testValidate(
      &fusion, cg_outputs, {t0, t1}, {aten_output}, __LINE__, __FILE__);
}
// Unswitched loops with extent one may omit else clause.
TEST_F(NVFuserTest, FusionSizeOneLoop1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Progressively broadcast tensors
  TensorView* tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  TensorView* tv1 = makeSymbolicTensor(2);
  fusion.addInput(tv1);
  TensorView* tv2 = makeSymbolicTensor(3);
  fusion.addInput(tv2);

  TensorView* tv3 = broadcast(tv0, {false, true});
  TensorView* tv4 = add(tv3, tv1);
  TensorView* tv5 = add(tv4, tv2);
  fusion.addOutput(tv5);

  // Split inner dimension
  tv5->split(1, 8);
  // Merge middle dims with outer dimensions
  tv5->merge(2);
  tv5->merge(0);

  // tv5[I0*I1o, I1i*I2]
  // Get a dim of size 1 to unswitch
  tv5->split(0, 1, false);

  // Compute everything inline
  tv0->computeAt(tv5, -1);

  tv5->axis(0)->parallelize(ParallelType::Unswitch);
  tv5->axis(1)->parallelize(ParallelType::BIDx);
  tv5->axis(2)->parallelize(ParallelType::TIDx);

  // Make sure the unswitched loop does not have an else clause.
  GpuLower gpulw(&fusion);
  gpulw.run();
  NVF_CHECK(!UnswitchInElseChecker::check(gpulw));

  const int x = 11;
  const int y = 12;
  const int z = 13;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({x}, options);
  at::Tensor t1 = at::randn({x, y}, options);
  at::Tensor t2 = at::randn({z, x, y}, options);

  KernelExecutor ke;
  ke.compile(&fusion, {t0, t1, t2});
  auto cg_outputs = ke.run({t0, t1, t2});

  testValidate(&fusion, cg_outputs, {t0, t1, t2}, __LINE__, __FILE__);
}
// The unswitched loop has extent one but inner loops don't. The else
// part should not be omitted.
TEST_F(NVFuserTest, FusionSizeOneLoop2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // x = 15 is deliberately not divisible by the split factor of 4, so the
  // inner loops need predication and the else clause must be kept.
  const int x = 15;
  auto tv0 = makeConcreteTensor({x});
  fusion.addInput(tv0);

  auto tv1 = add(tv0, IrBuilder::create<Val>(1.0));
  fusion.addOutput(tv1);

  tv1->split(-1, 4);
  tv1->split(-2, 1);
  tv1->axis(-2)->parallelize(ParallelType::Unswitch);

  // Make sure the size-one unswitched loop does not omit the else clause.
  GpuLower gpulw(&fusion);
  gpulw.run();
  NVF_CHECK(UnswitchInElseChecker::check(gpulw));

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({x}, options);

  KernelExecutor ke;
  ke.compile(&fusion, {t0});
  auto cg_outputs = ke.run({t0});

  testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
}
// Negative test: a producer parallelized with TIDx feeding a consumer
// parallelized with TIDy (without shared memory between them) is invalid
// and must be rejected at compile time.
TEST_F(NVFuserTest, FusionValidateParallelize1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);

  auto tv1 = add(tv0, IrBuilder::create<Val>(1.0));
  auto tv2 = add(tv1, IrBuilder::create<Val>(1.0));
  fusion.addOutput(tv2);

  tv1->axis(-1)->parallelize(ParallelType::TIDx);
  tv2->axis(-1)->parallelize(ParallelType::TIDy);

  // Invalid as tv1 and tv2 do not have the same ParallelType
  KernelExecutor ke;
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto)
  ASSERT_ANY_THROW(ke.compile(&fusion));
}
// Same producer/consumer parallel-type mismatch as the previous test, but
// placing the producer in shared memory makes the communication legal.
TEST_F(NVFuserTest, FusionValidateParallelize2_CUDA) {
Fusion fusion;
FusionGuard fg(&fusion);
auto tv0 = makeSymbolicTensor(1);
fusion.addInput(tv0);
auto tv1 = add(tv0, IrBuilder::create<Val>(1.0));
auto tv2 = add(tv1, IrBuilder::create<Val>(1.0));
fusion.addOutput(tv2);
// tv1 uses TIDx while its consumer tv2 uses TIDy.
tv1->axis(-1)->parallelize(ParallelType::TIDx);
tv2->axis(-1)->parallelize(ParallelType::TIDy);
tv1->setMemoryType(MemoryType::Shared);
// tv1 and tv2 do not have the same ParallelType, but tv1 is on shared
// memory, so it is valid
KernelExecutor ke;
ke.compile(&fusion);
}
// Producer and consumer are split and parallelized identically (TIDx on a
// size-4 inner split); with tv1 in global memory this must compile.
TEST_F(NVFuserTest, FusionValidateParallelize3_CUDA) {
Fusion fusion;
FusionGuard fg(&fusion);
auto tv0 = makeSymbolicTensor(1);
fusion.addInput(tv0);
auto tv1 = add(tv0, IrBuilder::create<Val>(1.0));
auto tv2 = add(tv1, IrBuilder::create<Val>(1.0));
fusion.addOutput(tv2);
// Identical scheduling on producer and consumer.
tv1->split(-1, 4);
tv1->axis(-1)->parallelize(ParallelType::TIDx);
tv2->split(-1, 4);
tv2->axis(-1)->parallelize(ParallelType::TIDx);
tv1->setMemoryType(MemoryType::Global);
// tv1 and tv2 have the same shape and ParallelType
KernelExecutor ke;
ke.compile(&fusion);
}
// Producer and consumer use different split factors (4 vs 8) but both use
// TIDx; with tv1 in global memory the communication is still supported.
TEST_F(NVFuserTest, FusionValidateParallelize4_CUDA) {
Fusion fusion;
FusionGuard fg(&fusion);
auto tv0 = makeSymbolicTensor(1);
fusion.addInput(tv0);
auto tv1 = add(tv0, IrBuilder::create<Val>(1.0));
auto tv2 = add(tv1, IrBuilder::create<Val>(1.0));
fusion.addOutput(tv2);
// Producer split by 4, consumer split by 8 — mismatched shapes.
tv1->split(-1, 4);
tv1->axis(-1)->parallelize(ParallelType::TIDx);
tv2->split(-1, 8);
tv2->axis(-1)->parallelize(ParallelType::TIDx);
tv1->setMemoryType(MemoryType::Global);
// tv1 and tv2 do not have the same shape but global memory comm is supported.
KernelExecutor ke;
ke.compile(&fusion);
}
// Same mismatched split factors as the previous test (4 vs 8), but with the
// producer in shared memory instead of global memory; still valid.
TEST_F(NVFuserTest, FusionValidateParallelize5_CUDA) {
Fusion fusion;
FusionGuard fg(&fusion);
auto tv0 = makeSymbolicTensor(1);
fusion.addInput(tv0);
auto tv1 = add(tv0, IrBuilder::create<Val>(1.0));
auto tv2 = add(tv1, IrBuilder::create<Val>(1.0));
fusion.addOutput(tv2);
// Producer split by 4 and placed in shared memory.
tv1->split(-1, 4);
tv1->axis(-1)->parallelize(ParallelType::TIDx);
tv1->setMemoryType(MemoryType::Shared);
// Consumer split by 8 — different shape from the producer.
tv2->split(-1, 8);
tv2->axis(-1)->parallelize(ParallelType::TIDx);
// tv1 and tv2 do not have the same shape, but tv1 is on shared
// memory, so it is valid
KernelExecutor ke;
ke.compile(&fusion);
}
// See issue #995TEST_F(NVFuserTest, FusionValidateParallelize6_CUDA) {
Fusion fusion;
FusionGuard fg(&fusion);
int64_t W = 5, X = 6, Y = 7, Z = 8;
auto tv0 = makeConcreteTensor({X, Y, Z});
auto tv1 = makeConcreteTensor({W, X, Y, Z});
fusion.addInput(tv0);
fusion.addInput(tv1);
auto tv2 = add(tv0, IrBuilder::create<Val>(1.0));
auto tv3 = broadcast(tv2, {true, false, false, false});
auto tv4 = add(tv3, tv1);
fusion.addOutput(tv4);
tv4->merge(0);
tv4->merge(0);
tv4->merge(0);
tv4->split(0, 4);
tv4->split(0, 3);
tv4->split(0, 2);
TransformPropagatorWithCheck propagator(tv4);
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.This suggestion is invalid because no changes were made to the code.Suggestions cannot be applied while the pull request is closed.Suggestions cannot be applied while viewing a subset of changes.Only one suggestion per line can be applied in a batch.Add this suggestion to a batch that can be applied as a single commit.Applying suggestions on deleted lines is not supported.You must change the existing code in this line in order to create a valid suggestion.Outdated suggestions cannot be applied.This suggestion has been applied or marked resolved.Suggestions cannot be applied from pending reviews.Suggestions cannot be applied on multi-line comments.Suggestions cannot be applied while the pull request is queued to merge.Suggestion cannot be applied right now. Please check back later.
It hasn't been used. If needed, we should start from scratch.