Normalize operator for GPU backend #1986
Conversation
!build
```diff
@@ -270,7 +270,7 @@ class NormalizeImplGPU {
   // this condition is false when the other Setup overload was used
   if (axes_.data() != axes.data())
-    axes_ = { axes_.begin(), axes_.end() };
+    axes_ = { axes.begin(), axes.end() };
```
This is a bug fix.
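For context, a minimal sketch of why the original line was a silent no-op (using `std::vector<int>` for illustration; the actual member type in DALI differs): rebuilding a container from its own iterator range leaves it unchanged, so the caller's axes were never stored.

```cpp
#include <cassert>
#include <vector>

int main() {
  std::vector<int> axes_ = {0, 1};  // previously configured axes (member)
  std::vector<int> axes = {2, 3};   // axes passed to this Setup overload

  // Buggy form: constructs a temporary from axes_'s own range and
  // assigns it back - the member keeps its old contents.
  axes_ = { axes_.begin(), axes_.end() };
  assert((axes_ == std::vector<int>{0, 1}));

  // Fixed form: copies the caller's axes into the member.
  axes_ = { axes.begin(), axes.end() };
  assert((axes_ == std::vector<int>{2, 3}));
  return 0;
}
```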
CI MESSAGE: [1354384]: BUILD STARTED
CI MESSAGE: [1354384]: BUILD PASSED
```cpp
void Normalize<CPUBackend>::AllocTempStorage() {
  const TypeInfo &float_type = TypeTable::GetTypeInfo(DALI_FLOAT);
  int n = data_shape_.num_samples();
  const TensorListShape<> &tmp_shape = batch_norm_
    ? uniform_list_shape(n, param_shape_[0])  // extend to all samples, to enable parallelism
    : param_shape_;

  if (ShouldCalcMean()) {
    mean_.Resize(tmp_shape);
  } else if (has_tensor_mean_) {
    assert(!batch_norm_);
    // use mean as-is
    assert(param_shape_ == mean_input_.shape);
  } else if (has_scalar_mean_) {
    // need to broadcast mean to match required shape
    if (is_uniform(param_shape_)) {
      // if param_shape_ is uniform, we need only one tensor
      mean_.Resize(TensorListShape<>({ param_shape_[0] }));
    } else {
      mean_.Resize(param_shape_);
    }
  }
  if (ShouldCalcStdDev()) {
    inv_stddev_.Resize(tmp_shape);
  } else if (has_tensor_stddev_) {
    assert(!batch_norm_);
    // we need space to calculate inverse stddev
    inv_stddev_.Resize(stddev_input_.shape);
  } else {
    assert(has_scalar_stddev_);
    if (!IsFullReduction()) {
      // need to broadcast stddev to match required shape
      if (is_uniform(param_shape_)) {
        // if param_shape_ is uniform, we need only one tensor
        inv_stddev_.Resize(TensorListShape<>({ param_shape_[0] }));
      } else {
        inv_stddev_.Resize(param_shape_);
      }
    }
  }
  mean_.set_type(float_type);
  inv_stddev_.set_type(float_type);
}
```
This function is just moved from the header to the .cc file; don't read too much into it.
You should write this at the beginning of the function.
A GitHub comment on multiple lines appears at the bottom. I marked the whole function and it was highlighted, so I thought it would stay this way :(
!build
```cpp
for (int iter = 0; iter < 3; iter++) {
  auto req = kmgr_.Setup<Kernel>(0, ctx, data_shape_, param_shape_,
                                 use_scalar_base_, use_scalar_scale_, scale_is_stddev_);
  ASSERT_EQ(req.output_shapes.size(), 1u);
  ASSERT_EQ(req.output_shapes[0], data_shape_);
  out_.reshape(data_shape_);
  ref_.reshape(data_shape_);

  Launch(ctx);

  int param_samples = param_shape_.num_samples();
  auto ref_base = use_scalar_base_
    ? ScalarTLV(scalar_base_, param_samples, data_shape_.sample_dim())
    : base_.cpu();
  auto ref_scale = use_scalar_scale_
    ? ScalarTLV(scalar_scale_, param_samples, data_shape_.sample_dim())
    : scale_.cpu();
  RefNormalize(ref_.cpu(), in_.cpu(), ref_base, ref_scale,
               global_scale_, shift_, scale_is_stddev_, epsilon_);

  if (scale_is_stddev_ && !std::is_integral<Out>::value)
    Check(out_.cpu(), ref_.cpu(), EqualEpsRel(1e-6, 1e-6));
  else
    Check(out_.cpu(), ref_.cpu(), EqualUlp(4));
}
```
Just added the loop - no changes inside.
```cpp
for (int iter = 0; iter < 3; iter++) {
  test.Setup(in_shape, ref_out_shape, make_span(axes), false, true);
  EXPECT_GE(test.kernel.GetNumStages(), 4);  // both reduced axes must be split
  test.FillData(0, 255);
  test.Run();

  RefMean<int64_t>(test.ref.cpu(), test.in.cpu(), make_span(axes), false, true);
  test.Check(EqualEpsRel(1e-5, 1e-6));
}
```
Just added the loop - no changes inside.
```cpp
for (int iter = 0; iter < 3; iter++) {
  test.Setup(in_shape, ref_out_shape, make_span(axes), true, false);
  EXPECT_GE(test.kernel.GetNumStages(), 4);  // both reduced axes must be split
  test.FillData(-100, 100);

  test.ref = RefStdDev(test.in.cpu(), mean_cpu);
  test.Run(fake_mean.gpu());

  test.Check(EqualEpsRel(1e-5, 1e-6));
  test.ref = RefStdDev(test.in.cpu(), mean_cpu);
  test.Check(EqualEpsRel(1e-5, 1e-6));
}
```
Just added the loop - no changes inside.
```cpp
for (int iter = 0; iter < 3; iter++) {
  test.Setup(in_shape, ref_out_shape, make_span(axes), true, false);
  EXPECT_GE(test.kernel.GetNumStages(), 2);  // both reduced axes must be split
  test.FillData(-100, 100);

  test.ref = RefStdDev(test.in.cpu(), mean_cpu);
  test.Run(fake_mean.gpu());

  test.Check(EqualEpsRel(1e-5, 1e-6));
  test.ref = RefStdDev(test.in.cpu(), mean_cpu);
  test.Check(EqualEpsRel(1e-5, 1e-6));
}
```
Just added the loop - no changes inside.
```cpp
for (int iter = 0; iter < 3; iter++) {
  test.Setup(in_shape, ref_out_shape, make_span(axes), true, true);
  EXPECT_GE(test.kernel.GetNumStages(), 2);  // both reduced axes must be split
  test.FillData(-100, 100);

  test.ref = RefStdDev(test.in.cpu(), mean_cpu, 1, 12000, true);
  test.Run(fake_mean.gpu(), 1, 12000);

  test.Check(EqualEpsRel(1e-5, 1e-6));
  test.ref = RefStdDev(test.in.cpu(), mean_cpu, 1, 12000, true);
  test.Check(EqualEpsRel(1e-5, 1e-6));
}
```
Just added the loop - no changes inside.
```diff
@@ -743,6 +745,7 @@ class ReduceImplGPU {
   void InitStages() {
     const int nsamples = in_shape_.num_samples();
     const int in_dim = in_shape_.sample_dim();
+    stages_.clear();
```
This is an important bug fix.
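To illustrate why the `clear()` matters, a hypothetical reduction of the bug (simplified names; not the actual ReduceImplGPU code): `InitStages()` appends to `stages_`, so re-running Setup, as the new 3-iteration tests do, would accumulate stages from previous runs without the clear.

```cpp
#include <cassert>
#include <vector>

// Hypothetical stand-in for a kernel whose Setup builds a stage list.
struct ReduceImplSketch {
  std::vector<int> stages_;

  void InitStages(int num_stages) {
    stages_.clear();  // the fix: start from an empty stage list on every Setup
    for (int i = 0; i < num_stages; i++)
      stages_.push_back(i);
  }
};

int main() {
  ReduceImplSketch r;
  r.InitStages(4);
  r.InitStages(4);  // second Setup, as in the iterated tests
  assert(r.stages_.size() == 4u);  // without clear(), this would be 8
  return 0;
}
```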
```cpp
for (int iter = 0; iter < 3; iter++) {
  test.Setup(in_shape, ref_out_shape, make_span(axes), false, true);
  test.FillData(0, 255);
  test.Run();
  RefReduce(test.ref.cpu(), test.in.cpu(), make_span(axes), false, true, reductions::sum());
  test.Check();
}
```
Just added the loop - no changes inside.
```diff
 def axes2names(axes, layout='abcdefghijklmnopqrstuvwxyz'):
     return "".join([layout[axis] for axis in axes])

 def _test_up_to_5D_all_axis_combinations(device):
-    batch_size = 10
+    batch_size = 5
```
Batch size is reduced, but there are two iterations now.
CI MESSAGE: [1356273]: BUILD STARTED
CI MESSAGE: [1356273]: BUILD FAILED
dali/kernels/common/copy.h
```cpp
/**
 * @brief Copy in to out, merging contiguous samples.
 *
 * Contiguous samples are merged to reduce number of copies issuead and, at the same time,
```
```diff
- * Contiguous samples are merged to reduce number of copies issuead and, at the same time,
+ * Contiguous samples are merged to reduce number of copies issued and, at the same time,
```
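For intuition, a simplified sketch of the merging idea described in this header (hypothetical types and names; the real copy.h also handles input and output batches grouped differently): runs of samples that are contiguous in both source and destination collapse into a single memcpy.

```cpp
#include <cstddef>
#include <cstring>

// Hypothetical flat view of a batch: per-sample pointers and byte sizes.
struct FlatBatch {
  char **data;    // data[i] points at sample i
  size_t *sizes;  // sizes[i] is sample i's length in bytes
  int n;          // number of samples
};

// Copy all samples, merging runs that are contiguous on both sides
// (assumes matching per-sample sizes in `out` and `in`).
inline void CopyMerged(const FlatBatch &out, const FlatBatch &in) {
  int i = 0;
  while (i < in.n) {
    size_t len = in.sizes[i];
    int j = i + 1;
    // Extend the run while the next sample starts exactly where the
    // current run ends, in both the input and the output.
    while (j < in.n &&
           in.data[j] == in.data[i] + len &&
           out.data[j] == out.data[i] + len) {
      len += in.sizes[j];
      j++;
    }
    std::memcpy(out.data[i], in.data[i], len);  // one copy for the whole run
    i = j;
  }
}

int main() {
  char src[6] = {'a', 'b', 'c', 'd', 'e', 'f'};
  char dst[6] = {};
  char *in_ptrs[3] = {src, src + 2, src + 4};
  char *out_ptrs[3] = {dst, dst + 2, dst + 4};
  size_t sizes[3] = {2, 2, 2};
  CopyMerged({out_ptrs, sizes, 3}, {in_ptrs, sizes, 3});  // collapses to one memcpy
  return std::memcmp(dst, src, 6) == 0 ? 0 : 1;
}
```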
CI MESSAGE: [1356273]: BUILD PASSED
```cpp
}

// We can't just Clear() the scratchpad to reuse it, because temporary buffers are also
// stored there - so let's make a snapshot of current allocation state and restore it
```
Why snapshot before, rather than cleaning up after?
Because the scratchpad is not empty at this point - it contains temporary tensor lists.
Maybe it's not the most efficient in terms of the number of copies, but how about adding a scoped state saver so you can nest it?
That's a jolly good idea, done.
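A minimal sketch of what such a scoped state saver could look like (hypothetical names; DALI's actual scratchpad interface differs): the guard records the allocation watermark on construction and rolls it back on destruction, so savers nest naturally with scopes.

```cpp
#include <cstddef>

// Hypothetical bump-allocator scratchpad, reduced to a single counter.
struct Scratchpad {
  size_t used = 0;
  void Alloc(size_t bytes) { used += bytes; }  // stub: just advances the watermark
};

// RAII guard: snapshots the watermark and restores it on scope exit.
class ScratchpadStateSaver {
 public:
  explicit ScratchpadStateSaver(Scratchpad &s) : s_(s), saved_(s.used) {}
  ~ScratchpadStateSaver() { s_.used = saved_; }
 private:
  Scratchpad &s_;
  size_t saved_;
};

int main() {
  Scratchpad sp;
  sp.Alloc(1024);  // temporary tensor lists already live here
  {
    ScratchpadStateSaver save(sp);
    sp.Alloc(4096);  // per-stage temporaries
  }  // watermark rolled back; the outer buffers stay valid
  return sp.used == 1024 ? 0 : 1;
}
```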
!build
CI MESSAGE: [1356695]: BUILD STARTED
CI MESSAGE: [1356695]: BUILD PASSED
LGTM, but please check my comments here, and a couple of comments in https://app.reviewnb.com/NVIDIA/DALI/pull/1986/files/
```cpp
namespace {

template <typename ToUpdate, typename Other>
```
```diff
-template <typename ToUpdate, typename Other>
+template <typename InOut, typename Other>
```
just a suggestion
```cpp
if (in.data[i] != in_start + in_len || out.data[o] != out_start + out_len) {
  // discontinuity detected
  if (in_len < out_len) {
    assert(in.data[i] != in_start + in_len);
```
this assert seems a bit redundant to me, since this condition is checked 3 lines above
```cpp
    continue;
  }
  if (out_len < in_len) {
    assert(out.data[o] != out_start + out_len);
```
same here
dali/kernels/common/copy.h
```cpp
    continue;
  }
  assert(in_len == out_len && "Groups of contiguous samples must have equal length");
  if (out_len)
```
```diff
-if (out_len)
+if (out_len > 0)
```
reads better in my opinion
dali/kernels/common/copy.h
```cpp
assert(i == M && o == N);
assert(in_len == out_len && "Groups of contiguous samples must have equal length");

if (out_len)
```
```diff
-if (out_len)
+if (out_len > 0)
```
Just a suggestion
* Extend reduction tests to multiple iterations.
* Add multiple iterations to python tests for normalize operator.
* Change Normalize tutorial notebook to use GPU backend.

Signed-off-by: Michał Zientkiewicz <mzient@gmail.com>
CI MESSAGE: [1363718]: BUILD STARTED
CI MESSAGE: [1363718]: BUILD PASSED
JIRA TASK: DALI-1243