Add JPEG color conversion and chroma subsampling kernel #2771

jantonguirao · 2021-03-08T18:37:30Z

Signed-off-by: Joaquin Anton <janton@nvidia.com>

Why we need this PR?

Pick one, remove the rest

It adds a new feature needed for the JPEG artifact augmentation

What happened in this PR?

Fill relevant points, put NA otherwise. Replace anything inside []

What solution was applied:
Added a CUDA kernel for JPEG RGB to YCbCr conversion plus chroma subsampling
Affected modules and functionalities:
New functionality
Key points relevant for the review:
CUDA kernel, performance
Validation and testing:
C++ tests added
Documentation (including examples):
NA

JIRA TASK: [DALI-1905]

JanuszL · 2021-03-10T16:11:20Z

dali/kernels/imgproc/jpeg/chroma_subsample_gpu.cuh

+template <typename T>
+__inline__ __device__ vec<3, T> rgb_to_ycbcr(const vec<3, uint8_t> rgb) {
+  vec<3, T> ycbcr;
+  ycbcr.x = rgb_to_y<T>(rgb);


Do you need the `T` template parameter here, as you are not using it when rgb_to_ycbcr is called?

not really. Fixed

mzient · 2021-03-10T16:20:27Z

dali/kernels/imgproc/jpeg/chroma_subsample_gpu.cuh

+  return ycbcr;
+}
+
+template <bool horz_subsample, bool vert_subsample, typename T = uint8_t, int in_nchannels = 3>


Suggested change

template <bool horz_subsample, bool vert_subsample, typename T = uint8_t, int in_nchannels = 3>

template <bool horz_subsample, bool vert_subsample, typename T = uint8_t>

I don't think that RGBToYCbCr makes sense for anything other than 3 channels.

Signed-off-by: Joaquin Anton <janton@nvidia.com>

Signed-off-by: Michał Zientkiewicz <mzient@gmail.com>

Signed-off-by: Joaquin Anton <janton@nvidia.com>

dali/kernels/imgproc/jpeg/chroma_subsample_gpu.cuh

JanuszL · 2021-03-11T17:40:18Z

dali/kernels/imgproc/jpeg/chroma_subsample_gpu_test.cu

+//  chroma_subsample_params_t<uint8_t, false, true>,
+//  chroma_subsample_params_t<uint8_t, true, false>,
+//  chroma_subsample_params_t<uint8_t, false, false>


mzient · 2021-03-12T13:51:00Z

dali/kernels/imgproc/jpeg/chroma_subsample_gpu.cuh

+  for (int i = 0; i < N; i++)
+    *ptr++ = v[i];


Suggested change

for (int i = 0; i < N; i++)

*ptr++ = v[i];

for (int i = 0; i < N; i++)

ptr[i] = v[i];

I think that 2 kinds of indexing are more confusing both for the human reader and compiler.

mzient · 2021-03-12T14:00:34Z

dali/kernels/imgproc/jpeg/chroma_subsample_gpu.cuh

+  // Assuming CUDA block has:
+  // - width 32, leads to 4 horizontal blocks of 8
+  // - height 8, so a single block 8x8 fits vertically
+  __shared__ T luma_blk[4][luma_blk_h][luma_blk_w];


Suggested change

__shared__ T luma_blk[4][luma_blk_h][luma_blk_w];

__shared__ T luma_blk[4][luma_blk_h][luma_blk_strides[1]];

mzient · 2021-03-12T14:01:01Z

dali/kernels/imgproc/jpeg/chroma_subsample_gpu.cuh

+  __shared__ T cb_blk[4][8][8];
+  __shared__ T cr_blk[4][8][8];


Suggested change

__shared__ T cb_blk[4][8][8];

__shared__ T cr_blk[4][8][8];

__shared__ T cb_blk[4][chroma_blk_sz[1]][chroma_blk_strides[1]];

__shared__ T cr_blk[4][chroma_blk_sz[1]][chroma_blk_strides[1]];

mzient · 2021-03-12T14:09:51Z

dali/kernels/imgproc/jpeg/chroma_subsample_gpu.cuh

+      rgb_to_ycbcr_chroma_subsample<horz_subsample, vert_subsample>(
+          blk_offset, offset, luma, cb, cr, in);
+
+      __syncthreads();


In this kernel, the synchronization is not necessary - you're accessing only the elements produced by this thread.

I am aware of that. I just put it here, because I was planning to build on top of that. I also don't need the shared memory.

mzient · 2021-03-12T14:23:15Z

dali/kernels/imgproc/jpeg/chroma_subsample_gpu.cuh

+  __shared__ T luma_blk[4][luma_blk_h][luma_blk_w];
+  __shared__ T cb_blk[4][8][8];
+  __shared__ T cr_blk[4][8][8];
+
+  int blk_idx = threadIdx.x / 8;
+  int local_x = threadIdx.x % 8;
+  int local_y = threadIdx.y;


I think that having 8x8 blocks for both luma and chroma may simplify the DCT & quantization steps.

Suggested change

__shared__ T luma_blk[4][luma_blk_h][luma_blk_w];

__shared__ T cb_blk[4][8][8];

__shared__ T cr_blk[4][8][8];

int blk_idx = threadIdx.x / 8;

int local_x = threadIdx.x % 8;

int local_y = threadIdx.y;

__shared__ float Cb[2][4][8][9]; // yes, 9 - to reduce bank conflicts!

__shared__ float Cr[2][4][8][9];

__shared__ float Y[2 << vert_subsample][4 << horz_subsample][8][9];

int cx = threadIdx.x;

int cy = threadIdx.y;

chroma_x = cx & 7;

chroma_y = cy & 7;

chroma_bx = cx >> 3;

chroma_by = cy >> 3;

int lx = threadIdx.x << horz_subsample;

int ly = threadIdx.y << vert_subsample;

luma_x = lx & 7;

luma_y = ly & 7;

luma_bx = lx >> 3;

luma_by = ly >> 3;

With 16x32 block size we'd have
Cr: 2 * 4 * 8 * 9 * 4 bytes = 2304 B
Cb: 2 * 4 * 8 * 9 * 4 bytes = 2304 B
Y: 4 * 8 * 8 * 9 * 4 bytes = 9216 B
total 13824 B - well within acceptable limits, we still have quite a lot of shared mem for DCT step

Signed-off-by: Joaquin Anton <janton@nvidia.com>

mzient · 2021-03-15T12:37:17Z

dali/kernels/imgproc/jpeg/jpeg_distortion_gpu.cuh

+      if (horz_subsample && vert_subsample) {
+        luma[luma_y][luma_x + 1] = ycbcr.luma[1];
+        luma[luma_y + 1][luma_x] = ycbcr.luma[2];
+        luma[luma_y + 1][luma_x + 1] = ycbcr.luma[3];
+      } else if (horz_subsample) {
+        luma[luma_y][luma_x + 1] = ycbcr.luma[1];
+      } else if (vert_subsample) {
+        luma[luma_y + 1][luma_x] = ycbcr.luma[1];
+      }


This would be shorter - and hopefully the compiler would be smart enough to unroll it.

Suggested change

if (horz_subsample && vert_subsample) {

luma[luma_y][luma_x + 1] = ycbcr.luma[1];

luma[luma_y + 1][luma_x] = ycbcr.luma[2];

luma[luma_y + 1][luma_x + 1] = ycbcr.luma[3];

} else if (horz_subsample) {

luma[luma_y][luma_x + 1] = ycbcr.luma[1];

} else if (vert_subsample) {

luma[luma_y + 1][luma_x] = ycbcr.luma[1];

}

for (int i = 0, k = 0; i < vert_subsample+1; i++)

for (int j = 0; j < horz_subsample+1; j++, k++)

luma[luma_y + i][luma_x + j] = ycbcr.luma[k];

mzient · 2021-03-15T12:37:43Z

dali/kernels/imgproc/jpeg/jpeg_distortion_gpu.cuh

+      ivec2 offset{x, y};
+
+      auto ycbcr = rgb_to_ycbcr_subsampled<horz_subsample, vert_subsample, T>(offset, in);
+      luma[luma_y][luma_x] = ycbcr.luma[0];


Move this line closer to other luma writes - or follow the suggestion below.

Signed-off-by: Joaquin Anton <janton@nvidia.com>

awolant · 2021-03-15T16:23:28Z

dali/kernels/imgproc/jpeg/jpeg_distortion_gpu.cuh

+  int chroma_x = threadIdx.x & 7;  // % 8
+  int chroma_y = threadIdx.y & 7;  // % 8


I checked and for both & 7 and % 8 NVCC generates the same code:

mov.u32 %r3, %tid.x; and.b32 %r4, %r3, 7;

Maybe we should stick to what is intended here then? Unless there is something here I'm not getting.

This goes for every time this trick was used.

@mzient This was originally your suggestion. What do you think?

The same for / 8 vs >> 3. For both it is:

mov.u32 %r3, %tid.x; shr.u32 %r4, %r3, 3;

My original suggestion was inside a function where these values were passed as signed integers. For signed integers, the division rounds towards zero and gives a negative remainder. It's not only slower (additional math), but also potentially dangerous.

jantonguirao · 2021-03-15T16:49:58Z

!build

dali-automaton · 2021-03-15T16:55:59Z

CI MESSAGE: [2167738]: BUILD STARTED

dali-automaton · 2021-03-15T17:45:17Z

CI MESSAGE: [2167738]: BUILD FAILED

Signed-off-by: Joaquin Anton <janton@nvidia.com>

jantonguirao · 2021-03-15T18:23:26Z

!build

dali-automaton · 2021-03-15T18:26:08Z

CI MESSAGE: [2168116]: BUILD STARTED

dali-automaton · 2021-03-15T20:07:17Z

CI MESSAGE: [2168116]: BUILD PASSED

jantonguirao assigned mzient and awolant Mar 10, 2021

JanuszL reviewed Mar 10, 2021

View reviewed changes

jantonguirao force-pushed the chroma_subsample branch from 8914fd2 to 6cac3ab Compare March 10, 2021 16:14

jantonguirao changed the title ~~[WIP] Chroma subsampling kernel~~ Add JPEG color converssion and chroma subsampling kernel Mar 10, 2021

jantonguirao marked this pull request as ready for review March 10, 2021 16:17

jantonguirao changed the title ~~Add JPEG color converssion and chroma subsampling kernel~~ Add JPEG color conversion and chroma subsampling kernel Mar 10, 2021

mzient reviewed Mar 10, 2021

View reviewed changes

jantonguirao force-pushed the chroma_subsample branch 2 times, most recently from fe4eddd to 2fd52c8 Compare March 10, 2021 16:27

jantonguirao added 6 commits March 10, 2021 17:45

[WIP] Chroma subsampling kernel

0ae43e8

Signed-off-by: Joaquin Anton <janton@nvidia.com>

WIP

dad289e

Signed-off-by: Joaquin Anton <janton@nvidia.com>

Tests passing

3d8fa9c

Signed-off-by: Joaquin Anton <janton@nvidia.com>

Fix for Horizontal only and Vertical only subsamplings

421f665

Signed-off-by: Joaquin Anton <janton@nvidia.com>

Add perf measurement

388dad9

Signed-off-by: Joaquin Anton <janton@nvidia.com>

Code review fixes

b6b8926

Signed-off-by: Joaquin Anton <janton@nvidia.com>

jantonguirao force-pushed the chroma_subsample branch from 2fd52c8 to b6b8926 Compare March 10, 2021 16:45

JanuszL approved these changes Mar 10, 2021

View reviewed changes

Code review fixes

3180b51

Signed-off-by: Joaquin Anton <janton@nvidia.com>

jantonguirao force-pushed the chroma_subsample branch from b6b8926 to 3180b51 Compare March 10, 2021 17:02

mzient and others added 4 commits March 10, 2021 19:09

Use average instead of sampler.

3661483

Signed-off-by: Michał Zientkiewicz <mzient@gmail.com>

Merge.

9c328c3

Signed-off-by: Michał Zientkiewicz <mzient@gmail.com>

Adjust block size in perf run.

0f85ec2

Signed-off-by: Michał Zientkiewicz <mzient@gmail.com>

ChromaSubsampleDistortion kernel

a79a701

Signed-off-by: Joaquin Anton <janton@nvidia.com>

JanuszL reviewed Mar 11, 2021

View reviewed changes

dali/kernels/imgproc/jpeg/chroma_subsample_gpu.cuh Outdated Show resolved Hide resolved

JanuszL reviewed Mar 11, 2021

View reviewed changes

dali/kernels/imgproc/jpeg/chroma_subsample_gpu.cuh Outdated Show resolved Hide resolved

JanuszL reviewed Mar 11, 2021

View reviewed changes

jantonguirao force-pushed the chroma_subsample branch from 7ced1fc to 6416ca6 Compare March 12, 2021 12:05

mzient reviewed Mar 12, 2021

View reviewed changes

Working chroma subsampling artifact kernel

debc3bb

Signed-off-by: Joaquin Anton <janton@nvidia.com>

jantonguirao force-pushed the chroma_subsample branch from 6416ca6 to debc3bb Compare March 12, 2021 15:26

jantonguirao added 2 commits March 12, 2021 17:06

Rename to jpeg_artifacts_gpu.cuh

ba91cf4

Signed-off-by: Joaquin Anton <janton@nvidia.com>

Create a dedicated chroma subsample kernel

f0f6650

Signed-off-by: Joaquin Anton <janton@nvidia.com>

jantonguirao force-pushed the chroma_subsample branch from e41d1b6 to 3a2eac4 Compare March 15, 2021 12:11

Code review fixes

81be4a9

Signed-off-by: Joaquin Anton <janton@nvidia.com>

jantonguirao force-pushed the chroma_subsample branch from 3a2eac4 to 81be4a9 Compare March 15, 2021 12:19

mzient reviewed Mar 15, 2021

View reviewed changes

Code review fixes

10e0dbc

Signed-off-by: Joaquin Anton <janton@nvidia.com>

mzient approved these changes Mar 15, 2021

View reviewed changes

awolant approved these changes Mar 15, 2021

View reviewed changes

Code review fixes

e426f31

Signed-off-by: Joaquin Anton <janton@nvidia.com>

jantonguirao force-pushed the chroma_subsample branch from 6d4cd64 to e426f31 Compare March 15, 2021 18:22

jantonguirao merged commit 1ab45a2 into NVIDIA:master Mar 15, 2021

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add JPEG color conversion and chroma subsampling kernel #2771

Add JPEG color conversion and chroma subsampling kernel #2771

jantonguirao commented Mar 8, 2021 •

edited

Loading

JanuszL Mar 10, 2021

jantonguirao Mar 10, 2021

mzient Mar 10, 2021

jantonguirao Mar 10, 2021

JanuszL Mar 11, 2021

mzient Mar 12, 2021

mzient Mar 12, 2021

mzient Mar 12, 2021

mzient Mar 12, 2021

jantonguirao Mar 12, 2021

mzient Mar 12, 2021 •

edited

Loading

mzient Mar 15, 2021

mzient Mar 15, 2021

awolant Mar 15, 2021

jantonguirao Mar 15, 2021

awolant Mar 15, 2021

mzient Mar 15, 2021 •

edited

Loading

jantonguirao commented Mar 15, 2021

dali-automaton commented Mar 15, 2021

dali-automaton commented Mar 15, 2021

jantonguirao commented Mar 15, 2021

dali-automaton commented Mar 15, 2021

dali-automaton commented Mar 15, 2021

	template <bool horz_subsample, bool vert_subsample, typename T = uint8_t, int in_nchannels = 3>
	template <bool horz_subsample, bool vert_subsample, typename T = uint8_t>

	__shared__ T luma_blk[4][luma_blk_h][luma_blk_w];
	__shared__ T luma_blk[4][luma_blk_h][luma_blk_strides[1]];

-  __shared__ T luma_blk[4][luma_blk_h][luma_blk_w];
-  __shared__ T cb_blk[4][8][8];
-  __shared__ T cr_blk[4][8][8];
-  int blk_idx = threadIdx.x / 8;
-  int local_x = threadIdx.x % 8;
-  int local_y = threadIdx.y;
+  __shared__ float Cb[2][4][8][9];  // yes, 9 - to reduce bank conflicts!
+  __shared__ float Cr[2][4][8][9];
+  __shared__ float Y[2 << vert_subsample][4 << horz_subsample][8][9];
+  int cx = threadIdx.x;
+  int cy = threadIdx.y;
+  chroma_x = cx & 7;
+  chroma_y = cy & 7;
+  chroma_bx = cx >> 3;
+  chroma_by = cy >> 3;
+  int lx = threadIdx.x << horz_subsample;
+  int ly = threadIdx.y << vert_subsample;
+  luma_x = lx & 7;
+  luma_y = ly & 7;
+  luma_bx = lx >> 3;
+  luma_by = ly >> 3;

		int chroma_x = threadIdx.x & 7; // % 8
		int chroma_y = threadIdx.y & 7; // % 8

Add JPEG color conversion and chroma subsampling kernel #2771

Add JPEG color conversion and chroma subsampling kernel #2771

Conversation

jantonguirao commented Mar 8, 2021 • edited Loading

Why we need this PR?

What happened in this PR?

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

mzient Mar 12, 2021 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

mzient Mar 15, 2021 • edited Loading

Choose a reason for hiding this comment

jantonguirao commented Mar 15, 2021

dali-automaton commented Mar 15, 2021

dali-automaton commented Mar 15, 2021

jantonguirao commented Mar 15, 2021

dali-automaton commented Mar 15, 2021

dali-automaton commented Mar 15, 2021

jantonguirao commented Mar 8, 2021 •

edited

Loading

mzient Mar 12, 2021 •

edited

Loading

mzient Mar 15, 2021 •

edited

Loading