-+
-+#include "rpi_mailbox.h"
-+#include "rpi_qpu.h"
-+#include "rpi_shader.h"
-+#include "rpi_hevc_transform8.h"
-+#include "rpi_hevc_transform10.h"
-+#include "libavutil/rpi_sand_fns.h"
-+
-+#pragma GCC diagnostic push
-+// Many many redundant decls in the header files
-+#pragma GCC diagnostic ignored "-Wredundant-decls"
-+#include "interface/vmcs_host/vc_vchi_gpuserv.h"
-+#pragma GCC diagnostic pop
-+
-+// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No)
-+#define RPI_TRACE_TIME_VPU_QPU_WAIT 0
-+
-+// Add profile flags to all QPU requests - generates output in "vcdbg log msg"
-+// Beware this is expensive and will probably throw off all other timing by >10%
-+#define RPI_TRACE_QPU_PROFILE_ALL 0
-+
-+// QPU "noflush" flags
-+// a mixture of flushing & profiling
-+
-+#define QPU_FLAGS_NO_FLUSH_VPU 1 // If unset VPU cache will be flushed
-+#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2 // Clear & Enable detailed QPU profiling registers
-+#define QPU_FLAGS_PROF_OUTPUT_COUNTS 4 // Print the results
-+#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independent of the profiling
-+#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed)
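-+// e.g. the profiling path below combines
-+// QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS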
-+
-+#define vcos_verify_ge0(x) ((x)>=0)
-+
-+// Sizes in 32-bit words
-+#define QPU_CODE_SIZE 4098
-+#define VPU_CODE_SIZE 2048
-+
-+static const short rpi_transMatrix2even[32][16] = { // Even rows first
-+{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
-+{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90},
-+{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89},
-+{87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87},
-+{83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83},
-+{80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80},
-+{75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75},
-+{70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70},
-+{64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64},
-+{57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57},
-+{50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50},
-+{43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43},
-+{36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36},
-+{25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25},
-+{18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18},
-+{ 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9},
-+// Odd rows
-+{90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4},
-+{90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
-+{88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22},
-+{85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31},
-+{82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38},
-+{78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46},
-+{73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54},
-+{67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61},
-+{61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67},
-+{54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73},
-+{46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78},
-+{38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82},
-+{31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85},
-+{22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88},
-+{13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90},
-+{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90}
-+};
-+
-+// Code/constants on GPU
-+struct GPU
-+{
-+ unsigned int qpu_code[QPU_CODE_SIZE];
-+ unsigned int vpu_code8[VPU_CODE_SIZE];
-+ unsigned int vpu_code10[VPU_CODE_SIZE];
-+ short transMatrix2even[16*16*2];
-+};
-+
-+#define CFE_ENTS_PER_A 8
-+// If we have a sliced frame 2048 wide @ 64 per slice then there are 32 slices
-+// in a line & we want to flush luma + chroma + a couple of bits so ents ~ 70
-+// allow 128
-+#define CFE_ENT_COUNT 128
-+#define CFE_A_COUNT (CFE_ENT_COUNT / CFE_ENTS_PER_A)
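-+// (i.e. 32 slices * 2 planes = 64 ents, plus a little spare ~ 70, rounded up to 128)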
-+
-+struct rpi_cache_flush_env_s {
-+// unsigned int n;
-+// struct vcsm_user_clean_invalid_s a[CFE_A_COUNT];
-+ struct vcsm_user_clean_invalid2_s v;
-+};
-+
-+#define WAIT_COUNT_MAX 16
-+
-+typedef struct trace_time_one_s
-+{
-+ int count;
-+ int64_t start[WAIT_COUNT_MAX];
-+ int64_t total[WAIT_COUNT_MAX];
-+} trace_time_one_t;
-+
-+typedef struct trace_time_wait_s
-+{
-+ unsigned int jcount;
-+ int64_t start0;
-+ int64_t last_update;
-+ trace_time_one_t active;
-+ trace_time_one_t wait;
-+} trace_time_wait_t;
-+
-+typedef struct vq_wait_s
-+{
-+ sem_t sem;
-+ struct vq_wait_s * next;
-+} vq_wait_t;
-+
-+#define VQ_WAIT_POOL_SIZE 16
-+typedef struct vq_wait_pool_s
-+{
-+ vq_wait_t * head;
-+ vq_wait_t pool[VQ_WAIT_POOL_SIZE];
-+} vq_wait_pool_t;
-+
-+static void vq_wait_pool_init(vq_wait_pool_t * const pool);
-+static void vq_wait_pool_deinit(vq_wait_pool_t * const pool);
-+
-+typedef struct gpu_env_s
-+{
-+ int open_count;
-+ int init_count;
-+ int mb;
-+ int vpu_i_cache_flushed;
-+ GPU_MEM_PTR_T code_gm_ptr;
-+ vq_wait_pool_t wait_pool;
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+ trace_time_wait_t ttw;
-+#endif
-+} gpu_env_t;
-+
-+// Stop more than one thread trying to allocate memory or use the processing resources at once
-+static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
-+static gpu_env_t * gpu = NULL;
-+
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+
-+static int64_t ns_time(void)
-+{
-+ struct timespec ts;
-+ clock_gettime(CLOCK_MONOTONIC, &ts);
-+ return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec;
-+}
-+
-+
-+#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000
-+
-+#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U)
-+#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000)
-+#define T_ARG(t) T_SEC(t), T_MS(t)
-+#define T_FMT "%u.%03u"
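-+// e.g. t = 2345678901ns formats as "2.345" (seconds.milliseconds)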
-+
-+static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix)
-+{
-+ // Update totals for levels that are still pending
-+ for (int i = 0; i < tto->count; ++i) {
-+ tto->total[i] += now - tto->start[i];
-+ tto->start[i] = now;
-+ }
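-+ // total[0] holds time with at least one job outstanding, total[1..3] time
-+ // with 2..4 outstanding, so idle time is the elapsed time minus total[0]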
-+
-+ printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n",
-+ prefix,
-+ T_ARG(now - start0 - tto->total[0]),
-+ T_ARG(tto->total[0]),
-+ T_ARG(tto->total[1]),
-+ T_ARG(tto->total[2]),
-+ T_ARG(tto->total[3]));
-+}
-+
-+
-+static void tto_start(trace_time_one_t * const tto, const int64_t now)
-+{
-+ av_assert0(tto->count < WAIT_COUNT_MAX);
-+ tto->start[tto->count++] = now;
-+}
-+
-+static void tto_end(trace_time_one_t * const tto, const int64_t now)
-+{
-+ const int n = --tto->count;
-+ av_assert0(n >= 0);
-+ tto->total[n] += now - tto->start[n];
-+}
-+
-+static void ttw_print(trace_time_wait_t * const ttw, const int64_t now)
-+{
-+ printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0));
-+ tto_print(&ttw->active, now, ttw->start0, "Active");
-+ tto_print(&ttw->wait, now, ttw->start0, " Wait");
-+}
-+
-+#endif
-+
-+// GPU memory alloc fns (internal)
-+
-+// GPU_MEM_PTR_T alloc fns
-+static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
-+ p->numbytes = (numbytes + 255) & ~255; // Round up
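-+ // (e.g. numbytes = 1000 rounds to 1024, the next 256-byte multiple)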
-+ p->vcsm_handle = vcsm_malloc_cache(p->numbytes, VCSM_CACHE_TYPE_HOST | 0x80, (char *)"Video Frame" );
-+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
-+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
-+ av_assert0(p->vcsm_handle);
-+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-+ av_assert0(p->vc_handle);
-+ p->arm = vcsm_lock(p->vcsm_handle);
-+ av_assert0(p->arm);
-+ p->vc = mbox_mem_lock(mb, p->vc_handle);
-+ av_assert0(p->vc);
-+// printf("***** %s, %d\n", __func__, numbytes);
-+
-+ return 0;
-+}
-+
-+static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
-+ p->numbytes = numbytes;
-+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE | 0x80, (char *)"Video Frame" );
-+ av_assert0(p->vcsm_handle);
-+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-+ av_assert0(p->vc_handle);
-+ p->arm = vcsm_lock(p->vcsm_handle);
-+ av_assert0(p->arm);
-+ p->vc = mbox_mem_lock(mb, p->vc_handle);
-+ av_assert0(p->vc);
-+// printf("***** %s, %d\n", __func__, numbytes);
-+ return 0;
-+}
-+
-+static void gpu_free_internal(const int mb, GPU_MEM_PTR_T * const p) {
-+ mbox_mem_unlock(mb, p->vc_handle);
-+ vcsm_unlock_ptr(p->arm);
-+ vcsm_free(p->vcsm_handle);
-+ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again
-+// printf("***** %s\n", __func__);
-+}
-+
-+
-+// GPU init, free, lock, unlock
-+
-+static void gpu_term(void)
-+{
-+ gpu_env_t * const ge = gpu;
-+
-+ // We have to hope that everything has terminated...
-+ gpu = NULL;
-+
-+ vc_gpuserv_deinit();
-+
-+ gpu_free_internal(ge->mb, &ge->code_gm_ptr);
-+
-+ vcsm_exit();
-+
-+ mbox_close(ge->mb);
-+
-+ vq_wait_pool_deinit(&ge->wait_pool);
-+
-+ free(ge);
-+}
-+
-+
-+// Connect to QPU, returns 0 on success.
-+static int gpu_init(gpu_env_t ** const gpu) {
-+ volatile struct GPU* ptr;
-+ gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t));
-+ *gpu = NULL;
-+
-+ if (ge == NULL)
-+ return -1;
-+
-+ if ((ge->mb = mbox_open()) < 0) {
-+ free(ge);
-+ return -1;
-+ }
-+
-+ vq_wait_pool_init(&ge->wait_pool);
-+
-+ vcsm_init();
-+
-+ gpu_malloc_uncached_internal(ge->mb, sizeof(struct GPU), &ge->code_gm_ptr);
-+ ptr = (volatile struct GPU*)ge->code_gm_ptr.arm;
-+
-+ // Zero everything so we have zeros between the code bits
-+ memset((void *)ptr, 0, sizeof(*ptr));
-+
-+ // Now copy over the QPU code into GPU memory
-+ {
-+ int num_bytes = (char *)mc_end - (char *)rpi_shader;
-+ av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
-+ memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
-+ }
-+ // And the VPU code
-+ {
-+ int num_bytes = sizeof(rpi_hevc_transform8);
-+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
-+ memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes);
-+ }
-+ {
-+ int num_bytes = sizeof(rpi_hevc_transform10);
-+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
-+ memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes);
-+ }
-+ // And the transform coefficients
-+ memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
-+
-+ *gpu = ge;
-+ return 0;
-+}
-+
-+
-+
-+static void gpu_unlock(void) {
-+ pthread_mutex_unlock(&gpu_mutex);
-+}
-+
-+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
-+static gpu_env_t * gpu_lock(void) {
-+ pthread_mutex_lock(&gpu_mutex);
-+
-+ av_assert0(gpu != NULL);
-+ return gpu;
-+}
-+
-+static gpu_env_t * gpu_lock_ref(void)
-+{
-+ pthread_mutex_lock(&gpu_mutex);
-+
-+ if (gpu == NULL) {
-+ int rv = gpu_init(&gpu);
-+ if (rv != 0) {
-+ gpu_unlock();
-+ return NULL;
-+ }
-+ }
-+
-+ ++gpu->open_count;
-+ return gpu;
-+}
-+
-+static void gpu_unlock_unref(gpu_env_t * const ge)
-+{
-+ if (--ge->open_count == 0)
-+ gpu_term();
-+
-+ gpu_unlock();
-+}
-+
-+static inline gpu_env_t * gpu_ptr(void)
-+{
-+ av_assert0(gpu != NULL);
-+ return gpu;
-+}
-+
-+// Public gpu fns
-+
-+// Allocate memory on GPU
-+// Fills in structure containing ARM pointer, videocore handle, videocore memory address, numbytes
-+// Returns 0 on success.
-+// This allocates memory that will not be cached in ARM's data cache.
-+// Therefore safe to use without data cache flushing.
-+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
-+{
-+ int r;
-+ gpu_env_t * const ge = gpu_lock_ref();
-+ if (ge == NULL)
-+ return -1;
-+ r = gpu_malloc_uncached_internal(ge->mb, numbytes, p);
-+ gpu_unlock();
-+ return r;
-+}
-+
-+// This allocates data that will be
-+// Cached in ARM L2
-+// Uncached in VPU L2
-+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
-+{
-+ int r;
-+ gpu_env_t * const ge = gpu_lock_ref();
-+ if (ge == NULL)
-+ return -1;
-+ r = gpu_malloc_cached_internal(ge->mb, numbytes, p);
-+ gpu_unlock();
-+ return r;
-+}
-+
-+void gpu_free(GPU_MEM_PTR_T * const p) {
-+ gpu_env_t * const ge = gpu_lock();
-+ gpu_free_internal(ge->mb, p);
-+ gpu_unlock_unref(ge);
-+}
-+
-+unsigned int vpu_get_fn(const unsigned int bit_depth) {
-+ // Make sure that the gpu is initialized
-+ av_assert0(gpu != NULL);
-+ switch (bit_depth){
-+ case 8:
-+ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8);
-+ case 10:
-+ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10);
-+ default:
-+ av_assert0(0);
-+ }
-+ return 0;
-+}
-+
-+unsigned int vpu_get_constants(void) {
-+ av_assert0(gpu != NULL);
-+ return gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even);
-+}
-+
-+int gpu_get_mailbox(void)
-+{
-+ av_assert0(gpu);
-+ return gpu->mb;
-+}
-+
-+void gpu_ref(void)
-+{
-+ gpu_lock_ref();
-+ gpu_unlock();
-+}
-+
-+void gpu_unref(void)
-+{
-+ gpu_env_t * const ge = gpu_lock();
-+ gpu_unlock_unref(ge);
-+}
-+
-+// ----------------------------------------------------------------------------
-+//
-+// Cache flush functions
-+
-+#define CACHE_EL_MAX 16
-+
-+rpi_cache_flush_env_t * rpi_cache_flush_init(void)
-+{
-+ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t) +
-+ sizeof(struct vcsm_user_clean_invalid2_block_s) * CACHE_EL_MAX);
-+ if (rfe == NULL)
-+ return NULL;
-+
-+ rfe->v.op_count = 0;
-+ return rfe;
-+}
-+
-+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe)
-+{
-+ if (rfe != NULL)
-+ free(rfe);
-+}
-+
-+int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe)
-+{
-+ int rc = 0;
-+ if (rfe->v.op_count != 0) {
-+ if (vcsm_clean_invalid2(&rfe->v) != 0)
-+ {
-+ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid2 failed: errno=%d\n", errno);
-+ rc = -1;
-+ }
-+ rfe->v.op_count = 0;
-+ }
-+ return rc;
-+}
-+
-+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe)
-+{
-+ int rc = rpi_cache_flush_execute(rfe);
-+
-+ free(rfe);
-+ return rc;
-+}
-+
-+inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
-+ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride)
-+{
-+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
-+
-+ av_assert0(rfe->v.op_count <= CACHE_EL_MAX);
-+
-+ b->invalidate_mode = mode;
-+ b->block_count = blocks;
-+ b->start_address = gm->arm + offset0;
-+ b->block_size = block_size;
-+ b->inter_block_stride = block_stride;
-+}
-+
-+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
-+ const unsigned int offset, const unsigned int size)
-+{
-+ // Deal with empty pointer trivially
-+ if (gm == NULL || size == 0)
-+ return;
-+
-+ av_assert0(offset <= gm->numbytes);
-+ av_assert0(size <= gm->numbytes);
-+ av_assert0(offset + size <= gm->numbytes);
-+
-+ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0);
-+}
-+
-+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode)
-+{
-+ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0);
-+}
-+
-+
-+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode)
-+{
-+#if !RPI_ONE_BUF
-+#error Fixme! (NIF)
-+#endif
-+ if (gpu_is_buf1(frame)) {
-+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode);
-+ }
-+ else
-+ {
-+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode);
-+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode);
-+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode);
-+ }
-+}
-+
-+// Flush an area of a frame
-+// Width, height, x0, y0 in luma pels
-+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode,
-+ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
-+ const unsigned int uv_shift, const int do_luma, const int do_chroma)
-+{
-+ const unsigned int y_offset = frame->linesize[0] * y0;
-+ const unsigned int y_size = frame->linesize[0] * height;
-+ // Round UV up/down to get everything
-+ const unsigned int uv_rnd = (1U << uv_shift) >> 1;
-+ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift);
-+ const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset;
-+
-+#if 0
-+ // *** frame->height is cropped height so not good
-+ // As all unsigned they will also reject -ve
-+ // Test individually as well as added to reject overflow
-+ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped
-+ av_assert0(n <= (unsigned int)frame->height);
-+ av_assert0(start_line + n <= (unsigned int)frame->height);
-+#endif
-+
-+ if (!gpu_is_buf1(frame))
-+ {
-+ if (do_luma) {
-+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size);
-+ }
-+ if (do_chroma) {
-+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size);
-+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size);
-+ }
-+ }
-+ else if (!av_rpi_is_sand_frame(frame))
-+ {
-+ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
-+ if (do_luma) {
-+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size);
-+ }
-+ if (do_chroma) {
-+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size);
-+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size);
-+ }
-+ }
-+ else
-+ {
-+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
-+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
-+ const unsigned int xshl = av_rpi_sand_frame_xshl(frame);
-+ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1);
-+ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C
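-+ // e.g. with stride1 = 128, xshl = 0, x0 = 70, width = 100: xleft = 0 and
-+ // block_count = (170 + 127) / 128 = 2 slices to flush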
-+ av_assert0(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX);
-+
-+ if (do_chroma)
-+ {
-+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
-+ b->invalidate_mode = mode;
-+ b->block_count = block_count;
-+ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1);
-+ b->block_size = uv_size;
-+ b->inter_block_stride = stride1 * stride2;
-+ }
-+ if (do_luma)
-+ {
-+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
-+ b->invalidate_mode = mode;
-+ b->block_count = block_count;
-+ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0);
-+ b->block_size = y_size;
-+ b->inter_block_stride = stride1 * stride2;
-+ }
-+ }
-+}
-+
-+// Call this to clean and invalidate a region of memory
-+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode)
-+{
-+ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init();
-+ if (rfe == NULL)
-+ return;
-+ rpi_cache_flush_add_gm_ptr(rfe, p, mode);
-+ rpi_cache_flush_finish(rfe);
-+}
-+
-+
-+// ----------------------------------------------------------------------------
-+
-+
-+// Wait abstractions - mostly so we can easily add profile code
-+static void vq_wait_pool_init(vq_wait_pool_t * const wp)
-+{
-+ unsigned int i;
-+ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
-+ sem_init(&wp->pool[i].sem, 0, 0);
-+ wp->pool[i].next = wp->pool + i + 1;
-+ }
-+ wp->head = wp->pool + 0;
-+ wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL;
-+}
-+
-+static void vq_wait_pool_deinit(vq_wait_pool_t * const wp)
-+{
-+ unsigned int i;
-+ wp->head = NULL;
-+ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
-+ sem_destroy(&wp->pool[i].sem);
-+ wp->pool[i].next = NULL;
-+ }
-+}
-+
-+
-+// Wait objects are pooled so that we don't pay for sem_init on every job
-+static vq_wait_t * vq_wait_new(void)
-+{
-+ gpu_env_t * const ge = gpu_lock_ref();
-+ vq_wait_t * const wait = ge->wait_pool.head;
-+ ge->wait_pool.head = wait->next;
-+ wait->next = NULL;
-+
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+ tto_start(&ge->ttw.active, ns_time());
-+#endif
-+
-+ gpu_unlock();
-+ return wait;
-+}
-+
-+static void vq_wait_delete(vq_wait_t * const wait)
-+{
-+ gpu_env_t * const ge = gpu_lock();
-+ wait->next = ge->wait_pool.head;
-+ ge->wait_pool.head = wait;
-+
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+ {
-+ trace_time_wait_t * const ttw = &ge->ttw;
-+ const int64_t now = ns_time();
-+ ++ttw->jcount;
-+ tto_end(&ttw->wait, now);
-+
-+ if (ttw->start0 == 0)
-+ {
-+ ttw->start0 = ttw->active.start[0];
-+ ttw->last_update = ttw->start0;
-+ }
-+ if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD)
-+ {
-+ ttw->last_update += WAIT_TIME_PRINT_PERIOD;
-+ ttw_print(ttw, now);
-+ }
-+ }
-+#endif
-+ gpu_unlock_unref(ge);
-+}
-+
-+static void vq_wait_wait(vq_wait_t * const wait)
-+{
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+ {
-+ const int64_t now = ns_time();
-+ gpu_env_t * const ge = gpu_lock();
-+ tto_start(&ge->ttw.wait, now);
-+ gpu_unlock();
-+ }
-+#endif
-+
-+ while (sem_wait(&wait->sem) == -1 && errno == EINTR)
-+ /* loop */;
-+}
-+
-+static void vq_wait_post(vq_wait_t * const wait)
-+{
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+ {
-+ gpu_env_t *const ge = gpu_lock();
-+ tto_end(&ge->ttw.active, ns_time());
-+ gpu_unlock();
-+ }
-+#endif
-+
-+ sem_post(&wait->sem);
-+}
-+
-+
-+
-+// Header comments were wrong for these two
-+#define VPU_QPU_MASK_QPU 1
-+#define VPU_QPU_MASK_VPU 2
-+
-+#define VPU_QPU_JOB_MAX 4
-+struct vpu_qpu_job_env_s
-+{
-+ unsigned int n;
-+ unsigned int mask;
-+ struct gpu_job_s j[VPU_QPU_JOB_MAX];
-+};
-+
-+typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
-+
-+vpu_qpu_job_env_t * vpu_qpu_job_new(void)
-+{
-+ vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t));
-+ return vqj;
-+}
-+
-+void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj)
-+{
-+ memset(vqj, 0, sizeof(*vqj));
-+ free(vqj);
-+}
-+
-+static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj)
-+{
-+ struct gpu_job_s * const j = vqj->j + vqj->n++;
-+ av_assert0(vqj->n <= VPU_QPU_JOB_MAX);
-+ return j;
-+}
-+
-+void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code,
-+ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5)
-+{
-+ if (vpu_code != 0) {
-+ struct gpu_job_s *const j = new_job(vqj);
-+ vqj->mask |= VPU_QPU_MASK_VPU;
-+
-+ j->command = EXECUTE_VPU;
-+ // The bottom two bits of the execute address contain no-flush flags
-+ // b0 will flush the VPU I-cache if unset so we nearly always want that set
-+ // as we never reload code
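-+ // (i.e. after the first job has run, q[0] is always vpu_code | 1)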
-+ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed;
-+ j->u.v.q[1] = r0;
-+ j->u.v.q[2] = r1;
-+ j->u.v.q[3] = r2;
-+ j->u.v.q[4] = r3;
-+ j->u.v.q[5] = r4;
-+ j->u.v.q[6] = r5;
-+ gpu->vpu_i_cache_flushed = 1;
-+ }
-+}
-+
-+// flags are QPU_FLAGS_xxx
-+void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail)
-+{
-+ if (n != 0) {
-+ struct gpu_job_s *const j = new_job(vqj);
-+ vqj->mask |= VPU_QPU_MASK_QPU;
-+
-+ j->command = EXECUTE_QPU;
-+ j->u.q.jobs = n;
-+#if RPI_TRACE_QPU_PROFILE_ALL
-+ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS;
-+#else
-+ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU;
-+#endif
-+ j->u.q.timeout = 5000;
-+ memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
-+ }
-+}
-+
-+// Convert callback to sem post
-+static void vpu_qpu_job_callback_wait(void * v)
-+{
-+ vq_wait_post(v);
-+}
-+
-+void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h)
-+{
-+ vq_wait_t * wait;
-+
-+ if (vqj->mask == 0) {
-+ *wait_h = NULL;
-+ return;
-+ }
-+
-+ // We are going to want a sync object
-+ wait = vq_wait_new();
-+
-+ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
-+ // If we only posted one thing or only QPU jobs
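-+ // e.g. a lone VPU job gets the callback attached to it directly; a VPU job
-+ // followed by a QPU job needs the extra EXECUTE_SYNC entry below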
-+ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
-+ {
-+ struct gpu_job_s * const j = vqj->j + (vqj->n - 1);
-+ av_assert0(j->callback.func == 0);
-+
-+ j->callback.func = vpu_qpu_job_callback_wait;
-+ j->callback.cookie = wait;
-+ }
-+ else
-+ {
-+ struct gpu_job_s *const j = new_job(vqj);
-+
-+ j->command = EXECUTE_SYNC;
-+ j->u.s.mask = vqj->mask;
-+ j->callback.func = vpu_qpu_job_callback_wait;
-+ j->callback.cookie = wait;
-+ }
-+
-+ vqj->mask = 0;
-+ *wait_h = wait;
-+}
-+
-+int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj)
-+{
-+ return vqj->n == 0 ? 0 : vc_gpuserv_execute_code(vqj->n, vqj->j);
-+}
-+
-+// Simple wrapper of start + delete
-+int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj)
-+{
-+ int rv;
-+ rv = vpu_qpu_job_start(vqj);
-+ vpu_qpu_job_delete(vqj);
-+ return rv;
-+}
-+
-+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
-+{
-+ if (wait_h != NULL)
-+ {
-+ vq_wait_t * const wait = *wait_h;
-+ if (wait != NULL) {
-+ *wait_h = NULL;
-+ vq_wait_wait(wait);
-+ vq_wait_delete(wait);
-+ }
-+ }
-+}
-+
-+int vpu_qpu_init(void)
-+{
-+ gpu_env_t * const ge = gpu_lock_ref();
-+ if (ge == NULL)
-+ return -1;
-+
-+ if (ge->init_count++ == 0)
-+ {
-+ vc_gpuserv_init();
-+ }
-+
-+ gpu_unlock();
-+ return 0;
-+}
-+
-+void vpu_qpu_term(void)
-+{
-+ gpu_env_t * const ge = gpu_lock();
-+
-+ if (--ge->init_count == 0) {
-+ vc_gpuserv_deinit();
-+
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+ ttw_print(&ge->ttw, ns_time());
-+#endif
-+ }
-+
-+ gpu_unlock_unref(ge);
-+}
-+
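-+// Translate a label in the ARM-side rpi_shader[] array into the VideoCore bus
-+// address of that code: VC base of the GPU struct + offset of qpu_code within
-+// it + byte offset of mc_fn from the start of the array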
-+uint32_t qpu_fn(const int * const mc_fn)
-+{
-+ return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)rpi_shader) + offsetof(struct GPU, qpu_code);
-+}
-+
-+
-+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth)
-+{
-+ // Dummy values we can catch with emulation
-+ qf->y_pxx = ~1U;
-+ qf->y_bxx = ~2U;
-+ qf->y_p00 = ~3U;
-+ qf->y_b00 = ~4U;
-+ qf->c_pxx = ~5U;
-+ qf->c_bxx = ~6U;
-+
-+ switch (bit_depth) {
-+ case 8:
-+ qf->y_pxx = qpu_fn(mc_filter_y_pxx);
-+ qf->y_bxx = qpu_fn(mc_filter_y_bxx);
-+ qf->y_p00 = qpu_fn(mc_filter_y_p00);
-+ qf->y_b00 = qpu_fn(mc_filter_y_b00);
-+ qf->c_pxx = qpu_fn(mc_filter_c_p);
-+ qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1);
-+ qf->c_bxx = qpu_fn(mc_filter_c_b);
-+ break;
-+ case 10:
-+ qf->c_pxx = qpu_fn(mc_filter_c10_p);
-+ qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1);
-+ qf->c_bxx = qpu_fn(mc_filter_c10_b);
-+ qf->y_pxx = qpu_fn(mc_filter_y10_pxx);
-+ qf->y_bxx = qpu_fn(mc_filter_y10_bxx);
-+ qf->y_p00 = qpu_fn(mc_filter_y10_p00);
-+ qf->y_b00 = qpu_fn(mc_filter_y10_b00);
-+ break;
-+ default:
-+ return -1;
-+ }
-+ return 0;
-+}
-+
-+#endif // RPI
-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-new file mode 100644
-index 0000000000..9389047f8e
---- /dev/null
-+++ b/libavcodec/rpi_qpu.h
-@@ -0,0 +1,208 @@
-+#ifndef RPI_QPU_H
-+#define RPI_QPU_H
-+
-+#define RPI_ONE_BUF 1
-+
-+typedef struct gpu_mem_ptr_s {
-+ unsigned char *arm; // Pointer to memory mapped on ARM side
-+ int vc_handle; // Videocore handle of relocatable memory
-+ int vcsm_handle; // Handle for use by VCSM
-+ int vc; // Address for use in GPU code
-+ int numbytes; // Size of memory block
-+} GPU_MEM_PTR_T;
-+
-+// General GPU functions
-+extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
-+extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
-+extern void gpu_free(GPU_MEM_PTR_T * const p);
-+
-+#include "libavutil/frame.h"
-+#if !RPI_ONE_BUF
-+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
-+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[0]);
-+ return p->vc;
-+}
-+
-+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
-+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-+ return p->vc;
-+}
-+
-+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
-+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[2]);
-+ return p->vc;
-+}
-+
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
-+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[0]);
-+}
-+
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
-+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[1]);
-+}
-+
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
-+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[2]);
-+}
-+
-+#else
-+
-+static inline int gpu_is_buf1(const AVFrame * const frame)
-+{
-+ return frame->buf[1] == NULL;
-+}
-+
-+static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame)
-+{
-+ return av_buffer_get_opaque(frame->buf[0]);
-+}
-+
-+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n)
-+{
-+ return av_buffer_pool_opaque(frame->buf[n]);
-+}
-+
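-+// Convert an ARM-side plane pointer back to a VideoCore bus address by adding
-+// the plane's offset within the allocation to the allocation's vc address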
-+static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n)
-+{
-+ const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n);
-+ return gm->vc + (frame->data[n] - gm->arm);
-+}
-+
-+
-+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
-+ return get_vc_address3(frame, 0);
-+}
-+
-+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
-+ return get_vc_address3(frame, 1);
-+}
-+
-+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
-+ return get_vc_address3(frame, 2);
-+}
-+
-+#if 0
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
-+ if (gpu_is_buf1(frame))
-+ {
-+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
-+ g.numbytes = frame->data[1] - frame->data[0];
-+ return g;
-+ }
-+ else
-+ return *gpu_buf3_gmem(frame, 0);
-+}
-+
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
-+ if (gpu_is_buf1(frame))
-+ {
-+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
-+ g.arm += frame->data[1] - frame->data[0];
-+ g.vc += frame->data[1] - frame->data[0];
-+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
-+ return g;
-+ }
-+ else
-+ return *gpu_buf3_gmem(frame, 1);
-+}
-+
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
-+ if (gpu_is_buf1(frame))
-+ {
-+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
-+ g.arm += frame->data[2] - frame->data[0];
-+ g.vc += frame->data[2] - frame->data[0];
-+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
-+ return g;
-+ }
-+ else
-+ return *gpu_buf3_gmem(frame, 2);
-+}
-+#endif
-+#endif
-+
-+// Cache flush stuff
-+
-+struct rpi_cache_flush_env_s;
-+typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t;
-+
-+rpi_cache_flush_env_t * rpi_cache_flush_init(void);
-+// Free env without flushing
-+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe);
-+// Do the accumulated flush & clear but do not free the env
-+int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe);
-+// Do the accumulated flush & free the env
-+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe);
-+
-+typedef enum
-+{
-+ RPI_CACHE_FLUSH_MODE_INVALIDATE = 1,
-+ RPI_CACHE_FLUSH_MODE_WRITEBACK = 2,
-+ RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3
-+} rpi_cache_flush_mode_t;
-+
-+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode);
-+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode,
-+ const unsigned int offset, const unsigned int size);
-+void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
-+ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride);
-+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode);
-+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode,
-+ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
-+ const unsigned int uv_shift, const int do_luma, const int do_chroma);
-+
-+// init, add, finish for one gm ptr
-+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode);
-+
-+
-+// QPU specific functions
-+
-+typedef struct HEVCRpiQpu {
-+ uint32_t c_pxx;
-+ uint32_t c_pxx_l1;
-+ uint32_t c_bxx;
-+ uint32_t y_pxx;
-+ uint32_t y_bxx;
-+ uint32_t y_p00;
-+ uint32_t y_b00;
-+} HEVCRpiQpu;
-+
-+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth);
-+
-+uint32_t qpu_fn(const int * const mc_fn);
-+
-+#define QPU_N_GRP 4
-+#define QPU_N_MAX 12
-+
-+#define QPU_MAIL_EL_VALS 2
-+
-+struct vpu_qpu_wait_s;
-+typedef struct vq_wait_s * vpu_qpu_wait_h;
-+
-+// VPU specific functions
-+
-+struct vpu_qpu_job_env_s;
-+typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h;
-+
-+vpu_qpu_job_h vpu_qpu_job_new(void);
-+void vpu_qpu_job_delete(const vpu_qpu_job_h vqj);
-+void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code,
-+ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);
-+void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail);
-+void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h);
-+int vpu_qpu_job_start(const vpu_qpu_job_h vqj);
-+int vpu_qpu_job_finish(const vpu_qpu_job_h vqj);
-+
-+extern unsigned int vpu_get_fn(const unsigned int bit_depth);
-+extern unsigned int vpu_get_constants(void);
-+
-+// Waits for the previously posted code to complete and nulls out *wait_h after use
-+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h);
-+int vpu_qpu_init(void);
-+void vpu_qpu_term(void);
-+
-+extern int gpu_get_mailbox(void);
-+void gpu_ref(void);
-+void gpu_unref(void);
-+
-+#endif
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-new file mode 100644
-index 0000000000..2c6541a8fb
---- /dev/null
-+++ b/libavcodec/rpi_shader.c
-@@ -0,0 +1,1570 @@
-+#include "rpi_shader.h"
-+
-+#ifdef _MSC_VER
-+ #include <stdint.h>
-+ /* cast through uintptr_t to avoid warnings */
-+ #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))
-+#else
-+ #define POINTER_TO_UINT(X) ((unsigned int)(X))
-+#endif
-+
-+#ifdef __cplusplus
-+extern "C" { /* the types are probably wrong... */
-+#endif
-+#ifdef __cplusplus
-+}
-+#endif
-+
-+#ifdef _MSC_VER
-+__declspec(align(8))
-+#elif defined(__GNUC__)
-+__attribute__((aligned(8)))
-+#endif
-+unsigned int rpi_shader[] = {
++unsigned int ff_hevc_rpi_shader[] = {
+// ::mc_setup_c_q0
+// ::mc_start
+/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
@@ -28869,2552 +17987,21710 @@ index 0000000000..2c6541a8fb
+/* [0x00002db0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_end
+};
-+#ifdef __HIGHC__
-+#pragma Align_to(8, rpi_shader)
++#ifdef __HIGHC__
++#pragma Align_to(8, ff_hevc_rpi_shader)
++#endif
+diff --git a/libavcodec/rpi_hevc_shader.h b/libavcodec/rpi_hevc_shader.h
+new file mode 100644
+index 0000000000..ddb351782d
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader.h
+@@ -0,0 +1,63 @@
++#ifndef rpi_hevc_shader_H
++#define rpi_hevc_shader_H
++
++extern unsigned int ff_hevc_rpi_shader[];
++
++#define mc_setup_c_q0 (ff_hevc_rpi_shader + 0)
++#define mc_start (ff_hevc_rpi_shader + 0)
++#define mc_setup_c_qn (ff_hevc_rpi_shader + 2)
++#define mc_filter_c_p (ff_hevc_rpi_shader + 142)
++#define mc_filter_c_p_l1 (ff_hevc_rpi_shader + 272)
++#define mc_filter_c_b (ff_hevc_rpi_shader + 402)
++#define mc_sync_q0 (ff_hevc_rpi_shader + 590)
++#define mc_sync_q1 (ff_hevc_rpi_shader + 608)
++#define mc_sync_q2 (ff_hevc_rpi_shader + 620)
++#define mc_sync_q3 (ff_hevc_rpi_shader + 632)
++#define mc_sync_q4 (ff_hevc_rpi_shader + 644)
++#define mc_sync_q5 (ff_hevc_rpi_shader + 662)
++#define mc_sync_q6 (ff_hevc_rpi_shader + 674)
++#define mc_sync_q7 (ff_hevc_rpi_shader + 686)
++#define mc_sync_q8 (ff_hevc_rpi_shader + 698)
++#define mc_sync_q9 (ff_hevc_rpi_shader + 716)
++#define mc_sync_q10 (ff_hevc_rpi_shader + 728)
++#define mc_sync_q11 (ff_hevc_rpi_shader + 740)
++#define mc_exit_c_qn (ff_hevc_rpi_shader + 752)
++#define mc_exit_y_qn (ff_hevc_rpi_shader + 752)
++#define mc_exit_c_q0 (ff_hevc_rpi_shader + 770)
++#define mc_exit_y_q0 (ff_hevc_rpi_shader + 770)
++#define mc_setup_y_q0 (ff_hevc_rpi_shader + 790)
++#define mc_setup_y_qn (ff_hevc_rpi_shader + 792)
++#define mc_filter_y_pxx (ff_hevc_rpi_shader + 1032)
++#define mc_filter_y_bxx (ff_hevc_rpi_shader + 1162)
++#define mc_filter_y_p00 (ff_hevc_rpi_shader + 1292)
++#define mc_filter_y_b00 (ff_hevc_rpi_shader + 1382)
++#define mc_setup_c10_q0 (ff_hevc_rpi_shader + 1462)
++#define mc_setup_c10_qn (ff_hevc_rpi_shader + 1464)
++#define mc_filter_c10_p (ff_hevc_rpi_shader + 1600)
++#define mc_filter_c10_p_l1 (ff_hevc_rpi_shader + 1728)
++#define mc_filter_c10_b (ff_hevc_rpi_shader + 1856)
++#define mc_sync10_q0 (ff_hevc_rpi_shader + 2042)
++#define mc_sync10_q1 (ff_hevc_rpi_shader + 2060)
++#define mc_sync10_q2 (ff_hevc_rpi_shader + 2072)
++#define mc_sync10_q3 (ff_hevc_rpi_shader + 2084)
++#define mc_sync10_q4 (ff_hevc_rpi_shader + 2096)
++#define mc_sync10_q5 (ff_hevc_rpi_shader + 2114)
++#define mc_sync10_q6 (ff_hevc_rpi_shader + 2126)
++#define mc_sync10_q7 (ff_hevc_rpi_shader + 2138)
++#define mc_sync10_q8 (ff_hevc_rpi_shader + 2150)
++#define mc_sync10_q9 (ff_hevc_rpi_shader + 2168)
++#define mc_sync10_q10 (ff_hevc_rpi_shader + 2180)
++#define mc_sync10_q11 (ff_hevc_rpi_shader + 2192)
++#define mc_exit_c10_q0 (ff_hevc_rpi_shader + 2204)
++#define mc_exit_y10_q0 (ff_hevc_rpi_shader + 2204)
++#define mc_exit_c10_qn (ff_hevc_rpi_shader + 2224)
++#define mc_exit_y10_qn (ff_hevc_rpi_shader + 2224)
++#define mc_setup_y10_q0 (ff_hevc_rpi_shader + 2242)
++#define mc_setup_y10_qn (ff_hevc_rpi_shader + 2244)
++#define mc_filter_y10_pxx (ff_hevc_rpi_shader + 2494)
++#define mc_filter_y10_p00 (ff_hevc_rpi_shader + 2624)
++#define mc_filter_y10_bxx (ff_hevc_rpi_shader + 2716)
++#define mc_filter_y10_b00 (ff_hevc_rpi_shader + 2846)
++#define mc_end (ff_hevc_rpi_shader + 2926)
++
++#endif
+diff --git a/libavcodec/rpi_hevc_shader.qasm b/libavcodec/rpi_hevc_shader.qasm
+new file mode 100644
+index 0000000000..f8572cdebe
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader.qasm
+@@ -0,0 +1,1741 @@
++
++# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
++# the warning that we are using rotation & ra/rb registers. r0..3 can be
++# rotated through all 16 elems ra regs can only be rotated through their
++# local 4. As it happens this is what is wanted here as we do not want the
++# constants from the other half of the calc.
++
++# PREREAD is the number of requests that we have sitting in the TMU request
++# queue.
++#
++# There are 8 slots available in the TMU request Q for tm0s requests, but
++# only 4 output FIFO entries and overflow is bad (corruption or crash)
++# (If threaded then only 2 out FIFO entries, but we aren't.)
++# In s/w we are effectively limited to the min vertical read which is >= 4
++# so output FIFO is the limit.
++#
++# However in the current world there seems to be no benefit (and a small
++# overhead) in setting this bigger than 2.
++
++.set PREREAD, 4
++
++# Block heights - 8 & 16 are the only numbers we currently support
++
++.set C_BLK_HEIGHT_8, 16
++.set C_BLK_HEIGHT_16, 8
++.set Y_BLK_HEIGHT_8, 16
++.set Y_BLK_HEIGHT_16, 8
++
++# QPU counts - depend on block size
++# If we have a 2-byte format & block_size > 8 then can only afford
++# 8 QPUs
++# These numbers must match the numbers in ff_hevc_rpi_shader_cmd.h
++
++.set N_QPU_8, 12
++.set N_QPU_16, 12
++
++# register allocation
++#
++
++# ra0-3
++# Used as temp and may be loop filter coeffs (split into .8s)
++# or temp in loop. Check usage on an individual basis.
++
++# ra4-7
++# C: L0 H filter out FIFO
++# otherwise -- free --
++
++# ra8-11
++# temp in some places - check usage
++# Y: (with rb8-11) horiz out FIFO
++
++# ra12-15
++# -- free --
++
++# uniform: width:height
++.set ra_width_height, ra16
++.set ra_width, ra16.16b
++.set ra_height, ra16.16a
++
++# y:y2 same layout as y_y2_next so we can update both together
++.set ra_y_y2, ra17
++.set ra_y2, ra17.16a
++.set ra_y, ra17.16b
++
++# uniform: L1 weight (U on left, V on right)
++# Only used in Y B
++.set ra_wt_off_mul_l1, ra18
++.set ra_wt_off_l1, ra18.16b
++.set ra_wt_mul_l1, ra18.16a
++
++# y_next:y2_next same layout as y_y2 so we can update both together
++.set ra_y_y2_next, ra19
++.set ra_y_next, ra19.16b
++.set ra_y2_next, ra19.16a
++
++# Setup: consts - subdivide a single register
++.set ra_kff100100, ra20
++.set ra_k256, ra20.16a
++.set ra_k0, ra20.8a
++.set ra_k1, ra20.8b
++.set ra_k16, ra20.8c
++.set ra_k255, ra20.8d
++
++# Loop: xshifts
++.set ra_xshift, ra21.16a
++.set ra_xshift_next, ra21.16b
++
++# Loop var: L0 weight (U on left, V on right)
++# _off_ is not used in loop as we want to modify it before use
++.set ra_wt_off_mul_l0, ra22
++.set ra_wt_mul_l0, ra22.16a
++.set ra_wt_off_l0, ra22.16b
++
++# Max pel value (for 8 bit we can get away with sat ops but not 9+)
++# * Could merge with rb_pmask. For 10 bit Logically pmask needs 0xff in the
++# 2nd byte but as the source should never be > 3 there 0x3ff should do
++.set ra_blk_height_pmax, ra23
++.set ra_pmax, ra23.16a
++.set ra_blk_height, ra23.8c
++# -- free -- ra23.8d
++
++# Loop: src frame base (L0)
++.set ra_base, ra24
++
++# Loop: src frame base (L1)
++.set ra_base2, ra25
++
++# Loop: next src frame base (L0)
++.set ra_base_next, ra26
++
++# -- free -- ra27
++# -- free -- ra28
++# -- free -- ra29
++
++# Use an even numbered register as a link register to avoid corrupting flags
++.set ra_link, ra30
++
++# -- free -- ra31
++
++.set rb_xshift2, rb0
++.set rb_xshift2_next, rb1
++
++# C: (elem & 1) == 0 ? elem * 2 : (elem + 4) * 2
++.set rb_elem_x, rb2
++
++# El Flags
++# After adding to self we have el even/odd on nc/c and lo/hi on nn/n
++.set rb_ef, rb3
++
++# rb4-7
++# C-B: L1 H filter out FIFO
++# Y: (with ra2.8x) Y vertical filter coeffs
++
++# rb8-11
++# C: Vertical filter coeffs
++# Y: (with ra8-11) horiz out FIFO
++
++# Loop var: offset to add before shift (round + weighting offsets)
++# Exact value varies by loop
++.set rb_wt_off, rb12
++
++# Setup: denom + 6 + 9
++.set rb_wt_den_p15, rb13
++
++# -- free -- rb14
++# -- free -- rb15
++
++# Line pitch (128 for sand128)
++.set rb_pitch, rb16
++
++# Loop count - 2 (set up TMU for next xfer)
++.set rb_i_tmu, rb17
++
++# Loop count for min(height, 16)
++# Y will reset & loop again if height > 16
++.set rb_lcount, rb18
++
++# frame_base2_next
++.set rb_base2_next, rb19
++
++# Setup: Height of Y+C in sand, (x&mask)*xpitch will give
++# offset to the slice
++.set rb_xpitch, rb20
++
++# -- free -- rb21
++
++# Setup: 0xff (8-bit) / 0xffff (9+ bit)
++.set rb_pmask, rb22
++
++# Loop: destination address
++.set rb_dest, rb23
++
++# vdw_setup_1(dst_pitch)
++.set rb_dma1_base, rb24
++
++# Setup: pic width - 1
++# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc.
++.set rb_max_x, rb25
++
++# Loop: height<<23 + width<<16 + vdw_setup_0
++.set rb_dma0, rb26
++
++# vdw_setup_0 (depends on QPU number)
++.set rb_dma0_base, rb27
++
++# Setup: vw_setup value to reset VPM write pointer
++.set rb_vpm_init, rb28
++
++# Loop: vdw_setup_1(dst_pitch-width) = stride
++.set rb_dma1, rb29
++
++# Setup: pic_height - 1
++.set rb_max_y, rb30
++
++# -- free -- rb31
++
++
++
++
++# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
++.set i_shift16, -16
++.set i_shift21, -11
++.set i_shift23, -9
++.set i_shift30, -2
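++# e.g. (-9) & 31 == 23, so shifting by i_shift23 acts as a shift by 23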
++
++# Much of the setup code is common between Y & C
++# Macros that express this - obviously these can't be overlapped
++# so are probably unsuitable for loop code
++
++.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma
++ mov r2, qpu_num
++.if v_bit_depth <= 8
++ # 8 bit version
++ asr r1, r2, 2
++ shl r1, r1, 6
++ and r0, r2, 3
++ or r0, r0, r1
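++ # e.g. qpu_num 5: r1 = (5 >> 2) << 6 = 64, r0 = (5 & 3) | 64 = 65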
++
++ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
++ add r_vpm, r0, r1 # VPM 8bit storage
++
++ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
++ shl r0, r0, 5
++
++.else
++ # 16 bit version
++ # Limited to 8 QPUs if blk height > 8
++ asr r1, r2, 1
++.if v_blk_height <= 8
++ shl r1, r1, 4
++.else
++ shl r1, r1, 5
++.endif
++ and r0, r2, 1
++ or r0, r0, r1
++
++ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR
++ add r_vpm, r0, r1
++
++ # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into
++ # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg)
++ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later
++ shl r0, r0, 6
++.endif
++ add r_dma, r0, r1 # DMA out
++.endm
++
++
++.macro m_setup_q0
++ srel -, 12
++.endm
++
++# Code start label
++::mc_start
++
++################################################################################
++# mc_setup_uv(next_kernel, x, y, ref_c_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
++
++.macro m_setup_c, v_bit_depth
++
++# Cannot use mul24 on x as x might be -ve, so must use shift
++.if v_bit_depth <= 8
++.set v_x_shift, 1
++.set v_pmask, 0xff
++.set v_blk_height, C_BLK_HEIGHT_8
++.else
++.set v_x_shift, 2
++.set v_pmask, 0xffff
++.set v_blk_height, C_BLK_HEIGHT_16
++.endif
++
++ mov tmurs, 1 # No swap TMUs
++
++# Load first request location
++ mov ra0, unif # next_x_y
++
++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++ shl rb_ef, r0, i_shift30
++
++ mov ra_base, unif # Store frame c base
++
++# Read image dimensions
++ sub r0, unif, 1 # pic c width
++ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes
++ sub rb_max_y, unif, 1 # pic c height
++
++# load constants
++ mov ra_kff100100, 0xff100100
++ mov rb_pmask, v_pmask
++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++
++# get source pitch
++ mov rb_xpitch, unif # stride2
++ mov rb_pitch, unif # stride1
++ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly
++ add rb_dma1_base, r1, rb_pitch # vdw_setup_1
++
++ and r0, 1, elem_num
++ nop ; mul24 r0, r0, 5
++.if v_bit_depth <= 8
++ add rb_elem_x, r0, elem_num
++.else
++ add r0, r0, elem_num
++ add rb_elem_x, r0, r0
++.endif
++
++# Compute base address for first and second access
++# ra_base ends up with t0s base
++# ra_base2 ends up with t1s base
++
++ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay]
++ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice
++ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y
++ min r0, r0, rb_max_x
++
++# Get shift
++# Shift will always calculate as 0 for 9+ bit
++# Ideally we can optimize the shift out of the code in these cases but for now
++# it is tidier to leave it in
++.if v_bit_depth <= 8
++ shl ra_xshift_next, r0, 3
++.else
++ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0
++.endif
++
++# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to
++
++.if v_bit_depth <= 8
++ and r0, r0, -4
++.endif
++ sub r1, ra_k0, rb_pitch
++ and r1, r0, r1
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1
++ add ra_base, ra_base, r0
++
++ add rb_wt_den_p15, 23 - v_bit_depth, unif # denominator
++
++# Compute part of VPM to use for DMA output
++# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop?
++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
++
++# And again for L1, but only worrying about frame2 stuff
++
++# Load first request location
++ mov ra0, unif # next_x_y
++
++ mov ra_base2, unif # [ra0 delay] Store frame c base
++
++# Compute base address for first and second access
++# ra_base ends up with t0s base
++# ra_base2 ends up with t1s base
++
++ shl r0, ra0.16b, v_x_shift
++ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset
++ max r0, r0, 0
++ min r0, r0, rb_max_x
++
++# Get shift (already zero if 9+ bit so ignore)
++.if v_bit_depth <= 8
++ shl rb_xshift2_next, r0, 3
++.endif
++
++# In a single 32 bit word we get 2 UV pairs so mask the bottom bits of x
++
++.if v_bit_depth <= 8
++ and r0, r0, -4
++.endif
++ sub r1, ra_k0, rb_pitch
++ and r1, r0, r1
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov r2, ra_y2
++ add ra_base2, ra_base2, r0
++
++# Do preloads
++# r0 = ra_y, r2 = ra_y2
++ mov r3, PREREAD ; mov r0, ra_y
++
++:1
++ sub.setf r3, r3, 1
++ max r1, r0, 0
++ min r1, r1, rb_max_y
++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++ add t0s, ra_base, r1 ; mov ra_y, r0
++
++ max r1, r2, 0
++ brr.anynz -, r:1b
++ min r1, r1, rb_max_y
++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++ add t1s, ra_base2, r1 ; mov ra_y2, r2
++# >>> .anynz 1b
++
++ mov ra_link, unif # link
++# touch registers to keep simulator happy
++ # ra/b4..7: B0 -> B stash registers
++ mov ra4, 0 ; mov rb4, 0
++ bra -, ra_link
++ mov ra5, 0 ; mov rb5, 0
++ mov ra6, 0 ; mov rb6, 0
++ mov ra7, 0 ; mov rb7, 0
++# >>> ra_link
++.endm
++
++::mc_setup_c_q0
++ m_setup_q0
++::mc_setup_c_qn
++ m_setup_c 8
++
++################################################################################
++
++# mc_filter_uv(next_kernel, x, y, frame_c_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst)
++
++# At this point we have already issued two pairs of texture requests for the current block
++# ra_x, ra_x16_base point to the current coordinates for this block
++
++.macro m_filter_c_p, v_tmu, v_bit_depth
++
++.if v_bit_depth <= 8
++.set v_x_shift, 1
++.set v_x_mul, 2
++.set v_v_shift, 8
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 7
++.set v_dma_wh_shift, i_shift16
++.else
++.set v_x_shift, 2
++.set v_x_mul, 4
++.set v_v_shift, i_shift16
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 8
++.set v_dma_wh_shift, 15
++.endif
++
++.if v_tmu == 0
++.set vrx_xshift, rb_xshift2 # b side more convenient
++.set vrx_xshift_next, ra_xshift_next
++.set vra_y_next, ra_y_next
++.set vrx_base_next, ra_base_next
++.set vra_y, ra_y
++.set vra_base, ra_base
++.set vr_txs, t0s
++.else
++.set vrx_xshift, ra_xshift # a side more convenient
++.set vrx_xshift_next, rb_xshift2_next
++.set vra_y_next, ra_y2_next
++.set vrx_base_next, rb_base2_next
++.set vra_y, ra_y2
++.set vra_base, ra_base2
++.set vr_txs, t1s
++.endif
++
++# per-channel shifts were calculated on the *previous* invocation
++# get base addresses and per-channel shifts for *next* invocation
++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
++
++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base
++
++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0
++ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
++ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs
++ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++
++.if v_bit_depth <= 8
++ shl vrx_xshift_next, r0, 3
++ and r0, r0, -4
++.endif
++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced!
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs
++ add vrx_base_next, r3, r0 ; mov r1, ra_height
++
++# set up VPM write
++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight
++ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
++ add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight
++
++# ; unpack filter coefficients
++
++ shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
++ add r0, r0, r2 ; mov rb9, ra3.8b # Combine width and height of destination area (r0=h<<8, r2=w*2)
++ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register
++ add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight
++
++ mov rb_dest, unif ; mov ra9, rb_max_y # dst_addr ; alias rb_max_y
++
++ shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
++
++ asr rb_wt_off, r1, 2 ; mov ra_link, unif # ; Link
++ sub ra3, rb_wt_den_p15, ra_k1
++
++# r5 = 0 (loop counter)
++# ra9 = alias for rb_max_y
++# ra_wt_mul_l0 = weight L0
++# ra3 = weight denom + 22 - bit_depth [= rb_wt_den_p15 - 1, max 19]
++# rb_wt_off = (offset * 2 + 1) << (ra3 - 1)
++
++# We want (r0r1)
++# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2U5 : ...
++# We fetch (after shift)
++# C0 : C3 : C1 : C4 : C2 : C5 : ...
++
++:1
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++
++.if v_tmu == 0
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment
++ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
++.else
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 # loop counter increment
++ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
++.endif
++
++ add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++ min r3, r3, ra9 ; mov.ifnc r0, r2
++
++ mov ra4, ra5 ; mul24 r2, r3, rb_pitch
++ add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask # mask out all but the bottom byte
++
++# apply horizontal filter
++# The filter coeffs for the two halves of this are the same (unlike in the
++# Y case) so it doesn't matter which ra0 we get them from
++# Also as the two halves are locked together we don't need to separate the 1st
++# r0 mul or the last r1 mul as they are valid for all QPUs
++
++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++ sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
++
++# V filter = -ra4 * rb8 + ra5 * rb9 + ra6 * rb10 - ra7 * rb11 (post FIFO shift)
++# Have to dup block as we need to move the brr - code is more common than it
++# looks at first glance
++.if v_bit_depth <= 8
++ brr.anyn -, r:1b
++ add r2, r2, r3 ; mov ra5, ra6
++ mov ra6, ra7 ; mul24 r1, ra7, rb10
++ sub ra7, r2, r0 ; mul24 r0, ra4, rb8
++.else
++ add r2, r2, r3 ; mov ra5, ra6
++ brr.anyn -, r:1b
++ mov ra6, ra7 ; mul24 r1, ra7, rb10
++ sub r2, r2, r0 ; mul24 r0, ra4, rb8
++ asr ra7, r2, v_bit_depth - 8
++.endif
++# >>> .anyn 1b
++
++ sub r1, r1, r0 ; mul24 r0, ra5, rb9 # [ra7 delay]
++ add r1, r1, r0 ; mul24 r0, ra7, rb11
++ sub r1, r1, r0
++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
++ asr r1, r1, 14
++ nop ; mul24 r1, r1, ra_wt_mul_l0
++ shl r1, r1, 8 ; mov r3, ra_blk_height
++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++ brr.anyn -, r:1b
++ asr r1, r1, ra3
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++# >>> .anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height (currently always 16)
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc rb_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ add rb_lcount, rb_lcount, r0
++ brr -, r:1b
++ add rb_dma0, rb_dma0, r1
++ add rb_dest, rb_dest, r2
++ mov vw_setup, rb_vpm_init # Reset our VPM write pointer
++# >>> 1b
++.endm
++
++# At 10 bits
++# Worst case +ve after 1st filter = 74 * 0x3ff >> 2 = 18925 0x49ed (15 bits)
++# Worst case -ve after 1st filter = -10 * 0x3ff >> 2 = -10230
++# after 2nd (really we can't get this) = 74 * 18925 + 10 * 10230 >> 6 = 23480 = 0x5bb8 (15 bits)
++# (P)
++# * weight (255) = 5987400 = 0x5b5c48 (23 bits)
++# + 0x3ff << (13 - bit_depth + 7) = 0x6b5848 (23 bits)
++# ... should be OK
++#
++# (B)
++# *2 (L0+L1) = 5963920 = 0x5b0090 (23 bits)
++# + (offset * 2 + 1) << (15 - bit_depth + 7) = 5963920 + (0x3ff << 12) = 5963920 + 4190208 = 10154128 = 0x9af090 (24 bits)
++# So signed overflow if we sign extend here :-(
++#
++# In practice this doesn't happen (we need a maximal offset and a very unlucky
++# filter).
++#
++# This could be fixed by offsetting the filters s.t. they are unsigned until
++# weight mul and then removing the offset with the weighting offset (I think
++# this should work) or splitting the rounding & offsetting
++
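++# A standalone C sanity check of the numbers above (illustrative only, not
++# part of the build):
++#   assert(((74 * 0x3ff) >> 2) == 18925);
++#   assert(((74 * 18925 + 10 * 10230) >> 6) == 23480);
++#   assert(23480 * 255 + (0x3ff << 10) == 0x6b5848);  /* P, bit_depth 10 */
++#   assert(23480 * 254 + (0x3ff << 12) == 0x9af090);  /* B - bit 23 set */
++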
++::mc_filter_c_p
++ m_filter_c_p 0, 8
++
++::mc_filter_c_p_l1
++ m_filter_c_p 1, 8
++
++################################################################################
++
++# mc_filter_c_b
++
++# At this point we have already issued two pairs of texture requests for the current block
++# ra_x, ra_x16_base point to the current coordinates for this block
++
++.macro m_filter_c_b, v_bit_depth
++
++.if v_bit_depth <= 8
++.set v_x_shift, 1
++.set v_v_shift, 8
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 7
++.set v_dma_wh_shift, i_shift16
++.else
++.set v_x_shift, 2
++.set v_v_shift, i_shift16
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 8
++.set v_dma_wh_shift, 15
++.endif
++.set v_x_mul, (1 << v_x_shift)
++
++# per-channel shifts were calculated on the *previous* invocation
++
++# get base addresses and per-channel shifts for *next* invocation
++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
++
++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base
++
++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0
++ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
++ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++ min r0, r0, rb_max_x ; mov ra0, unif # L0 H filter coeffs
++
++.if v_bit_depth <= 8
++ shl ra_xshift_next, r0, 3
++.endif
++
++ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs
++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs)
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height
++ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B
++
++# set up VPM write
++
++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight
++ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
++ add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif # ; V weight
++
++ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2
++ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base
++ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register
++ add rb_dma0, r0, rb_dma0_base ; mov ra1, unif # ; H filter coeffs
++
++# L1 - uniform layout could possibly be optimized
++
++ shl r0, ra3.16b, v_x_shift # r0=x*2
++ add r0, r0, rb_elem_x ; mov ra3, unif # ; V filter coeffs
++ sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif # [ra3 delay] r1=pitch2 mask ; U offset/weight
++ max r0, r0, r5 ; mov rb8, ra3.8a # ; start unpacking filter coeffs
++ min r0, r0, rb_max_x ; mov rb9, ra3.8b
++
++.if v_bit_depth <= 8
++ shl rb_xshift2_next, r0, 3
++.endif
++
++ and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif # ; V offset/weight
++ and r1, r0, r1 ; mov rb10, ra3.8c
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov rb_dest, unif # Add stripe offsets ; dst_addr
++ add rb_base2_next, r3, r0
++
++ mov ra9, rb_max_y ; mov rb11, ra3.8d
++ shl r1, ra_wt_off_l1, rb_wt_den_p15
++ asr rb_wt_off, r1, 9 ; mov ra_link, unif # link
++
++# r5 loop counter
++# ra0 H coeffs L0
++# ra1 H coeffs L1
++# ra2 V coeffs L0
++# ra3 temp
++# ra4-7 L0 H FIFO
++# rb4-7 L1 H FIFO
++# rb8-rb11 V coeffs L1
++# ra9 rb_max_y alias
++
++:1
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment
++ shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next
++ shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
++ add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
++ add ra_y, 1, ra_y ; mov r3, ra_y
++
++ max r3, r3, ra_k0 ; mov r0, r1 << 15
++ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++
++ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++ add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask # v8min masks out all but bottom byte
++
++# L0 H-filter
++# H FIFO scrolls are spread all over this loop
++ mov rb4, rb5 ; mov ra4, ra5 # ? Just moves
++
++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra0.8d, r1
++.if v_bit_depth <= 8
++ sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1
++.else
++ sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1
++ asr ra3, r2, (v_bit_depth - 8)
++.endif
++
++ shr r2, r4, rb_xshift2 ; mov ra5, ra6
++ shr r1, r2, v_v_shift ; mov r3, ra_y2
++ add ra_y2, r3, ra_k1 ; mov rb6, rb7
++
++ max r3, r3, ra_k0 ; mov r0, r1 << 15
++ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++
++ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++ add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask # v8min masks out all but bottom byte
++
++# L1 H-filter
++
++ and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0
++ nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++ sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1
++# V filters - start in branch delay slots of H
++# Final asr not needed for 8-bit but we can't (currently) save a whole instruction
++ add r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++ brr.anyn -, r:1b
++ mov ra6, ra7 ; mul24 r3, ra7, rb10
++ sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a
++ asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3
++# >>> .anyn 1b
++
++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c # [rb7 delay]
++ add r1, r1, r0 ; mul24 r0, rb7, ra2.8d
++ sub r2, r1, r0 ; mul24 r0, ra4, rb8
++ sub r1, r3, r0 ; mul24 r0, ra5, rb9
++ add r1, r1, r0 ; mul24 r0, ra7, rb11
++ sub r1, r1, r0 ; mul24 r2, r2, ra_k256
++
++ asr r2, r2, 14 ; mul24 r1, r1, ra_k256
++ asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0
++
++ add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 # rb_wt_off = (offsetL0 + offsetL1 + 1) << (rb_wt_den_p15 - 9)
++ add r1, r1, r2 ; mov r3, ra_blk_height
++
++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend
++
++ brr.anyn -, r:1b
++ asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++# >>> .anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height (currently always 16)
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc rb_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ add rb_lcount, rb_lcount, r0
++ brr -, r:1b
++ add rb_dma0, rb_dma0, r1
++ add rb_dest, rb_dest, r2
++ mov vw_setup, rb_vpm_init # Reset our VPM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_c_b
++ m_filter_c_b 8
++
++################################################################################
++# Exit code used by both Luma & Chroma so place between them to avoid I-cache
++# conflicts
++
++.macro m_exit_drain
++.if PREREAD == 2
++# Special case 2 as loop is wasteful
++ nop ; nop ; ldtmu0
++ nop ; nop ; ldtmu1
++ nop ; nop ; ldtmu0
++ mov -, vw_wait ; nop ; ldtmu1
++.else
++ mov.setf r3, PREREAD - 1
++:1
++ brr.anynz -, r:1b
++ nop ; nop ; ldtmu0
++ nop ; nop ; ldtmu1
++ sub.setf r3, r3, 1
++ # >>>
++ mov -, vw_wait
++.endif
++.endm
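++# (drains the PREREAD outstanding request pairs - in C terms:
++#    for (i = 0; i != PREREAD; ++i) { (void)tmu0_read(); (void)tmu1_read(); }
++#  then waits for the VDW; tmu0_read/tmu1_read are hypothetical shorthand for
++#  the ldtmu0/ldtmu1 signals)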
++
++# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair)
++# All qpus start at the beginning and after that (group - 1) must have finished
++# before (group) can start
++#
++# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain
++# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important -
++# we lock up otherwise)
++#
++# There is some, currently ill-defined, potential lockup if we have the VDW
++# active whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDW pipe too ??
++#
++# The code stalled when I had many waiters on a single sem so we have a
++# "ripple" of srels to restart. Unsure why (it may have been a bug), but this
++# works and we currently have both the memory & sems to support it.
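++#
++# A rough C model of one QPU's path through the chain (illustrative only;
++# sem_acquire/sem_release are stand-ins for sacq/srel):
++#
++#   void sync_qpu(int qpu, int quads) {
++#       if (qpu % 4 == 0) {                          // quad leader
++#           for (int i = 0; i != 3; ++i)
++#               sem_acquire(qpu);                    // wait for the other 3 in our quad
++#           sem_acquire(12 + qpu / 4);               // wait for the previous group
++#           sem_release(qpu + 1);                    // start the ripple
++#           sem_release(12 + (qpu / 4 + 1) % quads); // and release the next group
++#       } else {
++#           sem_release(qpu - qpu % 4);              // tell our leader we are here
++#           sem_acquire(qpu);                        // wait for the ripple
++#           if ((qpu + 1) % 4 != 0)
++#               sem_release(qpu + 1);                // pass it on within the quad
++#       }
++#   }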
++.macro m_sync_q, n_qpu, n_quads
++# Do not generate code for qpu >= quads * 4 - fns should never be called
++.if n_qpu < n_quads * 4
++ mov ra_link, unif # Can only branch to an a reg (not r0)
++ mov -, vw_wait # [ra_link delay]
++
++.set n_sem_sync, n_qpu - (n_qpu % 4)
++.set n_sem_in, n_qpu
++.set n_sem_out, n_qpu + 1
++
++.if n_qpu % 4 == 0
++
++.set n_sem_quad_in, 12 + n_qpu / 4
++.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads)
++
++ sacq -, n_sem_sync
++ sacq -, n_sem_sync
++ sacq -, n_sem_sync
++ bra -, ra_link
++ sacq -, n_sem_quad_in
++ srel -, n_sem_out
++ srel -, n_sem_quad_out
++
++.else
++ bra -, ra_link
++ srel -, n_sem_sync
++ sacq -, n_sem_in
++.if n_sem_out % 4 != 0
++ srel -, n_sem_out
++.else
++ nop
++.endif
++.endif
++.endif
++.endm
++
++.set v_quads8, N_QPU_8 / 4
++
++::mc_sync_q0
++ m_sync_q 0, v_quads8
++::mc_sync_q1
++ m_sync_q 1, v_quads8
++::mc_sync_q2
++ m_sync_q 2, v_quads8
++::mc_sync_q3
++ m_sync_q 3, v_quads8
++::mc_sync_q4
++ m_sync_q 4, v_quads8
++::mc_sync_q5
++ m_sync_q 5, v_quads8
++::mc_sync_q6
++ m_sync_q 6, v_quads8
++::mc_sync_q7
++ m_sync_q 7, v_quads8
++::mc_sync_q8
++ m_sync_q 8, v_quads8
++::mc_sync_q9
++ m_sync_q 9, v_quads8
++::mc_sync_q10
++ m_sync_q 10, v_quads8
++::mc_sync_q11
++ m_sync_q 11, v_quads8
++
++# mc_exit()
++# Chroma & Luma the same now
++
++.macro m_exit_qn
++ m_exit_drain
++ nop ; nop ; thrend
++ nop
++ nop
++# >>> thrend <<<
++.endm
++
++::mc_exit_c_qn
++::mc_exit_y_qn
++ m_exit_qn
++
++
++
++# mc_interrupt_exit12()
++
++.macro m_exit_q0
++ m_exit_drain
++ sacq -, 12
++ nop ; nop ; thrend
++ mov interrupt, 1
++ nop
++# >>> thrend <<<
++.endm
++
++::mc_exit_c_q0
++::mc_exit_y_q0
++ m_exit_q0
++
++# LUMA CODE
++
++# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
++# For P frames we make the second x,y coordinates offset by +8
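++# (hypothetically, on the CPU side that amounts to: src2.x = src1.x + 8;
++#  src2.y = src1.y; so the two TMUs then fetch the two 8-pel halves of a
++#  16-wide P block)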
++
++
++################################################################################
++# mc_setup
++#
++# typedef struct qpu_mc_pred_y_s_s {
++# qpu_mc_src_t next_src1;
++# qpu_mc_src_t next_src2;
++# uint16_t pic_h;
++# uint16_t pic_w;
++# uint32_t stride2;
++# uint32_t stride1;
++# uint32_t wdenom;
++# uint32_t next_fn;
++# } qpu_mc_pred_y_s_t;
++
++.macro m_setup_y, v_bit_depth
++
++# Cannot use mul24 on x as x might be -ve, so must use shift
++.if v_bit_depth <= 8
++.set v_x_shift, 0
++.set v_pmask, 0xff
++.set v_blk_height, Y_BLK_HEIGHT_8
++.else
++.set v_x_shift, 1
++.set v_pmask, 0xffff
++.set v_blk_height, Y_BLK_HEIGHT_16
++.endif
++
++
++ # Need to save these because we need to know the frame dimensions before computing texture coordinates
++ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y
++ mov ra9, unif # ref_y_base
++ mov ra1, unif # x2_y2
++ mov ra11, unif # ref_y2_base
++
++# load constants
++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++ shl rb_ef, r0, i_shift30
++
++
++ mov ra_kff100100, 0xff100100
++ mov rb_pmask, v_pmask
++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++
++# Compute part of VPM to use
++
++# Read image dimensions
++ mov ra3, unif # width_height
++ mov rb_xpitch, unif # stride2
++.if v_x_shift == 0
++ sub rb_max_x, ra3.16b, 1
++.else
++ sub r0, ra3.16b, 1
++ shl rb_max_x, r0, v_x_shift
++.endif
++ sub rb_max_y, ra3.16a, 1
++ mov rb_pitch, unif # stride1
++
++# get destination pitch
++ mov r1, vdw_setup_1(0)
++ or rb_dma1_base, r1, rb_pitch
++
++# Compute base address for first and second access
++ mov r3, elem_num
++ add r0, ra0.16b, r3 # Load x + elem_num
++.if v_x_shift != 0
++ shl r0, r0, v_x_shift
++.endif
++ max r0, r0, 0
++ min r0, r0, rb_max_x
++ shl ra_xshift_next, r0, 3 # Compute shifts
++
++# X is byte offset - we can only load words - mask
++
++ and r0, r0, -4 ; v8subs r2, r2, r2
++ sub r2, r2, rb_pitch
++ and r1, r0, r2
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 # Add stripe offsets
++ add ra_base, ra9, r0
++
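++# (the and/xor/mul24 sequence above is sand-format addressing - in C, assuming
++#  rb_pitch is a power of 2:
++#    off = (x & (pitch - 1)) + (x & ~(pitch - 1)) * xpitch;
++#  i.e. the offset within this stripe plus whole stripes rescaled by xpitch;
++#  the same pattern recurs for the second base below)
++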
++ # r3 still contains elem_num
++ add r0, ra1.16b, r3 # Load x
++.if v_x_shift != 0
++ shl r0, r0, v_x_shift
++.endif
++ max r0, r0, 0
++ min r0, r0, rb_max_x
++ shl rb_xshift2_next, r0, 3 # Compute shifts
++
++ # r2 still contains mask
++ and r0, r0, -4
++ and r1, r0, r2
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 # Add stripe offsets
++ add ra_base2, ra11, r0
++
++# Do preloads
++ nop ; mov r0, ra0.16a # ; r0 = y
++ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2
++
++:1
++ sub.setf r3, r3, 1
++ max r1, r0, 0
++ min r1, r1, rb_max_y
++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++ add t0s, ra_base, r1 ; mov ra_y, r0
++
++ max r1, r2, 0
++ brr.anynz -, r:1b
++ min r1, r1, rb_max_y
++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++ add t1s, ra_base2, r1 ; mov ra_y2, r2
++# >>> .anynz 1b
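++# (the preload loop in C terms:
++#    for (i = 0; i != PREREAD; ++i) {
++#        tmu0_request(ra_base  + clamp(y++,  0, max_y) * pitch);
++#        tmu1_request(ra_base2 + clamp(y2++, 0, max_y) * pitch);
++#    }
++#  tmu0_request/tmu1_request being shorthand for the t0s/t1s writes)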
++
++ add rb_wt_den_p15, unif, 23 - v_bit_depth # weight denom
++
++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
++
++ mov ra_link, unif # Next fn
++
++# touch vertical context to keep simulator happy
++ mov ra8, 0 ; mov rb8, 0
++ bra -, ra_link
++ mov ra9, 0 ; mov rb9, 0
++ mov ra10, 0 ; mov rb10, 0
++ mov ra11, 0 ; mov rb11, 0
++# >>> ra_link
++.endm
++
++::mc_setup_y_q0
++ m_setup_q0
++::mc_setup_y_qn
++ m_setup_y 8
++
++################################################################################
++#
++# Start of per-block setup code
++# P and B blocks share the same setup code to save on Icache space
++
++# luma_setup_delay3 done in delay slots of branch that got us here
++
++# get base addresses and per-channel shifts for *next* invocation
++# per-channel shifts were calculated on the *previous* invocation
++
++# 1st 3 instructions of per_block-setup in branch delay
++#
++# typedef struct qpu_mc_pred_y_p_s {
++# qpu_mc_src_t next_src1;
++# qpu_mc_src_t next_src2;
++# uint16_t h;
++# uint16_t w;
++# uint32_t mymx21;
++# uint32_t wo1;
++# uint32_t wo2;
++# uint32_t dst_addr;
++# uint32_t next_fn;
++# } qpu_mc_pred_y_p_t;
++#
++
++.macro m_luma_setup, v_bit_depth
++# Hack - QASM may well have label pasting but I have no idea how...
++.if v_bit_depth == 8
++ brr ra_link, r:per_block_setup_8
++.elif v_bit_depth == 10
++ brr ra_link, r:per_block_setup_10
++.endif
++ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack??
++ add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0
++ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++.endm
++
++.macro m_per_block_setup, v_bit_depth
++
++.if v_bit_depth <= 8
++.set v_x_shift, 0
++.set v_x_mul, 1
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 7
++.set v_dma_wh_shift, i_shift16
++.else
++.set v_x_shift, 1
++.set v_x_mul, 2
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 8
++.set v_dma_wh_shift, 15
++.endif
++
++.if v_x_shift != 0
++ shl r0, r0, v_x_shift
++.endif
++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++ min r0, r0, rb_max_x
++
++ shl ra_xshift_next, r0, 3 # Compute shifts
++ and r0, r0, -4
++ sub r2, r5, rb_pitch ; mov ra_base_next, unif # src1.base
++ and r1, r0, r2 ; mov ra_y_next, ra0.16a
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y
++ add ra_base_next, ra_base_next, r0 # [ra1 delay]
++
++ add r0, ra1.16b, r3 # Load x2
++.if v_x_shift != 0
++ shl r0, r0, v_x_shift
++.endif
++ max r0, r0, r5 ; mov ra_y2_next, ra1.16a
++ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base
++ shl rb_xshift2_next, r0, 3 # Compute shifts
++ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height
++ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes
++ add rb_base2_next, rb_base2_next, r0
++
++# get width,height of block (unif load above), r1 = width * pel_size
++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width)
++ add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height
++ add rb_lcount, r0, 7
++ shl r0, r0, v_dma_h_shift
++ add r0, r0, r1 # Combine width and height of destination area
++ shl r0, r0, v_dma_wh_shift # Shift into bits 16 upwards of the vdw_setup0 register
++ add rb_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets
++
++# get filter coefficients and discard unused B frame values
++ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight
++ shl ra8, r0, 3 ; mov r3, ra_k255
++
++# Pack the 1st 4 filter coefs for H & V tightly
++# Coeffs are all abs values here as that means mul24 works (no sign extend from .8)
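++# (each 32-bit constant below is a little-endian table of one tap's |coeff|
++#  across the 4 fractional phases: e.g. 0x11283a40 holds the main tap
++#  {64, 58, 40, 17} for frac = 0..3 counting from the LSB.  The ror by
++#  ra8.8c / ra8.8d (= 8 * frac) is then, in C terms,
++#    coeff = (table >> (8 * frac)) & 0xff;
++#  a byte select per phase - a sketch, assuming frac in 0..3)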
++
++ mov r1,0x00010100 # -ve [ra8 delay]
++ ror ra2.8a, r1, ra8.8d
++ ror ra0.8a, r1, ra8.8c
++
++ mov r1, 0x01040400
++ ror ra2.8b, r1, ra8.8d
++ ror ra0.8b, r1, ra8.8c
++
++ mov r1,0x050b0a00 # -ve
++ ror ra2.8c, r1, ra8.8d
++ ror ra0.8c, r1, ra8.8c
++
++ mov r1,0x11283a40
++ ror ra2.8d, r1, ra8.8d
++ ror ra0.8d, r1, ra8.8c
++
++# In the 2nd vertical half we use b registers due to using a-side fifo regs
++
++ mov r1,0x3a281100
++ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
++ ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3
++
++ mov r1,0x0a0b0500 # -ve
++ ror r0, r1, ra8.8d
++ ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3
++
++ mov r1,0x04040100
++ ror r0, r1, ra8.8d
++ ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3
++
++ mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif # ; Destination address
++
++ mov r1,0x01010000 # -ve
++ ror r0, r1, ra8.8d
++
++ bra -, ra_link
++ ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3
++
++ shl r0, ra_wt_off_l0, rb_wt_den_p15 # Offset calc
++ # For B, L1 & L0 offsets should be identical so it doesn't matter which we use
++ asr rb_wt_off, r0, 9 ; mov ra_link, unif # ; link - load after we've used its previous val
++# >>> branch ra_link
++
++# r5 = 0
++# ra_wt_mul_l1 = weight L1
++# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred)
++# rb_wt_off = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb_wt_den_p15 - 9)
++# rb_wt_den_p15 = weight denom + 6 + 9
++# rb_wt_mul_l0 = weight L0
++.endm
++
++:per_block_setup_8
++ m_per_block_setup 8
++
++
++
++################################################################################
++# mc_filter(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
++# In a P block, y2_x2 should be y_x+8
++# At this point we have already issued two pairs of texture requests for the current block
++
++.macro m_filter_y_pxx, v_bit_depth
++ m_luma_setup v_bit_depth
++
++ shl ra_wt_mul_l0, ra_wt_mul_l0, 1
++
++# r5 = 0 (loop count)
++
++:1
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++
++# N.B. Whilst y == y2 as far as this loop is concerned, we will start
++# the grab for the next block before we finish with this block, and that
++# might be a B block where y != y2, so we must do full processing on both y and y2
++
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++ shr r0, r4, ra_xshift ; mov r3, rb_pitch
++
++ max r2, ra_y, 0 # y
++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
++
++ max r2, ra_y2, 0
++ min r2, r2, rb_max_y ; mov ra7, ra8
++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8min masks out all but bottom byte
++
++ add.setf -, rb_ef, rb_ef ; mov ra8, ra9
++
++# apply horizontal filter
++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++
++ sub.setf -, r5, 8 ; mov ra9, ra10
++ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
++ brr.anyn -, r:1b
++ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
++ mov ra10, ra11 ; mov rb10, rb11
++ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
++ # >>> .anyn 1b
++
++ # apply vertical filter and write to VPM
++ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
++ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
++ add r1, r1, r0 ; mul24 r0, ra8, rb4
++ add r1, r1, r0 ; mul24 r0, ra9, rb5
++ sub r1, r1, r0 ; mul24 r0, ra10, rb6
++ add r1, r1, r0 ; mul24 r0, ra11, rb7
++ sub r1, r1, r0
++# At this point r1 is a 22-bit signed quantity: 8 (original sample),
++# +6, +6 (each pass), +1 (the passes can overflow slightly), +1 (sign)
++# The top 8 bits have rubbish in them as mul24 is unsigned
++# The low 6 bits need discarding before weighting
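++# (C equivalent of the mul24/asr pair below: r1 = ((int32_t)(r1 << 8)) >> 14;
++#  the *256 lifts the 22-bit value's sign bit to bit 29 so the asr both
++#  sign extends and drops the low 6 bits)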
++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish
++ asr r1, r1, 14
++ nop ; mul24 r1, r1, ra_wt_mul_l0
++ add r1, r1, rb_wt_off ; mov r3, ra_blk_height # ; r3 = block height for outside loop
++
++ shl r1, r1, 8 ; v8subs r0, ra_height, r3
++ brr.anyn -, r:1b
++ asr r1, r1, rb_wt_den_p15
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++
++# >>> branch.anyn yloop
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height (currently always 16)
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc rb_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ add rb_lcount, rb_lcount, r0
++ brr -, r:1b
++ add rb_dma0, rb_dma0, r1
++ add rb_dest, rb_dest, r2
++ mov vw_setup, rb_vpm_init # Reset our VPM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_y_pxx
++ m_filter_y_pxx 8
++
++
++################################################################################
++
++# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
++# In a P block, only the first half of the coefficients contains used information.
++# At this point we have already issued two pairs of texture requests for the current block
++# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time?
++# Or possibly by taking advantage of symmetry?
++
++.macro m_filter_y_bxx, v_bit_depth
++ m_luma_setup v_bit_depth
++
++:1
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++ shr r0, r4, ra_xshift ; mov r3, rb_pitch
++
++ max r2, ra_y, 0 # y
++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
++
++ max r2, ra_y2, 0
++ min r2, r2, rb_max_y ; mov ra7, ra8
++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8min masks out all but bottom byte
++
++ add.setf -, rb_ef, rb_ef ; mov ra8, ra9
++
++# apply horizontal filter
++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++
++ sub.setf -, r5, 8 ; mov ra9, ra10
++ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
++ brr.anyn -, r:1b
++ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
++ mov ra10, ra11 ; mov rb10, rb11
++ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
++ # >>> .anyn 1b
++
++ # apply vertical filter and write to VPM
++ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
++ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
++ add r1, r1, r0 ; mul24 r0, ra8, rb4
++ add r1, r1, r0 ; mul24 r0, ra9, rb5
++ sub r1, r1, r0 ; mul24 r0, ra10, rb6
++ add r1, r1, r0 ; mul24 r0, ra11, rb7
++ sub r1, r1, r0 ; mov r2, rb_wt_off
++# As with P-pred r1 is a 22-bit signed quantity in 32-bits
++# Top 8 bits are bad - low 6 bits should be discarded
++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
++
++ asr r1, r1, 14
++ nop ; mul24 r0, r1, ra_wt_mul_l0
++ add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0
++
++ add r1, r1, r0 ; mov r3, ra_blk_height
++ shl r1, r1, 8 ; v8subs r0, ra_height, r3
++ brr.anyn -, r:1b
++ asr r1, r1, rb_wt_den_p15
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++# >>> branch.anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height (currently always 16)
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc rb_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ add rb_lcount, rb_lcount, r0
++ brr -, r:1b
++ add rb_dma0, rb_dma0, r1
++ add rb_dest, rb_dest, r2
++ mov vw_setup, rb_vpm_init # Reset our VPM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_y_bxx
++ m_filter_y_bxx 8
++
++################################################################################
++#
++# typedef struct qpu_mc_pred_y_p00_s {
++# qpu_mc_src_t next_src1;
++# uint16_t h;
++# uint16_t w;
++# uint32_t wo1;
++# uint32_t dst_addr;
++# uint32_t next_fn;
++# } qpu_mc_pred_y_p00_t;
++
++.macro m_filter_y_p00, v_bit_depth
++
++.if v_bit_depth <= 8
++.set v_x_shift, 0
++.set v_x_mul, 1
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 7
++.set v_dma_wh_shift, i_shift16
++.else
++.set v_x_shift, 1
++.set v_x_mul, 2
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 8
++.set v_dma_wh_shift, 15
++.endif
++
++ mov ra0, unif ; mov r3, elem_num # y_x
++ mov ra_xshift, ra_xshift_next # [ra0 delay]
++ add r0, ra0.16b, r3
++.if v_x_shift != 0
++ shl r0, r0, v_x_shift
++.endif
++
++ max r0, r0, 0
++ min r0, r0, rb_max_x
++
++ shl ra_xshift_next, r0, 3 # Compute shifts
++ and r0, r0, -4 ; v8subs r2, r2, r2
++ sub r2, r2, rb_pitch ; mov ra_base_next, unif # src1.base
++ and r1, r0, r2 ; mov ra_y_next, ra0.16a
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov ra_width_height, unif # Add stripe offsets ; width_height
++ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write
++
++# get width,height of block (unif load above)
++# Compute vdw_setup1(dst_pitch-width)
++ shl r1, ra_width, v_x_shift
++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
++ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
++ add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif # Combine width and height of destination area ; weight_offset
++ shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif # Shift into bits 16 upwards of the vdw_setup0 register ; dest addr
++ add rb_dma0, r0, rb_dma0_base
++
++ shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 # Offset calc ; r5 = 0
++ # For B, L1 & L0 offsets should be identical so it doesn't matter which we use
++ asr rb_wt_off, r0, 1 ; mov ra_link, unif # ; link
++
++:1
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
++ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
++ shr r0, r4, ra_xshift ; mov r3, rb_pitch
++
++ max r2, ra_y, 0 # y
++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
++
++ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
++ shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height
++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++
++ brr.anyn -, r:1b
++ asr r1, r1, rb_wt_den_p15
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++# >>> branch.anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height (currently always 16)
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc rb_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ add rb_lcount, rb_lcount, r0
++ brr -, r:1b
++ add rb_dma0, rb_dma0, r1
++ add rb_dest, rb_dest, r2
++ mov vw_setup, rb_vpm_init # Reset our VPM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_y_p00
++ m_filter_y_p00 8
++
++################################################################################
++
++.macro m_filter_y_b00, v_bit_depth
++# luma setup does a fair bit more than we need, calculating filter coeffs
++# that we will never use, but reusing it saves I-cache (and is simpler)
++ m_luma_setup v_bit_depth
++
++# Fix up vals that were expecting a filter (somewhat icky)
++ mov r0, 7
++ sub rb_i_tmu, rb_i_tmu, r0
++ sub rb_lcount, rb_lcount, r0
++ mov r0, 8 ; mov r1, ra_wt_off_mul_l0
++ shl rb_wt_off, rb_wt_off, r0
++ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
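++
++# (net effect per pel, sketched in C:
++#    out = clip((((p0 * wt_l0 + p1 * wt_l1) << (22 - bit_depth)) + wt_off) >> wt_den_p15);
++#  plain weighted bi-pred, no filtering, as both fractions are 0 here)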
++
++:1
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++ shr r0, r4, ra_xshift ; mov r3, rb_pitch
++
++ max r2, ra_y, 0 # y
++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
++
++ max r2, ra_y2, 0
++ min r2, r2, rb_max_y
++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8min masks out all but bottom byte
++ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
++
++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
++ add r1, r0, r1
++ shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height
++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++
++ brr.anyn -, r:1b
++ asr r1, r1, rb_wt_den_p15
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++# >>> branch.anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height (currently always 16)
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc rb_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ add rb_lcount, rb_lcount, r0
++ brr -, r:1b
++ add rb_dma0, rb_dma0, r1
++ add rb_dest, rb_dest, r2
++ mov vw_setup, rb_vpm_init # Reset our VPM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_y_b00
++ m_filter_y_b00 8
++
++################################################################################
++################################################################################
++# 10 BIT
++
++::mc_setup_c10_q0
++ m_setup_q0
++::mc_setup_c10_qn
++ m_setup_c 10
++
++::mc_filter_c10_p
++ m_filter_c_p 0, 10
++
++::mc_filter_c10_p_l1
++ m_filter_c_p 1, 10
++
++
++::mc_filter_c10_b
++ m_filter_c_b 10
++
++# Even if these fns are the same as for other bit depths we want our own copy
++# to keep the code we are using in a single lump and so avoid (direct-mapped)
++# cache thrashing
++.set v_quads10, N_QPU_16 / 4
++
++::mc_sync10_q0
++ m_sync_q 0, v_quads10
++::mc_sync10_q1
++ m_sync_q 1, v_quads10
++::mc_sync10_q2
++ m_sync_q 2, v_quads10
++::mc_sync10_q3
++ m_sync_q 3, v_quads10
++::mc_sync10_q4
++ m_sync_q 4, v_quads10
++::mc_sync10_q5
++ m_sync_q 5, v_quads10
++::mc_sync10_q6
++ m_sync_q 6, v_quads10
++::mc_sync10_q7
++ m_sync_q 7, v_quads10
++::mc_sync10_q8
++ m_sync_q 8, v_quads10
++::mc_sync10_q9
++ m_sync_q 9, v_quads10
++::mc_sync10_q10
++ m_sync_q 10, v_quads10
++::mc_sync10_q11
++ m_sync_q 11, v_quads10
++
++::mc_exit_y10_q0
++::mc_exit_c10_q0
++ m_exit_q0
++
++::mc_exit_y10_qn
++::mc_exit_c10_qn
++ m_exit_qn
++
++::mc_setup_y10_q0
++ m_setup_q0
++::mc_setup_y10_qn
++ m_setup_y 10
++
++:per_block_setup_10
++ m_per_block_setup 10
++
++::mc_filter_y10_pxx
++ m_filter_y_pxx 10
++
++::mc_filter_y10_p00
++ m_filter_y_p00 10
++
++::mc_filter_y10_bxx
++ m_filter_y_bxx 10
++
++::mc_filter_y10_b00
++ m_filter_y_b00 10
++
++
++
++::mc_end
++# Do not add code here because mc_end must appear after all other code.
+diff --git a/libavcodec/rpi_hevc_shader_cmd.h b/libavcodec/rpi_hevc_shader_cmd.h
+new file mode 100644
+index 0000000000..9f8983da52
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_cmd.h
+@@ -0,0 +1,128 @@
++#ifndef RPI_SHADER_CMD_H
++#define RPI_SHADER_CMD_H
++
++#pragma pack(push, 4)
++
++#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y
++// Mixing the two just causes confusion and a lot of warnings....
++typedef const uint8_t * qpu_mc_src_addr_t;
++typedef uint8_t * qpu_mc_dst_addr_t;
++#else
++typedef uint32_t qpu_mc_src_addr_t;
++typedef uint32_t qpu_mc_dst_addr_t;
++#endif
++
++typedef struct qpu_mc_src_s
++{
++ int16_t y;
++ int16_t x;
++ qpu_mc_src_addr_t base;
++} qpu_mc_src_t;
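++// N.B. y deliberately sits in the low halfword: the QPU code reads y as .16a
++// (low) and x as .16b (high) from a single uniform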
++
++
++typedef struct qpu_mc_pred_c_p_s {
++ qpu_mc_src_t next_src;
++ uint16_t h;
++ uint16_t w;
++ uint32_t coeffs_x;
++ uint32_t coeffs_y;
++ uint32_t wo_u;
++ uint32_t wo_v;
++ qpu_mc_dst_addr_t dst_addr_c;
++ uint32_t next_fn;
++} qpu_mc_pred_c_p_t;
++
++typedef struct qpu_mc_pred_c_b_s {
++ qpu_mc_src_t next_src1;
++ uint16_t h;
++ uint16_t w;
++ uint32_t coeffs_x1;
++ uint32_t coeffs_y1;
++ uint32_t weight_u1;
++ uint32_t weight_v1;
++ qpu_mc_src_t next_src2;
++ uint32_t coeffs_x2;
++ uint32_t coeffs_y2;
++ uint32_t wo_u2;
++ uint32_t wo_v2;
++ qpu_mc_dst_addr_t dst_addr_c;
++ uint32_t next_fn;
++} qpu_mc_pred_c_b_t;
++
++typedef struct qpu_mc_pred_c_s_s {
++ qpu_mc_src_t next_src1;
++ uint32_t pic_cw; // C Width (== Y width / 2)
++ uint32_t pic_ch; // C Height (== Y Height / 2)
++ uint32_t stride2;
++ uint32_t stride1;
++ uint32_t wdenom;
++ qpu_mc_src_t next_src2;
++ uint32_t next_fn;
++} qpu_mc_pred_c_s_t;
++
++typedef struct qpu_mc_pred_c_s {
++ union {
++ qpu_mc_pred_c_p_t p;
++ qpu_mc_pred_c_b_t b;
++ qpu_mc_pred_c_s_t s;
++ };
++} qpu_mc_pred_c_t;
++
++
++typedef struct qpu_mc_pred_y_p_s {
++ qpu_mc_src_t next_src1;
++ qpu_mc_src_t next_src2;
++ uint16_t h;
++ uint16_t w;
++ uint32_t mymx21;
++ uint32_t wo1;
++ uint32_t wo2;
++ qpu_mc_dst_addr_t dst_addr;
++ uint32_t next_fn;
++} qpu_mc_pred_y_p_t;
++
++typedef struct qpu_mc_pred_y_p00_s {
++ qpu_mc_src_t next_src1;
++ uint16_t h;
++ uint16_t w;
++ uint32_t wo1;
++ qpu_mc_dst_addr_t dst_addr;
++ uint32_t next_fn;
++} qpu_mc_pred_y_p00_t;
++
++typedef struct qpu_mc_pred_y_s_s {
++ qpu_mc_src_t next_src1;
++ qpu_mc_src_t next_src2;
++ uint16_t pic_h;
++ uint16_t pic_w;
++ uint32_t stride2;
++ uint32_t stride1;
++ uint32_t wdenom;
++ uint32_t next_fn;
++} qpu_mc_pred_y_s_t;
++
++// Only useful as a structure in that it allows us to return something other than a void *
++typedef struct qpu_mc_pred_y_s {
++ union {
++ qpu_mc_pred_y_p_t p;
++ qpu_mc_pred_y_p00_t p00;
++ qpu_mc_pred_y_s_t s;
++ };
++} qpu_mc_pred_y_t;
++
++typedef union qpu_mc_pred_cmd_u {
++ qpu_mc_pred_y_t y;
++ qpu_mc_pred_c_t c;
++ uint32_t data[1];
++} qpu_mc_pred_cmd_t;
++
++#define QPU_MC_PRED_N_Y8 12
++#define QPU_MC_PRED_N_C8 12
++
++#define QPU_MC_PRED_N_Y10 12
++#define QPU_MC_PRED_N_C10 12
++
++#pragma pack(pop)
++
++#endif
++
+diff --git a/libavcodec/rpi_hevc_shader_template.c b/libavcodec/rpi_hevc_shader_template.c
+new file mode 100644
+index 0000000000..0c80cf4de0
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_template.c
+@@ -0,0 +1,62 @@
++#include "hevc.h"
++#include "rpi_hevcdec.h"
++#include "libavutil/rpi_sand_fns.h"
++#include "rpi_hevc_shader_cmd.h"
++#include "rpi_hevc_shader_template.h"
++
++typedef struct shader_track_s
++{
++ const union qpu_mc_pred_cmd_u *qpu_mc_curr;
++ const struct qpu_mc_src_s *last_l0;
++ const struct qpu_mc_src_s *last_l1;
++ uint32_t width; // pic_width * PW
++ uint32_t height;
++ uint32_t stride2;
++ uint32_t stride1;
++ uint32_t wdenom;
++} shader_track_t;
++
++static int wtoidx(const unsigned int w)
++{
++ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
++ return pel_weight[w];
++}
++
++static int fctom(const uint32_t x)
++{
++ int rv;
++ // As it happens we can take the 2nd filter term & divide it by 8
++ // (dropping fractions) to get the fractional move
++ rv = 8 - ((x >> 11) & 0xf);
++ av_assert2(rv >= 0 && rv <= 7);
++ return rv;
++}
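++// e.g. the HEVC chroma filter for fraction 3 is {-6, 46, 28, -4}: its 2nd
++// term is 46, 46 / 8 = 5 (dropping fractions), and 8 - 5 == 3 recovers the
++// fractional move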
++
++static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr)
++{
++ return (x << shl) >> shr;
++}
++
++static inline int woff_p(HEVCRpiContext *const s, int32_t x)
++{
++ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8);
++}
++
++static inline int woff_b(HEVCRpiContext *const s, int32_t x)
++{
++ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8);
++}
++
++static inline int wweight(int32_t x)
++{
++ return ext(x, 16, 16);
++}
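++// (wo layout, as the helpers above assume: the weight sits in the low
++// halfword - recovered by wweight() - and (offset * 2 + 1) in the high
++// halfword, which woff_p() / woff_b() shift back down, rescaled for bit
++// depth)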
++
++
++#define PW 1
++#include "rpi_hevc_shader_template_fn.h"
++
++#undef PW
++#define PW 2
++#include "rpi_hevc_shader_template_fn.h"
++
+diff --git a/libavcodec/rpi_hevc_shader_template.h b/libavcodec/rpi_hevc_shader_template.h
+new file mode 100644
+index 0000000000..304d73ea4a
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_template.h
+@@ -0,0 +1,22 @@
++#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H
++#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H
++
++struct HEVCRpiContext;
++struct HEVCRpiInterPredEnv;
++
++void ff_hevc_rpi_shader_c8(struct HEVCRpiContext *const s,
++ const struct HEVCRpiInterPredEnv *const ipe_y,
++ const struct HEVCRpiInterPredEnv *const ipe_c);
++
++void ff_hevc_rpi_shader_c16(struct HEVCRpiContext *const s,
++ const struct HEVCRpiInterPredEnv *const ipe_y,
++ const struct HEVCRpiInterPredEnv *const ipe_c);
++
++void rpi_sand_dump8(const char * const name,
++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
++
++void rpi_sand_dump16(const char * const name,
++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
++
++#endif
++
+diff --git a/libavcodec/rpi_hevc_shader_template_fn.h b/libavcodec/rpi_hevc_shader_template_fn.h
+new file mode 100644
+index 0000000000..b9e7c07fe3
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_template_fn.h
+@@ -0,0 +1,477 @@
++#define STRCAT(x,y) x##y
++
++#if PW == 1
++#define pixel uint8_t
++#define FUNC(f) STRCAT(f, 8)
++#elif PW == 2
++#define pixel uint16_t
++#define FUNC(f) STRCAT(f, 16)
++#else
++#error Unexpected PW
++#endif
++
++#define PATCH_STRIDE (16 * PW)
++
++static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
++{
++ for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) {
++ const pixel s = *(const pixel *)src;
++ pixel * d = (pixel *)dst;
++ for (unsigned int j = 0; j < w; j += PW) {
++ *d++ = s;
++ }
++ }
++}
++
++static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
++{
++ for (unsigned int i = 0; i != h; ++i, dst += stride) {
++ memcpy(dst, src, w);
++ }
++}
++
++static void FUNC(get_patch_y)(const shader_track_t * const st,
++ uint8_t * dst, const unsigned int dst_stride,
++ const qpu_mc_src_t *src,
++ unsigned int _w, unsigned int _h)
++{
++ int x = src->x * PW;
++ int y = src->y;
++ int w = _w * PW;
++ int h = _h;
++ int dl = 0;
++ int dr = 0;
++ int dt = 0;
++ int db = 0;
++
++ if (x < 0) {
++ if (-x >= w)
++ x = PW - w;
++ dl = -x;
++ w += x;
++ x = 0;
++ }
++ if (x + w > st->width) {
++ if (x >= st->width)
++ x = st->width - PW;
++ dr = (x + w) - st->width;
++ w = st->width - x;
++ }
++
++ // Y
++ if (y < 0) {
++ if (-y >= h)
++ y = 1 - h;
++ dt = -y;
++ h += y;
++ y = 0;
++ }
++ if (y + h > st->height) {
++ if (y >= st->height)
++ y = st->height - 1;
++ db = (y + h) - st->height;
++ h = st->height - y;
++ }
++
++ dst += dl + dt * dst_stride;
++ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
++
++ // Edge dup
++ if (dl != 0)
++ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride);
++ if (dr != 0)
++ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride);
++ w += dl + dr;
++ dst -= dl;
++
++ if (dt != 0)
++ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride);
++ if (db != 0)
++ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride);
++}
++
++
++
++static void FUNC(get_patch_c)(const shader_track_t * const st,
++ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride,
++ const qpu_mc_src_t *src,
++ unsigned int _w, unsigned int _h)
++{
++ int x = src->x * PW;
++ int y = src->y;
++ int w = _w * PW;
++ int h = _h;
++ int dl = 0;
++ int dr = 0;
++ int dt = 0;
++ int db = 0;
++ const int width = st->width;
++ const int height = st->height;
++
++ if (x < 0) {
++ if (-x >= w)
++ x = PW - w;
++ dl = -x;
++ w += x;
++ x = 0;
++ }
++ if (x + w > width) {
++ if (x >= width)
++ x = width - PW;
++ dr = (x + w) - width;
++ w = width - x;
++ }
++
++ // Y
++ if (y < 0) {
++ if (-y >= h)
++ y = 1 - h;
++ dt = -y;
++ h += y;
++ y = 0;
++ }
++ if (y + h > height) {
++ if (y >= height)
++ y = height - 1;
++ db = (y + h) - height;
++ h = height - y;
++ }
++
++ dst_u += dl + dt * dst_stride;
++ dst_v += dl + dt * dst_stride;
++ FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
++
++ // Edge dup
++ if (dl != 0)
++ {
++ FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride);
++ FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride);
++ }
++ if (dr != 0)
++ {
++ FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride);
++ FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride);
++ }
++ w += dl + dr;
++ dst_u -= dl;
++ dst_v -= dl;
++
++ if (dt != 0)
++ {
++ FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride);
++ FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride);
++ }
++ if (db != 0)
++ {
++ FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride);
++ FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride);
++ }
++}
++
++// x, y, w, h in pixels
++// stride1, stride2 in bytes
++void FUNC(rpi_sand_dump)(const char * const name,
++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c)
++{
++ const int mask = stride2 == 0 ? ~0 : stride1 - 1;
++
++ printf("%s (%d,%d) %dx%d\n", name, x, y, w, h);
++
++ if (is_c) {
++ x *= 2;
++ w *= 2;
++ }
++
++ for (int i = y; i != y + h; ++i) {
++ for (int j = x; j != x + w; ++j) {
++ const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2;
++ char sep = is_c && (j & 1) == 0 ? ':' : ' ';
++#if PW == 1
++ if (j < 0 || i < 0)
++ printf("..%c", sep);
++ else
++ printf("%02x%c", *(const pixel*)p, sep);
++#else
++ if (j < 0 || i < 0)
++ printf("...%c", sep);
++ else
++ printf("%03x%c", *(const pixel*)p, sep);
++#endif
++ }
++ printf("\n");
++ }
++}
++
++
++void FUNC(ff_hevc_rpi_shader_c)(HEVCRpiContext *const s,
++ const HEVCRpiInterPredEnv *const ipe_y,
++ const HEVCRpiInterPredEnv *const ipe_c)
++{
++ for (int c_idx = 0; c_idx < 2; ++c_idx)
++ {
++ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c;
++ shader_track_t tracka[QPU_N_MAX] = {{NULL}};
++ unsigned int exit_n = 0;
++
++ if (ipe == NULL || !ipe->used) {
++ continue;
++ }
++
++ do {
++ for (unsigned int i = 0; i != ipe->n; ++i) {
++ const HEVCRpiInterPredQ * const q = ipe->q + i;
++ shader_track_t * const st = tracka + i;
++ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr;
++
++ for (;;) {
++ const uint32_t link = (cmd == q->qpu_mc_base) ? q->code_setup : ((uint32_t *)cmd)[-1];
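++                // Every command ends with a next_fn word, so the word just
++                // before the current command names the shader fn that will
++                // consume it; the first command has no predecessor, hence
++                // the code_setup special case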
++
++ if (link == q->code_setup) {
++ if (c_idx == 0) {
++ // Luma
++ const qpu_mc_pred_y_s_t *const c = &cmd->y.s;
++
++ st->height = c->pic_h;
++ st->width = c->pic_w * PW;
++ st->stride1 = c->stride1;
++ st->stride2 = c->stride2;
++ st->wdenom = c->wdenom;
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else {
++ // Chroma
++ const qpu_mc_pred_c_s_t *const c = &cmd->c.s;
++
++ st->height = c->pic_ch;
++ st->width = c->pic_cw * PW;
++ st->stride1 = c->stride1;
++ st->stride2 = c->stride2;
++ st->wdenom = c->wdenom;
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ }
++ else if (link == s->qpu.y_pxx) {
++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
++ const int w1 = FFMIN(c->w, 8);
++ const int w2 = c->w - w1;
++
++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++
++ FUNC(get_patch_y)(st,
++ patch_y1, PATCH_STRIDE,
++ st->last_l0,
++ 16, c->h + 7);
++ if (w2 > 0) {
++ FUNC(get_patch_y)(st,
++ patch_y2, PATCH_STRIDE,
++ st->last_l1,
++ 16, c->h + 7);
++ }
++
++ // wo[offset] = offset*2+1
++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
++ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
++ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1);
++ if (w2 > 0) {
++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
++ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
++ c->h, st->wdenom, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2);
++ }
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.y_bxx) {
++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
++
++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
++
++ FUNC(get_patch_y)(st,
++ patch_y1, PATCH_STRIDE,
++ st->last_l0,
++ 16, c->h + 7);
++ FUNC(get_patch_y)(st,
++ patch_y2, PATCH_STRIDE,
++ st->last_l1,
++ 16, c->h + 7);
++
++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
++ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
++ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w);
++
++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
++ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3,
++ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2),
++ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w);
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.y_p00) {
++ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00;
++
++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++
++ FUNC(get_patch_y)(st,
++ patch_y1, PATCH_STRIDE,
++ st->last_l0,
++ 16, c->h + 7);
++
++ // wo[offset] = offset*2+1
++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0](
++ (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE,
++ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w);
++
++ st->last_l0 = &c->next_src1;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.y_b00) {
++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
++
++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
++
++ av_assert0(c->w <= 16 && c->h <= 64);
++
++ FUNC(get_patch_y)(st,
++ patch_y1, PATCH_STRIDE,
++ st->last_l0,
++ 16, c->h);
++ FUNC(get_patch_y)(st,
++ patch_y2, PATCH_STRIDE,
++ st->last_l1,
++ 16, c->h);
++
++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0](
++ patch_y3, patch_y1, PATCH_STRIDE,
++ c->h, 0, 0, c->w);
++
++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0](
++ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3,
++ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2),
++ 0, woff_b(s, c->wo2), 0, 0, c->w);
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.c_pxx) {
++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
++ const int mx = fctom(c->coeffs_x);
++ const int my = fctom(c->coeffs_y);
++
++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_u3[8 * 16 * PW];
++ uint8_t patch_v3[8 * 16 * PW];
++
++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
++
++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
++
++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++
++ st->last_l0 = &c->next_src;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.c_pxx_l1) {
++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
++ const int mx = fctom(c->coeffs_x);
++ const int my = fctom(c->coeffs_y);
++
++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_u3[8 * 16 * PW];
++ uint8_t patch_v3[8 * 16 * PW];
++
++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
++
++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
++
++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++
++ st->last_l1 = &c->next_src;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.c_bxx) {
++ const qpu_mc_pred_c_b_t *const c = &cmd->c.b;
++ const int mx1 = fctom(c->coeffs_x1);
++ const int my1 = fctom(c->coeffs_y1);
++ const int mx2 = fctom(c->coeffs_x2);
++ const int my2 = fctom(c->coeffs_y2);
++
++ uint8_t patch_u1[PATCH_STRIDE * 72];
++ uint8_t patch_v1[PATCH_STRIDE * 72];
++ uint8_t patch_u2[PATCH_STRIDE * 72];
++ uint8_t patch_v2[PATCH_STRIDE * 72];
++ uint8_t patch_u3[8 * 16 * PW];
++ uint8_t patch_v3[8 * 16 * PW];
++ uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE];
++ uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE];
++
++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
++ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
++
++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
++ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, mx1, my1, c->w);
++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
++ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, mx1, my1, c->w);
++
++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
++ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4,
++ c->h, st->wdenom, c->weight_u1, wweight(c->wo_u2),
++ 0, woff_b(s, c->wo_u2), mx2, my2, c->w);
++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
++ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4,
++ c->h, st->wdenom, c->weight_v1, wweight(c->wo_v2),
++ 0, woff_b(s, c->wo_v2), mx2, my2, c->w);
++
++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == q->code_sync) {
++ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1);
++ break;
++ }
++ else if (link == q->code_exit) {
++ // We expect exit to occur without other sync
++ av_assert0(i == exit_n);
++ ++exit_n;
++ break;
++ }
++ else {
++ av_assert0(0);
++ }
++ }
++
++ st->qpu_mc_curr = cmd;
++ }
++ } while (exit_n == 0);
++ }
++}
++
++#undef FUNC
++#undef pixel
++
+diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
+new file mode 100644
+index 0000000000..a08a1d6bef
+--- /dev/null
++++ b/libavcodec/rpi_hevc_transform.s
+@@ -0,0 +1,927 @@
++# ******************************************************************************
++# Argon Design Ltd.
++# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
++#
++# Module : HEVC
++# Author : Peter de Rivaz
++# ******************************************************************************
++
++# HEVC VPU Transform
++# Transform matrix can be thought of as
++# output row vector = input row vector * transMatrix2
++#
++# The even rows of the matrix are symmetric
++# The odd rows of the matrix are antisymmetric
++#
++# So only need to compute the first half of the results, then can compute the remainder with a butterfly
++#
++# EXAMPLE
++# (a b c d) (1 2 2 1)
++# (3 4 -4 -3)
++# (5 6 6 5)
++# (7 8 -8 -7)
++#
++# x=(a c)(1 2) = 1a+5c 2a+6c
++# (5 6)
++#
++# y=(b d)(3 4) = 3b+7d 4b+8d
++# (7 8)
++#
++# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
++# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
++#
++# Final results are (u , v[::-1])
++#
++#
++# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
++# Apply the even matrix first and stop before rounding
++# Then apply the odd matrix in a full manner:
++#
++# First step is to compute partial products with the first input (16 cycles)
++# (16x1 input coefficients produce a 16x16 grid of partial products)
++# 1a 3b 5c 7d
++# 2a 4b 6c 8d
++# 2a -4b 6c -8d
++# 1a -3b 5c -7d
++#
++# Second step is to sum partial products into final position (8 cycles)
++# 1a+3b+5c+7d
++# 2a+4b+6c+8d
++# 2a-4b+6c-8d
++# 1a-3b+5c-7d
++#
++# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
++#
++# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
++#
++# For 8x8 we could compute two in parallel.
++#
++#
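++# As a rough C sketch of the butterfly (mirroring the 4-point EXAMPLE above;
++# illustrative only, not part of the build):
++#
++#   int x0 = 1*a + 5*c, x1 = 2*a + 6*c;   /* even-row products */
++#   int y0 = 3*b + 7*d, y1 = 4*b + 8*d;   /* odd-row products  */
++#   int out[4] = { x0 + y0, x1 + y1,      /* u          */
++#                  x1 - y1, x0 - y0 };    /* v reversed */
++#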
++
++# Columns are transformed first
++#
++# Store top left half of transMatrix2 in HX(32,0)
++# Store bottom left half of transMatrix2 in HX(32,32)
++#
++# For 16x16
++# HX(0:15,0) contains input data before transform
++# HY(0:15,0) contains 32bit output data after transform
++# HX(32,0) contains even rows of left half of transMatrix2
++# HX(32,32) contains odd rows of left half of transMatrix2
++# HY(48,0) contains partial products ready for summing
++#
++
++
++# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
++# num: number of 16x16 transforms to be done
++# coeffs32: address of the 32x32 transform coefficients
++# num32: number of 32x32 transforms
++# command (r5): 0 for transform, 1 for memclear16(int16_t *dst,num16),
++#               2/3/4 for the deblock entry points, 5 to run a command list
++#
++
++.equ TRANS_SHIFT, 20 - BIT_DEPTH
++.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1)
++.equ TRANS_ASL2, 16 - TRANS_SHIFT
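++
++# As a worked example (illustrative, assuming BIT_DEPTH = 8): these come to
++# TRANS_SHIFT = 12, TRANS_RND2 = 2048 and TRANS_ASL2 = 4, so the second pass
++# adds 2048 then shifts left by 4 into the top 16 bits - a net >> 12.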
++
++
++hevc_trans_16x16:
++ cmp r5,1
++ beq memclear16
++ cmp r5,2
++ beq hevc_deblock_16x16
++ cmp r5,3
++ beq hevc_uv_deblock_16x16
++ cmp r5,4
++ beq hevc_uv_deblock_16x16_with_clear
++ cmp r5,5
++ beq hevc_run_command_list
++
++ push r6-r15, lr # TODO cut down number of used registers
++ mov r14,r3 # coeffs32
++ mov r15,r4 # num32
++ mov r3, 16*2 # Stride of transMatrix2 in bytes
++ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
++
++ add r0, 16*16*2 # For 32x32 transforms we also need this matrix
++ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
++
++ # Now use r0 to describe which VRF buffer we are working in.
++ # Allows us to prefetch the next block of coefficients for efficiency.
++ mov r0,0 # This describes the location where we read our coefficients from
++ mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
++ mov r7,16*16*2 # Total block size
++ mov r8,64*16 # Value used to swap from current to next VRF location
++ vldh HX(0++,0)+r0,(r1 += r3) REP 16
++ mov r4,64 # Constant used for rounding first pass
++ mov r5,TRANS_RND2 # Constant used for rounding second pass
++
++ # At start of block r0,r1 point to the current block (that has already been loaded)
++block_loop:
++ eor r0,r8
++ add r1,r7
++ # Prefetch the next block
++ vldh HX(0++,0)+r0,(r1 += r3) REP 16
++ eor r0,r8
++ sub r1,r7
++
++ # Transform the current block
++ bl col_trans_16
++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate
++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
++ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble?
++ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position
++
++ bl col_trans_16
++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by TRANS_SHIFT, and saturate
++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
++ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag)
++
++ # Save results - note there has been a transposition during the processing so we save columns
++ vsth VX(0,32++)+r0, (r1 += r3) REP 16
++
++ # Move onto next block
++ eor r0,r8
++ add r1,r7
++
++ addcmpbgt r2,-1,0,block_loop
++
++ # Now go and do any 32x32 transforms
++ b hevc_trans_32x32
++
++ pop r6-r15, pc
++
++# r1,r2,r3 r7,r8 should be preserved
++# HX(0++,0)+r0 is the block to be transformed
++# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients
++# Use HY(48,0) for intermediate results
++# r0 can be used, but should be returned to its original value at the end
++col_trans_16:
++ add r6,r0,16 # Final value for this loop
++col_trans_16_loop:
++ # First compute partial products for a single column
++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
++ # Then sum up the results and place back
++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
++ addcmpblt r0,1,r6,col_trans_16_loop
++ sub r0,16 # put r0 back to its original value
++ b lr
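++# (In C terms each loop iteration above transforms one 16-coefficient column:
++#  out[j] = sum over k of in[k] * M[k][j] - the row-vector * matrix form
++#  described at the top of this file. A sketch, not the exact register flow.)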
++
++col_trans_odd_16:
++ add r6,r0,16 # Final value for this loop
++col_trans_odd_16_loop:
++ # First compute partial products for a single column
++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
++ # Then sum up the results and place back
++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
++ addcmpblt r0,1,r6,col_trans_odd_16_loop
++ sub r0,16 # put r0 back to its original value
++ b lr
++
++# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
++# num: number of 32x32 transforms to be done
++#
++hevc_trans_32x32:
++ mov r1,r14 # coeffs
++ mov r2,r15 # num
++
++ # Fetch odd transform matrix
++ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
++ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
++ #add r0, 16*16*2
++ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
++
++ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
++ mov r7, 16*16*2 # Total block size
++ sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned)
++ # set r8 to 32byte aligned stack pointer
++ add r8,sp,31
++ lsr r8,5
++ lsl r8,5
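++ # (in C terms: r8 = (sp + 31) & ~31)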
++ mov r9,r8 # Backup of the temporary storage
++ mov r10,r1 # Backup of the coefficient buffer
++block_loop32:
++
++ # COLUMN TRANSFORM
++ mov r4, 64 # Constant used for rounding first pass
++ mov r5, 9 # left shift used for rounding first pass
++
++ # Transform the first 16 columns
++ mov r1,r10 # Input Coefficient buffer
++ mov r8,r9 # Output temporary storage
++ bl trans32
++ # Transform the second 16 columns
++ add r8,32*16*2
++ add r1,32
++ bl trans32
++
++ # ROW TRANSFORM
++ mov r4, TRANS_RND2 # Constant used for rounding second pass
++ mov r5, TRANS_ASL2 # left shift used for rounding second pass
++
++ mov r1,r9 # Input temporary storage
++ mov r8,r10 # Output Coefficient buffer
++ bl trans32
++ # Transform the second 16 columns
++ add r8,32*16*2
++ add r1,32
++ bl trans32
++
++ add r10, 32*32*2 # move onto next block of coefficients
++ addcmpbgt r2,-1,0,block_loop32
++
++ add sp,sp,32*32*2+32 # Restore stack
++
++ pop r6-r15, pc
++
++trans32:
++ push lr
++ # We can no longer afford the VRF space to do prefetching when doing 32x32
++ # Fetch the even rows
++ vldh HX(0++,0),(r1 += r3) REP 16
++ # Fetch the odd rows
++ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
++
++ # Transform the even rows using even matrix
++ mov r0, 0 # Even rows
++ bl col_trans_16
++
++ # Now transform the odd rows using odd matrix
++ mov r0, 64*16 # Odd rows
++ bl col_trans_odd_16
++
++ # Now apply butterfly to compute the first 16 results
++ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding,
++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down (r5 is set per pass) and saturate
++ # 16bit results now in HX(48,32)
++ mov r0,r8
++ mov r6,32*2
++ vsth VX(48,32++),(r0+=r6) REP 16
++
++ # Now apply butterfly to compute the second 16 results (in reverse order)
++ vsub HY(63,0),HY(0 ,0),HY(16,0)
++ vsub HY(62,0),HY(1 ,0),HY(17,0)
++ vsub HY(61,0),HY(2 ,0),HY(18,0)
++ vsub HY(60,0),HY(3 ,0),HY(19,0)
++ vsub HY(59,0),HY(4 ,0),HY(20,0)
++ vsub HY(58,0),HY(5 ,0),HY(21,0)
++ vsub HY(57,0),HY(6 ,0),HY(22,0)
++ vsub HY(56,0),HY(7 ,0),HY(23,0)
++ vsub HY(55,0),HY(8 ,0),HY(24,0)
++ vsub HY(54,0),HY(9 ,0),HY(25,0)
++ vsub HY(53,0),HY(10,0),HY(26,0)
++ vsub HY(52,0),HY(11,0),HY(27,0)
++ vsub HY(51,0),HY(12,0),HY(28,0)
++ vsub HY(50,0),HY(13,0),HY(29,0)
++ vsub HY(49,0),HY(14,0),HY(30,0)
++ vsub HY(48,0),HY(15,0),HY(31,0)
++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding,
++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down (r5 is set per pass) and saturate
++ add r0,r8,32
++ vsth VX(48,32++),(r0+=r6) REP 16
++ pop pc
++
++memclear16:
++ # r0 is address
++ # r1 is number of 16bits values to set to 0 (may overrun past end and clear more than specified)
++ vmov HX(0++,0),0 REP 16
++ mov r2,32
++loop:
++ vsth HX(0++,0),(r0+=r2) REP 16
++ add r0,16*16*2
++ sub r1,16*16
++ cmp r1,0
++ bgt loop
++ b lr
++
++
++################################################################################
++# HEVC VPU Deblock
++#
++# Vertical edges before horizontal
++# Decision can change every 4 pixels, but only 8 pixel boundaries are deblocked
++#
++# ARM is responsible for storing beta and tc for each 4 pixels horiz and vert edge.
++# The VPU code works in units of 16x16 blocks.
++# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time).
++# One final horizontal filter is required at the end.
++# PCM is not allowed in this code.
++#
++#
++# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering)
++# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering)
++
++.set P0,63
++.set P1,62
++.set P2,61
++.set P3,60
++.set Q0,59
++.set Q1,58
++.set Q2,57
++.set Q3,56
++
++.set dp,32
++.set dq,33
++.set d,34
++.set decision,35
++.set beta,36
++.set beta2,37
++.set beta3,38
++.set ptest,39
++.set qtest,40
++.set pqtest,41
++.set thresh,42
++.set deltatest, 44
++.set deltap1, 45
++.set tc25, 46
++.set setup,47
++.set tc,48
++.set tc25,49 # NB: overrides the earlier tc25 definition above
++.set tc2, 50
++.set do_filter, 51
++.set delta, 52
++.set tc10, 53
++.set delta0, 54
++.set delta1, 55
++.set zeros, 0
++.set setup_input, 1
++.set deltaq1, 2
++
++
++
++# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image.
++# Row has num16 16x16 blocks across
++# Beta goes from 0 to 64
++# tc goes from 0 to 24
++# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number]
++# has 8 bytes per edge
++# has 16 bytes per direction
++# has 32 bytes per 16x16 block
++# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4))
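++#
++# A hedged C view of that layout (struct names invented for illustration):
++#   typedef struct { uint8_t beta[4]; uint8_t tc[4]; } edge_setup_t; /* 8 bytes per edge */
++#   typedef edge_setup_t block_setup_t[2][2]; /* [vert|horz][1st|2nd edge] = 32 bytes */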
++hevc_deblock_16x16:
++ push r6-r15, lr
++ mov r9,r4
++ mov r4,r3
++ mov r13,r2
++ mov r2,r0
++ mov r10,r0
++ subscale4 r0,r1
++ mov r8,63
++ mov r6,-3
++ vmov H(zeros,0),0
++# r7 is number of blocks still to load
++# r0 is location of current block - 4 * stride
++# r1 is stride
++# r2 is location of current block
++# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical)
++# r4 is setup
++# r5 is for temporary calculations
++# r8 holds 63
++# r6 holds -3
++# r9 holds the number of 16 high rows to process
++# r10 holds the original img base
++# r11 returns 0 if no filtering was done on the edge
++# r12 saves a copy of this
++# r13 is copy of width
++
++process_row:
++ # First iteration does not do horizontal filtering on previous
++ mov r7, r13
++ mov r3,0
++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block
++ vldb H(16++,16)+r3,(r2 += r1) REP 16
++ vldb H(setup_input,0), (r4) # We may wish to prefetch these
++ vstb H(zeros,0),(r4)
++ bl vert_filter
++ add r3,8
++ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
++ bl vert_filter
++ sub r3,8
++ b start_deblock_loop
++deblock_loop:
++ # Middle iterations do vertical on current block and horizontal on preceding
++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block
++ vldb H(16++,16)+r3,(r2 += r1) REP 16
++ vldb H(setup_input,0), (r4)
++ vstb H(zeros,0),(r4)
++ bl vert_filter
++ add r3,8
++ vadd H(setup_input,0),H(setup_input,8),0
++ bl vert_filter
++ sub r3,8
++ vldb H(setup_input,0), -16(r4)
++ vstb H(zeros,0),-16(r4)
++ bl horz_filter
++ mov r12,r11
++ add r3,8*64
++ vadd H(setup_input,0),H(setup_input,8),0
++ bl horz_filter
++ sub r3,8*64
++ addcmpbeq r12,0,0,skip_save_top
++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
++skip_save_top:
++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
++start_deblock_loop:
++ # move onto next 16x16 (could do this with circular buffer support instead)
++ add r3,16
++ and r3,r8
++ add r4,32
++ # Perform loop counter operations (may work with an addcmpbgt as well?)
++ add r0,16
++ add r2,16
++ sub r7,1
++ cmp r7,0 # Are there still more blocks to load
++ bgt deblock_loop
++
++ # Final iteration needs to just do horizontal filtering
++ vldb H(setup_input,0), -16(r4)
++ vstb H(zeros,0),-16(r4)
++ bl horz_filter
++ mov r12,r11
++ add r3,8*64
++ vadd H(setup_input,0),H(setup_input,8),0
++ bl horz_filter
++ sub r3,64*8
++ addcmpbeq r12,0,0,skip_save_top2
++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
++skip_save_top2:
++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
++
++# Now look to see if we should do another row
++ sub r9,1
++ cmp r9,0
++ bgt start_again
++ pop r6-r15, pc
++start_again:
++ # Need to sort out r0,r2 to point to next row down
++ addscale16 r10,r1
++ mov r2,r10
++ subscale4 r0,r2,r1
++ b process_row
++
++
++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
++# So that we can reuse the code, we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step when placing them back into the correct locations
++
++vert_filter:
++ push lr
++
++ vmov HX(P3,0), V(16,12)+r3
++ vmov HX(P2,0), V(16,13)+r3
++ vmov HX(P1,0), V(16,14)+r3
++ vmov HX(P0,0), V(16,15)+r3
++ vmov HX(Q0,0), V(16,16)+r3
++ vmov HX(Q1,0), V(16,17)+r3
++ vmov HX(Q2,0), V(16,18)+r3
++ vmov HX(Q3,0), V(16,19)+r3
++
++ bl do_luma_filter
++
++ vadds V(16,13)+r3, HX(P2,0), 0
++ vadds V(16,14)+r3, HX(P1,0), 0
++ vadds V(16,15)+r3, HX(P0,0), 0
++ # P3 and Q3 never change so don't bother saving back
++ vadds V(16,16)+r3, HX(Q0,0), 0
++ vadds V(16,17)+r3, HX(Q1,0), 0
++ vadds V(16,18)+r3, HX(Q2,0), 0
++
++ pop pc
++
++# Filter edge at H(16,0)+r3
++horz_filter:
++ push lr
++
++ vmov HX(P3,0), H(12,0)+r3
++ vmov HX(P2,0), H(13,0)+r3
++ vmov HX(P1,0), H(14,0)+r3
++ vmov HX(P0,0), H(15,0)+r3
++ vmov HX(Q0,0), H(16,0)+r3
++ vmov HX(Q1,0), H(17,0)+r3
++ vmov HX(Q2,0), H(18,0)+r3
++ vmov HX(Q3,0), H(19,0)+r3
++
++ bl do_luma_filter
++
++ vadds H(13,0)+r3, HX(P2,0), 0
++ vadds H(14,0)+r3, HX(P1,0), 0
++ vadds H(15,0)+r3, HX(P0,0), 0
++ # P3 and Q3 never change so don't bother saving back
++ vadds H(16,0)+r3, HX(Q0,0), 0
++ vadds H(17,0)+r3, HX(Q1,0), 0
++ vadds H(18,0)+r3, HX(Q2,0), 0
++
++ pop pc
++
++# r4 points to array of beta/tc for each 4 length edge
++do_luma_filter:
++ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8
++ valtl HX(beta,0),H(setup,0),H(setup,0)
++ valtu HX(tc,0),H(setup,0),H(setup,0)
++ vmul HX(tc25,0), HX(tc,0), 5
++ vadd HX(tc25,0),HX(tc25,0), 1
++ vasr HX(tc25,0), HX(tc25,0), 1
++
++ # Compute decision
++ vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1
++ vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1
++ vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0
++ vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0
++
++ vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1
++ vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1
++ vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0
++ vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0
++
++ vadd HX(d,0), HX(dp,0), HX(dq,0)
++ vasr HX(beta2,0),HX(beta,0),2
++ vasr HX(beta3,0),HX(beta,0),3
++
++ # Compute flags that are negative if all conditions pass
++ vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC
++ vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC
++ vsub HX(decision,0), HX(decision,0), HX(beta3,0) SETF
++
++ vdist HX(decision,0), HX(P0,0), HX(Q0,0) IFN
++ vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF
++ vadd HX(decision,0), HX(d,0), HX(d,0) IFN
++ vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF
++ vmov HX(decision,0), 1 IFNN
++ vadd H(decision,0),H(decision,3),0 IFN
++ vadd H(decision,16),H(decision,19),0 IFN
++ vmov -,HX(decision,0) SETF # N marks strong filter
++ vmov HX(decision,0), 1 IFNN # NN marks normal filter
++
++ vadd HX(do_filter,0), HX(d,3), HX(d,0)
++ vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter
++ vmov HX(decision,0),0 IFNN # Z marks no filter
++
++ # Expand out decision (currently valid once every 4 pixels) 0...1...2...3
++ # First extract out even terms
++ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0.1.2.3
++ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0123
++ # Now expand back
++ valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233
++ valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333
++
++ # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering
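++ #
++ # Roughly, in C (a sketch of the standard HEVC luma decision the flags above
++ # implement; the pairing of 4-pixel segments is simplified here):
++ #   dp = abs(P2 - 2*P1 + P0);  dq = abs(Q2 - 2*Q1 + Q0);  d = dp + dq;
++ #   if (d0 + d3 >= beta)                          decision = 0;  /* none */
++ #   else if (abs(P3-P0) + abs(Q3-Q0) < (beta>>3) &&
++ #            abs(P0-Q0) < (5*tc + 1)/2 &&
++ #            2*d < (beta>>2))                      decision = -1; /* strong */
++ #   else                                           decision = 1;  /* normal */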
++
++ # Do a quick check to see if there is anything to do
++ mov r11, 0 # Signal no filtering
++ vmov -,1 IFNZ SUMS r5
++ cmp r5,0
++ beq filtering_done
++ mov r11, 1 # Signal some filtering
++ # And whether there is any strong filtering
++ vmov -,1 IFN SUMS r5
++ cmp r5,0
++ beq normal_filtering
++
++ ##############################################################################
++ # Strong filtering - could maybe add a fast case if all flags have the same sign? (especially if all are disabled!)
++ vshl HX(tc2,0), HX(tc,0), 1 # Note that in normal filtering tc2 is tc/2, while here it is tc*2
++
++ # Take a copy of the original pixels for use in decision calculation
++ vmov HX(P0,32),HX(P0,0)
++ vmov HX(Q0,32),HX(Q0,0)
++ vmov HX(P1,32),HX(P1,0)
++ vmov HX(Q1,32),HX(Q1,0)
++ vmov HX(P2,32),HX(P2,0)
++ vmov HX(Q2,32),HX(Q2,0)
++
++ vadd -,HX(P2,32),4 CLRA SACC
++ vshl -,HX(P1,32),1 SACC
++ vshl -,HX(P0,32),1 SACC
++ vshl -,HX(Q0,32),1 SACC
++ vshl HX(delta,0),HX(Q1,32),0 SACC
++ vasr HX(delta,0),HX(delta,0), 3
++ vsub HX(delta,0),HX(delta,0),HX(P0,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN
++
++ vadd -,HX(P2,32),2 CLRA SACC
++ vadd -,HX(P1,32),HX(P0,32) SACC
++ vshl HX(delta,0),HX(Q0,32),0 SACC
++ vasr HX(delta,0),HX(delta,0), 2
++ vsub HX(delta,0),HX(delta,0),HX(P1,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN
++
++ vadd -,HX(Q0,32),4 CLRA SACC
++ vadd -,HX(P1,32),HX(P0,32) SACC
++ vmul -,HX(P2,32),3 SACC
++ vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct
++ vasr HX(delta,0),HX(delta,0), 3
++ vsub HX(delta,0),HX(delta,0),HX(P2,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN
++ #vmov HX(P2,0),3 IFN
++
++ # Now reverse all P/Qs
++
++ vadd -,HX(Q2,32),4 CLRA SACC
++ vshl -,HX(Q1,32),1 SACC
++ vshl -,HX(Q0,32),1 SACC
++ vshl -,HX(P0,32),1 SACC
++ vshl HX(delta,0),HX(P1,32),0 SACC
++ vasr HX(delta,0),HX(delta,0), 3
++ vsub HX(delta,0),HX(delta,0),HX(Q0,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN
++
++ vadd -,HX(Q2,32),2 CLRA SACC
++ vadd -,HX(Q1,32),HX(Q0,32) SACC
++ vshl HX(delta,0),HX(P0,32),0 SACC
++ vasr HX(delta,0),HX(delta,0), 2
++ vsub HX(delta,0),HX(delta,0),HX(Q1,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN
++
++ vadd -,HX(P0,32),4 CLRA SACC
++ vadd -,HX(Q1,32),HX(Q0,32) SACC
++ vmul -,HX(Q2,32),3 SACC
++ vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct
++ vasr HX(delta,0),HX(delta,0), 3
++ vsub HX(delta,0),HX(delta,0),HX(Q2,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN
++
++ ##############################################################################
++ # Normal filtering
++normal_filtering:
++ # Invert the decision flags
++ # the instruction is made more complicated than needed as the assembler has a bug and otherwise loses the SETF
++ vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering
++ vmov -, HX(tc10,0) SETF # IFN means normal filtering
++
++ vmov -,1 IFN SUMS r5
++ cmp r5,0
++ beq filtering_done
++
++ vasr HX(tc2,0), HX(tc,0), 1
++ vmul HX(tc10,0), HX(tc,0), 10
++
++ vasr HX(thresh,0), HX(beta,0), 1
++ vadd HX(thresh,0), HX(thresh,0), HX(beta,0)
++ vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC
++
++ vadd HX(ptest,0),HX(dp,3),HX(dp,0)
++ vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel
++ vadd HX(qtest,0),HX(dq,3),HX(dq,0)
++ vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel
++ # Expand ptest and qtest together
++ vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0) # p.p.p.p.q.q.q.q
++ vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........
++ valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq
++ valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0)
++ valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0)
++
++ vsub HX(delta0,0), HX(Q0,0), HX(P0,0)
++ vsub HX(delta1,0), HX(Q1,0), HX(P1,0)
++ vmov -,8 CLRA SACC
++ vmul -,HX(delta0,0), 9 SACC
++ vmul HX(delta0,0),HX(delta1,0), r6 SACC
++ vasr HX(delta0,0), HX(delta0,0), 4
++ vdist HX(deltatest,0), HX(delta0,0), 0
++ vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something
++ vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later
++
++ vclamps HX(delta0,0), HX(delta0,0), HX(tc,0)
++
++ vadd HX(deltap1,0), HX(P2,0), HX(P0,0)
++ vadd HX(deltap1,0), HX(deltap1,0), 1
++ vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC
++ vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC
++ vasr HX(deltap1,0), HX(deltap1,0), 1
++ vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0)
++
++ vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0)
++ vadd HX(deltaq1,0), HX(deltaq1,0), 1
++ vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC
++ vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0)
++ vrsub -, HX(delta0,0), 0 SACC
++ vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC
++ vasr HX(deltaq1,0), HX(deltaq1,0), 1
++ vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0)
++
++ vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN
++ vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN
++
++ vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1
++ vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN
++
++ vmov -,HX(deltatest,0) SETF
++ vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1
++ vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN
++
++ #vmov HX(P2,0),1 IFN
++
++filtering_done:
++ b lr
++
++
++hevc_uv_deblock_16x16:
++ push r6-r15, lr
++ mov r14,0
++ b hevc_uv_start
++hevc_uv_deblock_16x16_with_clear:
++ push r6-r15, lr
++ mov r14,1
++ b hevc_uv_start
++
++hevc_uv_start:
++ mov r9,r4
++ mov r4,r3
++ mov r13,r2
++ mov r2,r0
++ mov r10,r0
++ subscale4 r0,r1
++ mov r8,63
++ mov r6,-3
++ vmov H(zeros,0),0
++# r7 is number of blocks still to load
++# r0 is location of current block - 4 * stride
++# r1 is stride
++# r2 is location of current block
++# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical)
++# r4 is setup
++# r5 is for temporary calculations
++# r8 holds 63
++# r6 holds -3
++# r9 holds the number of 16 high rows to process
++# r10 holds the original img base
++# r11 returns 0 if no filtering was done on the edge
++# r12 saves a copy of this
++# r13 is copy of width
++# r14 is 1 if we should clear the old contents, or 0 if not
++
++uv_process_row:
++ # First iteration does not do horizontal filtering on previous
++ mov r7, r13
++ mov r3,0
++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block
++ vldb H(16++,16)+r3,(r2 += r1) REP 16
++ vldb H(setup_input,0), (r4) # We may wish to prefetch these
++ cmp r14,1
++ bne uv_skip0
++ vadd H(setup_input,0),H(setup_input,4),0 # Rotate by 4 to access V strengths
++ vstb H(zeros,0),(r4)
++uv_skip0:
++ bl uv_vert_filter
++ add r3,8
++ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
++ bl uv_vert_filter
++ sub r3,8
++ b uv_start_deblock_loop
++uv_deblock_loop:
++ # Middle iterations do vertical on current block and horizontal on preceding
++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block
++ vldb H(16++,16)+r3,(r2 += r1) REP 16
++ vldb H(setup_input,0), (r4)
++ cmp r14,1
++ bne uv_skip1
++ vadd H(setup_input,0),H(setup_input,4),0 # Rotate by 4 to access V strengths
++ vstb H(zeros,0),(r4)
++uv_skip1:
++ bl uv_vert_filter
++ add r3,8
++ vadd H(setup_input,0),H(setup_input,8),0
++ bl uv_vert_filter
++ sub r3,8
++ vldb H(setup_input,0), -16(r4)
++ cmp r14,1
++ bne uv_skip3
++ vadd H(setup_input,0),H(setup_input,4),0 # Rotate by 4 to access V strengths
++ vstb H(zeros,0),-16(r4)
++uv_skip3:
++ bl uv_horz_filter
++ mov r12,r11
++ add r3,8*64
++ vadd H(setup_input,0),H(setup_input,8),0
++ bl uv_horz_filter
++ sub r3,8*64
++ addcmpbeq r12,0,0,uv_skip_save_top
++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
++uv_skip_save_top:
++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
++uv_start_deblock_loop:
++ # move onto next 16x16 (could do this with circular buffer support instead)
++ add r3,16
++ and r3,r8
++ add r4,32
++ # Perform loop counter operations (may work with an addcmpbgt as well?)
++ add r0,16
++ add r2,16
++ sub r7,1
++ cmp r7,0 # Are there still more blocks to load
++ bgt uv_deblock_loop
++
++ # Final iteration needs to just do horizontal filtering
++ vldb H(setup_input,0), -16(r4)
++ cmp r14,1
++ bne uv_skip2
++ vadd H(setup_input,0),H(setup_input,4),0 # Rotate by 4 to access V strengths
++ vstb H(zeros,0),-16(r4)
++uv_skip2:
++ bl uv_horz_filter
++ mov r12,r11
++ add r3,8*64
++ vadd H(setup_input,0),H(setup_input,8),0
++ bl uv_horz_filter
++ sub r3,64*8
++ addcmpbeq r12,0,0,uv_skip_save_top2
++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
++uv_skip_save_top2:
++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
++
++# Now look to see if we should do another row
++ sub r9,1
++ cmp r9,0
++ bgt uv_start_again
++ pop r6-r15, pc
++uv_start_again:
++ # Need to sort out r0,r2 to point to next row down
++ addscale16 r10,r1
++ mov r2,r10
++ subscale4 r0,r2,r1
++ b uv_process_row
++
++
++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
++# So that we can reuse the code, we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step when placing them back into the correct locations
++
++uv_vert_filter:
++ push lr
++
++ vmov HX(P1,0), V(16,14)+r3
++ vmov HX(P0,0), V(16,15)+r3
++ vmov HX(Q0,0), V(16,16)+r3
++ vmov HX(Q1,0), V(16,17)+r3
++
++ bl do_chroma_filter
++
++ vadds V(16,15)+r3, HX(P0,0), 0
++ vadds V(16,16)+r3, HX(Q0,0), 0
++
++ pop pc
++
++# Filter edge at H(16,0)+r3
++uv_horz_filter:
++ push lr
++
++ vmov HX(P1,0), H(14,0)+r3
++ vmov HX(P0,0), H(15,0)+r3
++ vmov HX(Q0,0), H(16,0)+r3
++ vmov HX(Q1,0), H(17,0)+r3
++
++ bl do_chroma_filter
++
++ vadds H(15,0)+r3, HX(P0,0), 0
++ # P3 and Q3 never change so don't bother saving back
++ vadds H(16,0)+r3, HX(Q0,0), 0
++
++ pop pc
++
++# r4 points to array of beta/tc for each 4 length edge
++do_chroma_filter:
++ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8
++ valtl HX(tc,0),H(setup,0),H(setup,0)
++
++ vsub HX(delta,0),HX(Q0,0),HX(P0,0)
++ vshl HX(delta,0),HX(delta,0),2 CLRA SACC
++ vsub -,HX(P1,0),HX(Q1,0) SACC
++ vmov HX(delta,0),4 SACC
++ vasr HX(delta,0),HX(delta,0),3
++ vclamps HX(delta,0), HX(delta,0), HX(tc,0)
++ vadd HX(P0,0),HX(P0,0),HX(delta,0)
++ vsub HX(Q0,0),HX(Q0,0),HX(delta,0)
++ b lr
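++# (In C this is the standard HEVC chroma filter, as a sketch:
++#    delta = clip(((Q0 - P0) * 4 + P1 - Q1 + 4) >> 3, -tc, tc);
++#    P0 += delta;  Q0 -= delta;  )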
++
++# r0 = list
++# r1 = number
++hevc_run_command_list:
++ push r6-r7, lr
++ mov r6, r0
++ mov r7, r1
++loop_cmds:
++ ld r0,(r6) # How to encode r6++?
++ add r6,4
++ ld r1,(r6)
++ add r6,4
++ ld r2,(r6)
++ add r6,4
++ ld r3,(r6)
++ add r6,4
++ ld r4,(r6)
++ add r6,4
++ ld r5,(r6)
++ add r6,4
++ bl hevc_trans_16x16
++ sub r7,1
++ cmp r7,0
++ bgt loop_cmds
++
++ pop r6-r7, pc
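++
++# A hedged C-side view of the list consumed above (names invented):
++#   struct vpu_cmd { uint32_t arg[6]; };  /* loaded into r0..r5 per command */
++# Each record is dispatched through hevc_trans_16x16, so arg[5] (r5) selects
++# the operation: 0 transform, 1 memclear16, 2-4 deblock variants, 5 nested list.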
+diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h
+new file mode 100644
+index 0000000000..ee4e357f38
+--- /dev/null
++++ b/libavcodec/rpi_hevc_transform10.h
+@@ -0,0 +1,3110 @@
++static const unsigned char rpi_hevc_transform10 [] = {
++21,
++106,
++0,
++144,
++47,
++1,
++37,
++106,
++0,
++144,
++66,
++1,
++53,
++106,
++0,
++144,
++192,
++4,
++69,
++106,
++0,
++144,
++192,
++4,
++85,
++106,
++0,
++144,
++240,
++5,
++169,
++3,
++62,
++64,
++79,
++64,
++3,
++232,
++32,
++0,
++0,
++0,
++12,
++248,
++0,
++136,
++0,
++0,
++192,
++248,
++0,
++0,
++64,
++232,
++0,
++2,
++0,
++0,
++12,
++248,
++0,
++168,
++0,
++0,
++192,
++248,
++0,
++0,
++0,
++96,
++3,
++232,
++32,
++0,
++0,
++0,
++7,
++232,
++0,
++2,
++0,
++0,
++8,
++232,
++0,
++4,
++0,
++0,
++12,
++248,
++0,
++128,
++0,
++0,
++192,
++8,
++4,
++0,
++4,
++232,
++64,
++0,
++0,
++0,
++5,
++232,
++0,
++2,
++0,
++0,
++128,
++69,
++113,
++66,
++12,
++248,
++0,
++128,
++0,
++0,
++192,
++8,
++4,
++0,
++128,
++69,
++113,
++70,
++128,
++144,
++40,
++0,
++4,
++255,
++48,
++192,
++128,
++3,
++32,
++8,
++16,
++0,
++76,
++254,
++48,
++192,
++9,
++4,
++32,
++8,
++0,
++0,
++4,
++254,
++0,
++144,
++128,
++2,
++0,
++8,
++2,
++0,
++128,
++144,
++23,
++0,
++4,
++255,
++48,
++192,
++128,
++3,
++32,
++8,
++20,
++0,
++76,
++254,
++48,
++192,
++6,
++4,
++32,
++8,
++0,
++0,
++140,
++248,
++44,
++0,
++0,
++0,
++32,
++48,
++4,
++0,
++128,
++69,
++113,
++66,
++242,
++140,
++211,
++192,
++34,
++31,
++41,
++3,
++70,
++192,
++80,
++7,
++164,
++255,
++36,
++204,
++96,
++2,
++0,
++248,
++62,
++0,
++3,
++255,
++55,
++208,
++120,
++3,
++224,
++3,
++190,
++11,
++16,
++139,
++246,
++91,
++0,
++103,
++90,
++0,
++70,
++192,
++80,
++7,
++164,
++255,
++36,
++204,
++224,
++2,
++0,
++248,
++62,
++0,
++3,
++255,
++55,
++208,
++120,
++3,
++224,
++3,
++190,
++11,
++16,
++139,
++246,
++91,
++0,
++103,
++90,
++0,
++225,
++64,
++242,
++64,
++3,
++232,
++128,
++0,
++0,
++0,
++7,
++232,
++0,
++2,
++0,
++0,
++57,
++239,
++224,
++247,
++255,
++255,
++72,
++192,
++95,
++207,
++88,
++122,
++88,
++124,
++137,
++64,
++26,
++64,
++4,
++232,
++64,
++0,
++0,
++0,
++149,
++96,
++161,
++64,
++152,
++64,
++128,
++144,
++35,
++0,
++72,
++232,
++0,
++4,
++0,
++0,
++65,
++232,
++32,
++0,
++0,
++0,
++128,
++144,
++27,
++0,
++4,
++232,
++0,
++2,
++0,
++0,
++101,
++96,
++145,
++64,
++168,
++64,
++128,
++144,
++19,
++0,
++72,
++232,
++0,
++4,
++0,
++0,
++65,
++232,
++32,
++0,
++0,
++0,
++128,
++144,
++11,
++0,
++74,
++232,
++0,
++8,
++0,
++0,
++242,
++140,
++221,
++192,
++57,
++239,
++32,
++8,
++0,
++0,
++41,
++3,
++239,
++3,
++12,
++248,
++0,
++128,
++0,
++0,
++192,
++248,
++4,
++0,
++12,
++248,
++0,
++132,
++64,
++0,
++192,
++248,
++4,
++0,
++0,
++96,
++255,
++159,
++154,
++255,
++0,
++232,
++0,
++4,
++0,
++0,
++255,
++159,
++165,
++255,
++4,
++255,
++48,
++204,
++16,
++3,
++224,
++251,
++62,
++0,
++4,
++255,
++51,
++204,
++128,
++3,
++224,
++251,
++16,
++0,
++76,
++254,
++51,
++204,
++128,
++3,
++224,
++251,
++20,
++0,
++128,
++64,
++6,
++232,
++64,
++0,
++0,
++0,
++140,
++248,
++47,
++0,
++0,
++0,
++224,
++99,
++0,
++0,
++32,
++247,
++240,
++207,
++16,
++3,
++32,
++247,
++176,
++207,
++17,
++19,
++32,
++247,
++112,
++207,
++18,
++35,
++32,
++247,
++48,
++207,
++19,
++51,
++32,
++247,
++240,
++206,
++20,
++67,
++32,
++247,
++176,
++206,
++21,
++83,
++32,
++247,
++112,
++206,
++22,
++99,
++32,
++247,
++48,
++206,
++23,
++115,
++32,
++247,
++240,
++205,
++24,
++131,
++32,
++247,
++176,
++205,
++25,
++147,
++32,
++247,
++112,
++205,
++26,
++163,
++32,
++247,
++48,
++205,
++27,
++179,
++32,
++247,
++240,
++204,
++28,
++195,
++32,
++247,
++176,
++204,
++29,
++211,
++32,
++247,
++112,
++204,
++30,
++227,
++32,
++247,
++48,
++204,
++31,
++243,
++4,
++255,
++51,
++204,
++128,
++3,
++224,
++251,
++16,
++0,
++76,
++254,
++51,
++204,
++128,
++3,
++224,
++251,
++20,
++0,
++0,
++237,
++32,
++0,
++0,
++0,
++140,
++248,
++47,
++0,
++0,
++0,
++224,
++99,
++0,
++0,
++111,
++3,
++4,
++254,
++0,
++128,
++0,
++4,
++0,
++248,
++0,
++0,
++2,
++232,
++32,
++0,
++0,
++0,
++140,
++248,
++32,
++0,
++0,
++0,
++224,
++35,
++0,
++0,
++64,
++232,
++0,
++2,
++0,
++0,
++193,
++232,
++0,
++1,
++0,
++0,
++1,
++106,
++116,
++30,
++90,
++0,
++169,
++3,
++73,
++64,
++52,
++64,
++45,
++64,
++2,
++64,
++10,
++64,
++64,
++198,
++1,
++7,
++8,
++232,
++63,
++0,
++0,
++0,
++6,
++232,
++253,
++255,
++255,
++255,
++0,
++246,
++0,
++0,
++0,
++4,
++215,
++64,
++3,
++96,
++2,
++248,
++0,
++35,
++0,
++0,
++64,
++56,
++0,
++0,
++4,
++248,
++0,
++36,
++0,
++0,
++64,
++56,
++8,
++0,
++0,
++240,
++64,
++0,
++132,
++3,
++128,
++240,
++0,
++0,
++132,
++3,
++128,
++144,
++137,
++0,
++131,
++98,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++129,
++0,
++131,
++102,
++0,
++158,
++67,
++0,
++2,
++248,
++0,
++35,
++0,
++0,
++64,
++56,
++0,
++0,
++4,
++248,
++0,
++36,
++0,
++0,
++64,
++56,
++8,
++0,
++0,
++240,
++64,
++0,
++132,
++3,
++128,
++240,
++0,
++0,
++132,
++3,
++128,
++144,
++108,
++0,
++131,
++98,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++100,
++0,
++131,
++102,
++0,
++248,
++64,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++248,
++0,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++144,
++161,
++0,
++188,
++64,
++67,
++232,
++0,
++2,
++0,
++0,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++150,
++0,
++195,
++232,
++0,
++2,
++0,
++0,
++12,
++128,
++7,
++192,
++130,
++248,
++0,
++0,
++112,
++192,
++224,
++16,
++195,
++31,
++132,
++248,
++1,
++0,
++112,
++0,
++224,
++16,
++203,
++31,
++3,
++99,
++131,
++71,
++68,
++232,
++32,
++0,
++0,
++0,
++0,
++99,
++2,
++99,
++23,
++102,
++7,
++106,
++127,
++156,
++182,
++255,
++0,
++248,
++64,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++248,
++0,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++144,
++112,
++0,
++188,
++64,
++67,
++232,
++0,
++2,
++0,
++0,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++101,
++0,
++195,
++232,
++0,
++2,
++0,
++0,
++12,
++128,
++7,
++192,
++130,
++248,
++0,
++0,
++112,
++192,
++224,
++16,
++195,
++31,
++132,
++248,
++1,
++0,
++112,
++0,
++224,
++16,
++203,
++31,
++25,
++102,
++9,
++106,
++2,
++30,
++41,
++3,
++26,
++87,
++162,
++64,
++64,
++198,
++1,
++23,
++127,
++158,
++103,
++255,
++239,
++3,
++0,
++254,
++0,
++143,
++92,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++64,
++143,
++93,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++143,
++94,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++143,
++95,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++142,
++208,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++142,
++209,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++64,
++142,
++210,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++0,
++142,
++211,
++0,
++0,
++240,
++12,
++0,
++128,
++144,
++107,
++0,
++8,
++255,
++99,
++23,
++0,
++212,
++192,
++51,
++0,
++0,
++8,
++255,
++163,
++23,
++0,
++228,
++192,
++51,
++0,
++0,
++8,
++255,
++227,
++23,
++0,
++244,
++192,
++51,
++0,
++0,
++8,
++255,
++35,
++52,
++0,
++180,
++192,
++51,
++0,
++0,
++8,
++255,
++99,
++52,
++0,
++164,
++192,
++51,
++0,
++0,
++8,
++255,
++163,
++52,
++0,
++148,
++192,
++51,
++0,
++0,
++111,
++3,
++239,
++3,
++0,
++254,
++0,
++143,
++12,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++64,
++143,
++13,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++143,
++14,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++143,
++15,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++142,
++16,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++142,
++17,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++64,
++142,
++18,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++0,
++142,
++19,
++0,
++0,
++240,
++12,
++0,
++128,
++144,
++33,
++0,
++8,
++255,
++99,
++3,
++0,
++212,
++192,
++51,
++0,
++0,
++8,
++255,
++163,
++3,
++0,
++228,
++192,
++51,
++0,
++0,
++8,
++255,
++227,
++3,
++0,
++244,
++192,
++51,
++0,
++0,
++8,
++255,
++35,
++4,
++0,
++180,
++192,
++51,
++0,
++0,
++8,
++255,
++99,
++4,
++0,
++164,
++192,
++51,
++0,
++0,
++8,
++255,
++163,
++4,
++0,
++148,
++192,
++51,
++0,
++0,
++111,
++3,
++32,
++246,
++192,
++11,
++1,
++16,
++32,
++246,
++2,
++137,
++47,
++240,
++40,
++246,
++2,
++140,
++47,
++240,
++128,
++245,
++99,
++140,
++5,
++4,
++0,
++247,
++99,
++140,
++1,
++20,
++88,
++246,
++99,
++140,
++1,
++20,
++0,
++247,
++35,
++136,
++62,
++226,
++32,
++247,
++35,
++136,
++32,
++210,
++0,
++247,
++34,
++136,
++63,
++2,
++208,
++246,
++34,
++136,
++0,
++4,
++0,
++247,
++99,
++136,
++58,
++162,
++32,
++247,
++99,
++136,
++33,
++146,
++0,
++247,
++98,
++136,
++59,
++18,
++208,
++246,
++98,
++136,
++0,
++20,
++0,
++247,
++162,
++136,
++33,
++2,
++88,
++246,
++98,
++137,
++2,
++68,
++88,
++246,
++162,
++137,
++3,
++68,
++208,
++254,
++227,
++136,
++60,
++242,
++192,
++243,
++188,
++11,
++208,
++254,
++227,
++136,
++56,
++178,
++192,
++243,
++188,
++10,
++32,
++255,
++226,
++136,
++38,
++58,
++192,
++243,
++60,
++0,
++208,
++254,
++227,
++136,
++59,
++242,
++192,
++243,
++60,
++128,
++32,
++255,
++226,
++136,
++49,
++58,
++192,
++243,
++60,
++128,
++0,
++255,
++226,
++136,
++34,
++34,
++192,
++243,
++60,
++128,
++32,
++255,
++226,
++136,
++37,
++58,
++192,
++243,
++60,
++128,
++0,
++254,
++192,
++136,
++1,
++4,
++0,
++240,
++0,
++160,
++0,
++255,
++194,
++8,
++0,
++52,
++195,
++243,
++0,
++128,
++0,
++255,
++202,
++40,
++0,
++52,
++195,
++243,
++0,
++128,
++0,
++254,
++0,
++240,
++35,
++10,
++0,
++240,
++60,
++0,
++0,
++254,
++192,
++136,
++1,
++4,
++0,
++240,
++0,
++160,
++0,
++255,
++226,
++140,
++34,
++34,
++195,
++243,
++60,
++0,
++32,
++255,
++227,
++140,
++36,
++58,
++192,
++243,
++60,
++0,
++0,
++254,
++192,
++136,
++0,
++4,
++0,
++240,
++0,
++160,
++16,
++246,
++226,
++136,
++35,
++50,
++16,
++246,
++226,
++136,
++35,
++50,
++32,
++246,
++226,
++136,
++35,
++50,
++32,
++254,
++226,
++136,
++35,
++58,
++192,
++243,
++60,
++0,
++11,
++96,
++0,
++254,
++0,
++240,
++1,
++4,
++0,
++240,
++64,
++115,
++5,
++106,
++0,
++144,
++173,
++1,
++27,
++96,
++0,
++254,
++0,
++240,
++1,
++4,
++0,
++240,
++64,
++147,
++5,
++106,
++0,
++144,
++227,
++0,
++64,
++246,
++163,
++140,
++1,
++4,
++0,
++246,
++192,
++175,
++63,
++2,
++0,
++246,
++192,
++174,
++59,
++2,
++0,
++246,
++128,
++175,
++62,
++2,
++0,
++246,
++128,
++174,
++58,
++2,
++0,
++246,
++64,
++175,
++61,
++2,
++0,
++246,
++64,
++174,
++57,
++2,
++0,
++255,
++43,
++240,
++4,
++212,
++192,
++243,
++128,
++11,
++64,
++254,
++43,
++240,
++1,
++228,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++240,
++1,
++244,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++240,
++1,
++180,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++141,
++0,
++164,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++3,
++68,
++32,
++247,
++35,
++141,
++191,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++235,
++143,
++52,
++242,
++192,
++243,
++60,
++128,
++0,
++255,
++43,
++240,
++2,
++212,
++192,
++243,
++128,
++11,
++0,
++255,
++43,
++240,
++191,
++226,
++192,
++243,
++188,
++10,
++64,
++254,
++43,
++141,
++0,
++180,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++2,
++68,
++32,
++247,
++35,
++141,
++190,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++171,
++143,
++52,
++226,
++192,
++243,
++60,
++128,
++0,
++255,
++43,
++240,
++4,
++180,
++192,
++243,
++128,
++11,
++0,
++255,
++43,
++240,
++191,
++226,
++192,
++243,
++188,
++10,
++128,
++253,
++43,
++240,
++3,
++212,
++192,
++243,
++128,
++10,
++64,
++254,
++35,
++141,
++1,
++196,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++3,
++68,
++32,
++247,
++35,
++141,
++189,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++107,
++143,
++52,
++210,
++192,
++243,
++60,
++128,
++0,
++255,
++43,
++240,
++4,
++148,
++192,
++243,
++128,
++11,
++64,
++254,
++43,
++240,
++1,
++164,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++240,
++1,
++180,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++240,
++1,
++244,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++141,
++0,
++228,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++3,
++68,
++32,
++247,
++35,
++141,
++187,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++235,
++142,
++52,
++178,
++192,
++243,
++60,
++128,
++0,
++255,
++43,
++240,
++2,
++148,
++192,
++243,
++128,
++11,
++0,
++255,
++43,
++240,
++187,
++162,
++192,
++243,
++188,
++10,
++64,
++254,
++43,
++141,
++0,
++244,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++2,
++68,
++32,
++247,
++35,
++141,
++186,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++171,
++142,
++52,
++162,
++192,
++243,
++60,
++128,
++0,
++255,
++43,
++240,
++4,
++244,
++192,
++243,
++128,
++11,
++0,
++255,
++43,
++240,
++187,
++162,
++192,
++243,
++188,
++10,
++128,
++253,
++43,
++240,
++3,
++148,
++192,
++243,
++128,
++10,
++64,
++254,
++35,
++141,
++1,
++132,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++3,
++68,
++32,
++247,
++35,
++141,
++185,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++107,
++142,
++52,
++146,
++192,
++243,
++60,
++128,
++64,
++255,
++98,
++141,
++0,
++52,
++192,
++243,
++0,
++0,
++0,
++254,
++0,
++240,
++53,
++10,
++0,
++240,
++60,
++0,
++0,
++254,
++0,
++240,
++1,
++4,
++0,
++240,
++64,
++147,
++5,
++106,
++0,
++144,
++177,
++0,
++88,
++246,
++163,
++140,
++1,
++4,
++128,
++245,
++99,
++141,
++10,
++4,
++88,
++246,
++162,
++138,
++1,
++68,
++0,
++247,
++162,
++138,
++36,
++162,
++88,
++254,
++162,
++138,
++3,
++164,
++192,
++243,
++128,
++11,
++0,
++255,
++226,
++137,
++32,
++2,
++195,
++243,
++60,
++0,
++32,
++247,
++226,
++137,
++42,
++114,
++0,
++255,
++34,
++138,
++33,
++18,
++195,
++243,
++60,
++0,
++32,
++247,
++34,
++138,
++42,
++130,
++16,
++246,
++98,
++138,
++40,
++114,
++16,
++246,
++98,
++138,
++41,
++146,
++32,
++246,
++98,
++138,
++41,
++146,
++32,
++246,
++226,
++137,
++41,
++146,
++40,
++246,
++34,
++138,
++41,
++146,
++32,
++247,
++163,
++141,
++63,
++178,
++32,
++247,
++227,
++141,
++62,
++162,
++0,
++254,
++0,
++240,
++8,
++4,
++0,
++240,
++128,
++11,
++128,
++253,
++35,
++240,
++9,
++100,
++192,
++243,
++128,
++10,
++128,
++253,
++163,
++141,
++128,
++115,
++192,
++243,
++152,
++10,
++88,
++246,
++163,
++141,
++4,
++100,
++208,
++246,
++35,
++139,
++0,
++100,
++32,
++255,
++34,
++139,
++53,
++202,
++192,
++243,
++60,
++128,
++0,
++254,
++0,
++139,
++0,
++4,
++0,
++240,
++0,
++160,
++240,
++246,
++163,
++141,
++48,
++98,
++0,
++247,
++99,
++139,
++63,
++210,
++0,
++247,
++98,
++139,
++1,
++212,
++88,
++254,
++98,
++139,
++1,
++212,
++192,
++243,
++128,
++11,
++32,
++255,
++99,
++139,
++62,
++98,
++192,
++243,
++188,
++10,
++88,
++246,
++98,
++139,
++1,
++212,
++240,
++246,
++98,
++139,
++50,
++210,
++0,
++247,
++163,
++128,
++59,
++146,
++0,
++247,
++160,
++128,
++1,
++36,
++88,
++254,
++160,
++128,
++1,
++36,
++192,
++243,
++128,
++11,
++0,
++247,
++163,
++128,
++58,
++98,
++64,
++255,
++35,
++240,
++0,
++100,
++192,
++243,
++128,
++10,
++64,
++255,
++163,
++128,
++0,
++164,
++192,
++243,
++128,
++10,
++88,
++246,
++160,
++128,
++1,
++36,
++240,
++246,
++160,
++128,
++50,
++34,
++8,
++255,
++227,
++143,
++54,
++242,
++192,
++243,
++60,
++128,
++40,
++255,
++227,
++142,
++54,
++178,
++192,
++243,
++60,
++128,
++0,
++254,
++0,
++240,
++39,
++10,
++0,
++240,
++60,
++128,
++8,
++255,
++163,
++143,
++45,
++226,
++192,
++243,
++60,
++128,
++0,
++254,
++0,
++240,
++44,
++10,
++0,
++240,
++60,
++0,
++0,
++254,
++0,
++240,
++40,
++10,
++0,
++240,
++60,
++128,
++8,
++255,
++163,
++142,
++2,
++162,
++192,
++243,
++60,
++128,
++90,
++0,
++169,
++3,
++14,
++96,
++4,
++31,
++169,
++3,
++30,
++96,
++1,
++31,
++73,
++64,
++52,
++64,
++45,
++64,
++2,
++64,
++10,
++64,
++64,
++198,
++1,
++7,
++8,
++232,
++63,
++0,
++0,
++0,
++6,
++232,
++253,
++255,
++255,
++255,
++0,
++246,
++0,
++0,
++0,
++4,
++215,
++64,
++3,
++96,
++2,
++248,
++0,
++35,
++0,
++0,
++64,
++56,
++0,
++0,
++4,
++248,
++0,
++36,
++0,
++0,
++64,
++56,
++8,
++0,
++0,
++240,
++64,
++0,
++132,
++3,
++30,
++106,
++137,
++24,
++0,
++255,
++64,
++0,
++0,
++20,
++196,
++243,
++0,
++0,
++128,
++240,
++0,
++0,
++132,
++3,
++128,
++144,
++158,
++0,
++131,
++98,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++150,
++0,
++131,
++102,
++0,
++158,
++81,
++0,
++2,
++248,
++0,
++35,
++0,
++0,
++64,
++56,
++0,
++0,
++4,
++248,
++0,
++36,
++0,
++0,
++64,
++56,
++8,
++0,
++0,
++240,
++64,
++0,
++132,
++3,
++30,
++106,
++137,
++24,
++0,
++255,
++64,
++0,
++0,
++20,
++196,
++243,
++0,
++0,
++128,
++240,
++0,
++0,
++132,
++3,
++128,
++144,
++122,
++0,
++131,
++98,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++114,
++0,
++131,
++102,
++0,
++248,
++64,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++30,
++106,
++139,
++24,
++0,
++255,
++64,
++0,
++0,
++20,
++196,
++243,
++0,
++0,
++128,
++248,
++0,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++144,
++128,
++0,
++188,
++64,
++67,
++232,
++0,
++2,
++0,
++0,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++117,
++0,
++195,
++232,
++0,
++2,
++0,
++0,
++12,
++128,
++7,
++192,
++130,
++248,
++0,
++0,
++112,
++192,
++224,
++16,
++195,
++31,
++132,
++248,
++1,
++0,
++112,
++0,
++224,
++16,
++203,
++31,
++3,
++99,
++131,
++71,
++68,
++232,
++32,
++0,
++0,
++0,
++0,
++99,
++2,
++99,
++23,
++102,
++7,
++106,
++127,
++156,
++168,
++255,
++0,
++248,
++64,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++30,
++106,
++139,
++24,
++0,
++255,
++64,
++0,
++0,
++20,
++196,
++243,
++0,
++0,
++128,
++248,
++0,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++144,
++72,
++0,
++188,
++64,
++67,
++232,
++0,
++2,
++0,
++0,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++61,
++0,
++195,
++232,
++0,
++2,
++0,
++0,
++12,
++128,
++7,
++192,
++130,
++248,
++0,
++0,
++112,
++192,
++224,
++16,
++195,
++31,
++132,
++248,
++1,
++0,
++112,
++0,
++224,
++16,
++203,
++31,
++25,
++102,
++9,
++106,
++2,
++30,
++41,
++3,
++26,
++87,
++162,
++64,
++64,
++198,
++1,
++23,
++127,
++158,
++75,
++255,
++239,
++3,
++0,
++254,
++128,
++143,
++94,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++143,
++95,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++142,
++208,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++142,
++209,
++0,
++0,
++240,
++12,
++0,
++128,
++144,
++47,
++0,
++8,
++255,
++227,
++23,
++0,
++244,
++192,
++51,
++0,
++0,
++8,
++255,
++35,
++52,
++0,
++180,
++192,
++51,
++0,
++0,
++111,
++3,
++239,
++3,
++0,
++254,
++128,
++143,
++14,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++143,
++15,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++142,
++16,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++142,
++17,
++0,
++0,
++240,
++12,
++0,
++128,
++144,
++13,
++0,
++8,
++255,
++227,
++3,
++0,
++244,
++192,
++51,
++0,
++0,
++8,
++255,
++35,
++4,
++0,
++180,
++192,
++51,
++0,
++0,
++111,
++3,
++32,
++246,
++192,
++11,
++1,
++16,
++32,
++246,
++2,
++140,
++47,
++240,
++32,
++247,
++35,
++141,
++63,
++178,
++64,
++254,
++35,
++141,
++2,
++68,
++192,
++243,
++128,
++11,
++32,
++255,
++35,
++240,
++58,
++226,
++192,
++243,
++188,
++10,
++0,
++254,
++0,
++141,
++4,
++4,
++0,
++240,
++128,
++10,
++88,
++246,
++35,
++141,
++3,
++68,
++240,
++246,
++35,
++141,
++48,
++66,
++0,
++247,
++227,
++143,
++52,
++242,
++32,
++247,
++227,
++142,
++52,
++178,
++90,
++0,
++161,
++3,
++6,
++64,
++23,
++64,
++96,
++8,
++70,
++98,
++97,
++8,
++70,
++98,
++98,
++8,
++70,
++98,
++99,
++8,
++70,
++98,
++100,
++8,
++70,
++98,
++101,
++8,
++70,
++98,
++255,
++159,
++244,
++249,
++23,
++102,
++7,
++106,
++112,
++30,
++33,
++3,
++};
+diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h
+new file mode 100644
+index 0000000000..56d5206827
+--- /dev/null
++++ b/libavcodec/rpi_hevc_transform8.h
+@@ -0,0 +1,3110 @@
++static const unsigned char rpi_hevc_transform8 [] = {
++21,
++106,
++0,
++144,
++47,
++1,
++37,
++106,
++0,
++144,
++66,
++1,
++53,
++106,
++0,
++144,
++192,
++4,
++69,
++106,
++0,
++144,
++192,
++4,
++85,
++106,
++0,
++144,
++240,
++5,
++169,
++3,
++62,
++64,
++79,
++64,
++3,
++232,
++32,
++0,
++0,
++0,
++12,
++248,
++0,
++136,
++0,
++0,
++192,
++248,
++0,
++0,
++64,
++232,
++0,
++2,
++0,
++0,
++12,
++248,
++0,
++168,
++0,
++0,
++192,
++248,
++0,
++0,
++0,
++96,
++3,
++232,
++32,
++0,
++0,
++0,
++7,
++232,
++0,
++2,
++0,
++0,
++8,
++232,
++0,
++4,
++0,
++0,
++12,
++248,
++0,
++128,
++0,
++0,
++192,
++8,
++4,
++0,
++4,
++232,
++64,
++0,
++0,
++0,
++5,
++232,
++0,
++8,
++0,
++0,
++128,
++69,
++113,
++66,
++12,
++248,
++0,
++128,
++0,
++0,
++192,
++8,
++4,
++0,
++128,
++69,
++113,
++70,
++128,
++144,
++40,
++0,
++4,
++255,
++48,
++192,
++128,
++3,
++32,
++8,
++16,
++0,
++76,
++254,
++48,
++192,
++9,
++4,
++32,
++8,
++0,
++0,
++4,
++254,
++0,
++144,
++128,
++2,
++0,
++8,
++2,
++0,
++128,
++144,
++23,
++0,
++4,
++255,
++48,
++192,
++128,
++3,
++32,
++8,
++20,
++0,
++76,
++254,
++48,
++192,
++4,
++4,
++32,
++8,
++0,
++0,
++140,
++248,
++44,
++0,
++0,
++0,
++32,
++48,
++4,
++0,
++128,
++69,
++113,
++66,
++242,
++140,
++211,
++192,
++34,
++31,
++41,
++3,
++70,
++192,
++80,
++7,
++164,
++255,
++36,
++204,
++96,
++2,
++0,
++248,
++62,
++0,
++3,
++255,
++55,
++208,
++120,
++3,
++224,
++3,
++190,
++11,
++16,
++139,
++246,
++91,
++0,
++103,
++90,
++0,
++70,
++192,
++80,
++7,
++164,
++255,
++36,
++204,
++224,
++2,
++0,
++248,
++62,
++0,
++3,
++255,
++55,
++208,
++120,
++3,
++224,
++3,
++190,
++11,
++16,
++139,
++246,
++91,
++0,
++103,
++90,
++0,
++225,
++64,
++242,
++64,
++3,
++232,
++128,
++0,
++0,
++0,
++7,
++232,
++0,
++2,
++0,
++0,
++57,
++239,
++224,
++247,
++255,
++255,
++72,
++192,
++95,
++207,
++88,
++122,
++88,
++124,
++137,
++64,
++26,
++64,
++4,
++232,
++64,
++0,
++0,
++0,
++149,
++96,
++161,
++64,
++152,
++64,
++128,
++144,
++35,
++0,
++72,
++232,
++0,
++4,
++0,
++0,
++65,
++232,
++32,
++0,
++0,
++0,
++128,
++144,
++27,
++0,
++4,
++232,
++0,
++8,
++0,
++0,
++69,
++96,
++145,
++64,
++168,
++64,
++128,
++144,
++19,
++0,
++72,
++232,
++0,
++4,
++0,
++0,
++65,
++232,
++32,
++0,
++0,
++0,
++128,
++144,
++11,
++0,
++74,
++232,
++0,
++8,
++0,
++0,
++242,
++140,
++221,
++192,
++57,
++239,
++32,
++8,
++0,
++0,
++41,
++3,
++239,
++3,
++12,
++248,
++0,
++128,
++0,
++0,
++192,
++248,
++4,
++0,
++12,
++248,
++0,
++132,
++64,
++0,
++192,
++248,
++4,
++0,
++0,
++96,
++255,
++159,
++154,
++255,
++0,
++232,
++0,
++4,
++0,
++0,
++255,
++159,
++165,
++255,
++4,
++255,
++48,
++204,
++16,
++3,
++224,
++251,
++62,
++0,
++4,
++255,
++51,
++204,
++128,
++3,
++224,
++251,
++16,
++0,
++76,
++254,
++51,
++204,
++128,
++3,
++224,
++251,
++20,
++0,
++128,
++64,
++6,
++232,
++64,
++0,
++0,
++0,
++140,
++248,
++47,
++0,
++0,
++0,
++224,
++99,
++0,
++0,
++32,
++247,
++240,
++207,
++16,
++3,
++32,
++247,
++176,
++207,
++17,
++19,
++32,
++247,
++112,
++207,
++18,
++35,
++32,
++247,
++48,
++207,
++19,
++51,
++32,
++247,
++240,
++206,
++20,
++67,
++32,
++247,
++176,
++206,
++21,
++83,
++32,
++247,
++112,
++206,
++22,
++99,
++32,
++247,
++48,
++206,
++23,
++115,
++32,
++247,
++240,
++205,
++24,
++131,
++32,
++247,
++176,
++205,
++25,
++147,
++32,
++247,
++112,
++205,
++26,
++163,
++32,
++247,
++48,
++205,
++27,
++179,
++32,
++247,
++240,
++204,
++28,
++195,
++32,
++247,
++176,
++204,
++29,
++211,
++32,
++247,
++112,
++204,
++30,
++227,
++32,
++247,
++48,
++204,
++31,
++243,
++4,
++255,
++51,
++204,
++128,
++3,
++224,
++251,
++16,
++0,
++76,
++254,
++51,
++204,
++128,
++3,
++224,
++251,
++20,
++0,
++0,
++237,
++32,
++0,
++0,
++0,
++140,
++248,
++47,
++0,
++0,
++0,
++224,
++99,
++0,
++0,
++111,
++3,
++4,
++254,
++0,
++128,
++0,
++4,
++0,
++248,
++0,
++0,
++2,
++232,
++32,
++0,
++0,
++0,
++140,
++248,
++32,
++0,
++0,
++0,
++224,
++35,
++0,
++0,
++64,
++232,
++0,
++2,
++0,
++0,
++193,
++232,
++0,
++1,
++0,
++0,
++1,
++106,
++116,
++30,
++90,
++0,
++169,
++3,
++73,
++64,
++52,
++64,
++45,
++64,
++2,
++64,
++10,
++64,
++64,
++198,
++1,
++7,
++8,
++232,
++63,
++0,
++0,
++0,
++6,
++232,
++253,
++255,
++255,
++255,
++0,
++246,
++0,
++0,
++0,
++4,
++215,
++64,
++3,
++96,
++2,
++248,
++0,
++35,
++0,
++0,
++64,
++56,
++0,
++0,
++4,
++248,
++0,
++36,
++0,
++0,
++64,
++56,
++8,
++0,
++0,
++240,
++64,
++0,
++132,
++3,
++128,
++240,
++0,
++0,
++132,
++3,
++128,
++144,
++137,
++0,
++131,
++98,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++129,
++0,
++131,
++102,
++0,
++158,
++67,
++0,
++2,
++248,
++0,
++35,
++0,
++0,
++64,
++56,
++0,
++0,
++4,
++248,
++0,
++36,
++0,
++0,
++64,
++56,
++8,
++0,
++0,
++240,
++64,
++0,
++132,
++3,
++128,
++240,
++0,
++0,
++132,
++3,
++128,
++144,
++108,
++0,
++131,
++98,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++100,
++0,
++131,
++102,
++0,
++248,
++64,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++248,
++0,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++144,
++161,
++0,
++188,
++64,
++67,
++232,
++0,
++2,
++0,
++0,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++150,
++0,
++195,
++232,
++0,
++2,
++0,
++0,
++12,
++128,
++7,
++192,
++130,
++248,
++0,
++0,
++112,
++192,
++224,
++16,
++195,
++31,
++132,
++248,
++1,
++0,
++112,
++0,
++224,
++16,
++203,
++31,
++3,
++99,
++131,
++71,
++68,
++232,
++32,
++0,
++0,
++0,
++0,
++99,
++2,
++99,
++23,
++102,
++7,
++106,
++127,
++156,
++182,
++255,
++0,
++248,
++64,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++248,
++0,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++144,
++112,
++0,
++188,
++64,
++67,
++232,
++0,
++2,
++0,
++0,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++101,
++0,
++195,
++232,
++0,
++2,
++0,
++0,
++12,
++128,
++7,
++192,
++130,
++248,
++0,
++0,
++112,
++192,
++224,
++16,
++195,
++31,
++132,
++248,
++1,
++0,
++112,
++0,
++224,
++16,
++203,
++31,
++25,
++102,
++9,
++106,
++2,
++30,
++41,
++3,
++26,
++87,
++162,
++64,
++64,
++198,
++1,
++23,
++127,
++158,
++103,
++255,
++239,
++3,
++0,
++254,
++0,
++143,
++92,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++64,
++143,
++93,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++143,
++94,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++143,
++95,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++142,
++208,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++142,
++209,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++64,
++142,
++210,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++0,
++142,
++211,
++0,
++0,
++240,
++12,
++0,
++128,
++144,
++107,
++0,
++8,
++255,
++99,
++23,
++0,
++212,
++192,
++51,
++0,
++0,
++8,
++255,
++163,
++23,
++0,
++228,
++192,
++51,
++0,
++0,
++8,
++255,
++227,
++23,
++0,
++244,
++192,
++51,
++0,
++0,
++8,
++255,
++35,
++52,
++0,
++180,
++192,
++51,
++0,
++0,
++8,
++255,
++99,
++52,
++0,
++164,
++192,
++51,
++0,
++0,
++8,
++255,
++163,
++52,
++0,
++148,
++192,
++51,
++0,
++0,
++111,
++3,
++239,
++3,
++0,
++254,
++0,
++143,
++12,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++64,
++143,
++13,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++143,
++14,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++143,
++15,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++142,
++16,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++142,
++17,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++64,
++142,
++18,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++0,
++142,
++19,
++0,
++0,
++240,
++12,
++0,
++128,
++144,
++33,
++0,
++8,
++255,
++99,
++3,
++0,
++212,
++192,
++51,
++0,
++0,
++8,
++255,
++163,
++3,
++0,
++228,
++192,
++51,
++0,
++0,
++8,
++255,
++227,
++3,
++0,
++244,
++192,
++51,
++0,
++0,
++8,
++255,
++35,
++4,
++0,
++180,
++192,
++51,
++0,
++0,
++8,
++255,
++99,
++4,
++0,
++164,
++192,
++51,
++0,
++0,
++8,
++255,
++163,
++4,
++0,
++148,
++192,
++51,
++0,
++0,
++111,
++3,
++32,
++246,
++192,
++11,
++1,
++16,
++32,
++246,
++2,
++137,
++47,
++240,
++40,
++246,
++2,
++140,
++47,
++240,
++128,
++245,
++99,
++140,
++5,
++4,
++0,
++247,
++99,
++140,
++1,
++20,
++88,
++246,
++99,
++140,
++1,
++20,
++0,
++247,
++35,
++136,
++62,
++226,
++32,
++247,
++35,
++136,
++32,
++210,
++0,
++247,
++34,
++136,
++63,
++2,
++208,
++246,
++34,
++136,
++0,
++4,
++0,
++247,
++99,
++136,
++58,
++162,
++32,
++247,
++99,
++136,
++33,
++146,
++0,
++247,
++98,
++136,
++59,
++18,
++208,
++246,
++98,
++136,
++0,
++20,
++0,
++247,
++162,
++136,
++33,
++2,
++88,
++246,
++98,
++137,
++2,
++68,
++88,
++246,
++162,
++137,
++3,
++68,
++208,
++254,
++227,
++136,
++60,
++242,
++192,
++243,
++188,
++11,
++208,
++254,
++227,
++136,
++56,
++178,
++192,
++243,
++188,
++10,
++32,
++255,
++226,
++136,
++38,
++58,
++192,
++243,
++60,
++0,
++208,
++254,
++227,
++136,
++59,
++242,
++192,
++243,
++60,
++128,
++32,
++255,
++226,
++136,
++49,
++58,
++192,
++243,
++60,
++128,
++0,
++255,
++226,
++136,
++34,
++34,
++192,
++243,
++60,
++128,
++32,
++255,
++226,
++136,
++37,
++58,
++192,
++243,
++60,
++128,
++0,
++254,
++192,
++136,
++1,
++4,
++0,
++240,
++0,
++160,
++0,
++255,
++194,
++8,
++0,
++52,
++195,
++243,
++0,
++128,
++0,
++255,
++202,
++40,
++0,
++52,
++195,
++243,
++0,
++128,
++0,
++254,
++0,
++240,
++35,
++10,
++0,
++240,
++60,
++0,
++0,
++254,
++192,
++136,
++1,
++4,
++0,
++240,
++0,
++160,
++0,
++255,
++226,
++140,
++34,
++34,
++195,
++243,
++60,
++0,
++32,
++255,
++227,
++140,
++36,
++58,
++192,
++243,
++60,
++0,
++0,
++254,
++192,
++136,
++0,
++4,
++0,
++240,
++0,
++160,
++16,
++246,
++226,
++136,
++35,
++50,
++16,
++246,
++226,
++136,
++35,
++50,
++32,
++246,
++226,
++136,
++35,
++50,
++32,
++254,
++226,
++136,
++35,
++58,
++192,
++243,
++60,
++0,
++11,
++96,
++0,
++254,
++0,
++240,
++1,
++4,
++0,
++240,
++64,
++115,
++5,
++106,
++0,
++144,
++173,
++1,
++27,
++96,
++0,
++254,
++0,
++240,
++1,
++4,
++0,
++240,
++64,
++147,
++5,
++106,
++0,
++144,
++227,
++0,
++64,
++246,
++163,
++140,
++1,
++4,
++0,
++246,
++192,
++175,
++63,
++2,
++0,
++246,
++192,
++174,
++59,
++2,
++0,
++246,
++128,
++175,
++62,
++2,
++0,
++246,
++128,
++174,
++58,
++2,
++0,
++246,
++64,
++175,
++61,
++2,
++0,
++246,
++64,
++174,
++57,
++2,
++0,
++255,
++43,
++240,
++4,
++212,
++192,
++243,
++128,
++11,
++64,
++254,
++43,
++240,
++1,
++228,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++240,
++1,
++244,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++240,
++1,
++180,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++141,
++0,
++164,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++3,
++68,
++32,
++247,
++35,
++141,
++191,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++235,
++143,
++52,
++242,
++192,
++243,
++60,
++128,
++0,
++255,
++43,
++240,
++2,
++212,
++192,
++243,
++128,
++11,
++0,
++255,
++43,
++240,
++191,
++226,
++192,
++243,
++188,
++10,
++64,
++254,
++43,
++141,
++0,
++180,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++2,
++68,
++32,
++247,
++35,
++141,
++190,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++171,
++143,
++52,
++226,
++192,
++243,
++60,
++128,
++0,
++255,
++43,
++240,
++4,
++180,
++192,
++243,
++128,
++11,
++0,
++255,
++43,
++240,
++191,
++226,
++192,
++243,
++188,
++10,
++128,
++253,
++43,
++240,
++3,
++212,
++192,
++243,
++128,
++10,
++64,
++254,
++35,
++141,
++1,
++196,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++3,
++68,
++32,
++247,
++35,
++141,
++189,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++107,
++143,
++52,
++210,
++192,
++243,
++60,
++128,
++0,
++255,
++43,
++240,
++4,
++148,
++192,
++243,
++128,
++11,
++64,
++254,
++43,
++240,
++1,
++164,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++240,
++1,
++180,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++240,
++1,
++244,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++141,
++0,
++228,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++3,
++68,
++32,
++247,
++35,
++141,
++187,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++235,
++142,
++52,
++178,
++192,
++243,
++60,
++128,
++0,
++255,
++43,
++240,
++2,
++148,
++192,
++243,
++128,
++11,
++0,
++255,
++43,
++240,
++187,
++162,
++192,
++243,
++188,
++10,
++64,
++254,
++43,
++141,
++0,
++244,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++2,
++68,
++32,
++247,
++35,
++141,
++186,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++171,
++142,
++52,
++162,
++192,
++243,
++60,
++128,
++0,
++255,
++43,
++240,
++4,
++244,
++192,
++243,
++128,
++11,
++0,
++255,
++43,
++240,
++187,
++162,
++192,
++243,
++188,
++10,
++128,
++253,
++43,
++240,
++3,
++148,
++192,
++243,
++128,
++10,
++64,
++254,
++35,
++141,
++1,
++132,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++3,
++68,
++32,
++247,
++35,
++141,
++185,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++107,
++142,
++52,
++146,
++192,
++243,
++60,
++128,
++64,
++255,
++98,
++141,
++0,
++52,
++192,
++243,
++0,
++0,
++0,
++254,
++0,
++240,
++53,
++10,
++0,
++240,
++60,
++0,
++0,
++254,
++0,
++240,
++1,
++4,
++0,
++240,
++64,
++147,
++5,
++106,
++0,
++144,
++177,
++0,
++88,
++246,
++163,
++140,
++1,
++4,
++128,
++245,
++99,
++141,
++10,
++4,
++88,
++246,
++162,
++138,
++1,
++68,
++0,
++247,
++162,
++138,
++36,
++162,
++88,
++254,
++162,
++138,
++3,
++164,
++192,
++243,
++128,
++11,
++0,
++255,
++226,
++137,
++32,
++2,
++195,
++243,
++60,
++0,
++32,
++247,
++226,
++137,
++42,
++114,
++0,
++255,
++34,
++138,
++33,
++18,
++195,
++243,
++60,
++0,
++32,
++247,
++34,
++138,
++42,
++130,
++16,
++246,
++98,
++138,
++40,
++114,
++16,
++246,
++98,
++138,
++41,
++146,
++32,
++246,
++98,
++138,
++41,
++146,
++32,
++246,
++226,
++137,
++41,
++146,
++40,
++246,
++34,
++138,
++41,
++146,
++32,
++247,
++163,
++141,
++63,
++178,
++32,
++247,
++227,
++141,
++62,
++162,
++0,
++254,
++0,
++240,
++8,
++4,
++0,
++240,
++128,
++11,
++128,
++253,
++35,
++240,
++9,
++100,
++192,
++243,
++128,
++10,
++128,
++253,
++163,
++141,
++128,
++115,
++192,
++243,
++152,
++10,
++88,
++246,
++163,
++141,
++4,
++100,
++208,
++246,
++35,
++139,
++0,
++100,
++32,
++255,
++34,
++139,
++53,
++202,
++192,
++243,
++60,
++128,
++0,
++254,
++0,
++139,
++0,
++4,
++0,
++240,
++0,
++160,
++240,
++246,
++163,
++141,
++48,
++98,
++0,
++247,
++99,
++139,
++63,
++210,
++0,
++247,
++98,
++139,
++1,
++212,
++88,
++254,
++98,
++139,
++1,
++212,
++192,
++243,
++128,
++11,
++32,
++255,
++99,
++139,
++62,
++98,
++192,
++243,
++188,
++10,
++88,
++246,
++98,
++139,
++1,
++212,
++240,
++246,
++98,
++139,
++50,
++210,
++0,
++247,
++163,
++128,
++59,
++146,
++0,
++247,
++160,
++128,
++1,
++36,
++88,
++254,
++160,
++128,
++1,
++36,
++192,
++243,
++128,
++11,
++0,
++247,
++163,
++128,
++58,
++98,
++64,
++255,
++35,
++240,
++0,
++100,
++192,
++243,
++128,
++10,
++64,
++255,
++163,
++128,
++0,
++164,
++192,
++243,
++128,
++10,
++88,
++246,
++160,
++128,
++1,
++36,
++240,
++246,
++160,
++128,
++50,
++34,
++8,
++255,
++227,
++143,
++54,
++242,
++192,
++243,
++60,
++128,
++40,
++255,
++227,
++142,
++54,
++178,
++192,
++243,
++60,
++128,
++0,
++254,
++0,
++240,
++39,
++10,
++0,
++240,
++60,
++128,
++8,
++255,
++163,
++143,
++45,
++226,
++192,
++243,
++60,
++128,
++0,
++254,
++0,
++240,
++44,
++10,
++0,
++240,
++60,
++0,
++0,
++254,
++0,
++240,
++40,
++10,
++0,
++240,
++60,
++128,
++8,
++255,
++163,
++142,
++2,
++162,
++192,
++243,
++60,
++128,
++90,
++0,
++169,
++3,
++14,
++96,
++4,
++31,
++169,
++3,
++30,
++96,
++1,
++31,
++73,
++64,
++52,
++64,
++45,
++64,
++2,
++64,
++10,
++64,
++64,
++198,
++1,
++7,
++8,
++232,
++63,
++0,
++0,
++0,
++6,
++232,
++253,
++255,
++255,
++255,
++0,
++246,
++0,
++0,
++0,
++4,
++215,
++64,
++3,
++96,
++2,
++248,
++0,
++35,
++0,
++0,
++64,
++56,
++0,
++0,
++4,
++248,
++0,
++36,
++0,
++0,
++64,
++56,
++8,
++0,
++0,
++240,
++64,
++0,
++132,
++3,
++30,
++106,
++137,
++24,
++0,
++255,
++64,
++0,
++0,
++20,
++196,
++243,
++0,
++0,
++128,
++240,
++0,
++0,
++132,
++3,
++128,
++144,
++158,
++0,
++131,
++98,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++150,
++0,
++131,
++102,
++0,
++158,
++81,
++0,
++2,
++248,
++0,
++35,
++0,
++0,
++64,
++56,
++0,
++0,
++4,
++248,
++0,
++36,
++0,
++0,
++64,
++56,
++8,
++0,
++0,
++240,
++64,
++0,
++132,
++3,
++30,
++106,
++137,
++24,
++0,
++255,
++64,
++0,
++0,
++20,
++196,
++243,
++0,
++0,
++128,
++240,
++0,
++0,
++132,
++3,
++128,
++144,
++122,
++0,
++131,
++98,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++114,
++0,
++131,
++102,
++0,
++248,
++64,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++30,
++106,
++139,
++24,
++0,
++255,
++64,
++0,
++0,
++20,
++196,
++243,
++0,
++0,
++128,
++248,
++0,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++144,
++128,
++0,
++188,
++64,
++67,
++232,
++0,
++2,
++0,
++0,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++117,
++0,
++195,
++232,
++0,
++2,
++0,
++0,
++12,
++128,
++7,
++192,
++130,
++248,
++0,
++0,
++112,
++192,
++224,
++16,
++195,
++31,
++132,
++248,
++1,
++0,
++112,
++0,
++224,
++16,
++203,
++31,
++3,
++99,
++131,
++71,
++68,
++232,
++32,
++0,
++0,
++0,
++0,
++99,
++2,
++99,
++23,
++102,
++7,
++106,
++127,
++156,
++168,
++255,
++0,
++248,
++64,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++30,
++106,
++139,
++24,
++0,
++255,
++64,
++0,
++0,
++20,
++196,
++243,
++0,
++0,
++128,
++248,
++0,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++144,
++72,
++0,
++188,
++64,
++67,
++232,
++0,
++2,
++0,
++0,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++61,
++0,
++195,
++232,
++0,
++2,
++0,
++0,
++12,
++128,
++7,
++192,
++130,
++248,
++0,
++0,
++112,
++192,
++224,
++16,
++195,
++31,
++132,
++248,
++1,
++0,
++112,
++0,
++224,
++16,
++203,
++31,
++25,
++102,
++9,
++106,
++2,
++30,
++41,
++3,
++26,
++87,
++162,
++64,
++64,
++198,
++1,
++23,
++127,
++158,
++75,
++255,
++239,
++3,
++0,
++254,
++128,
++143,
++94,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++143,
++95,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++142,
++208,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++142,
++209,
++0,
++0,
++240,
++12,
++0,
++128,
++144,
++47,
++0,
++8,
++255,
++227,
++23,
++0,
++244,
++192,
++51,
++0,
++0,
++8,
++255,
++35,
++52,
++0,
++180,
++192,
++51,
++0,
++0,
++111,
++3,
++239,
++3,
++0,
++254,
++128,
++143,
++14,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++143,
++15,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++142,
++16,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++142,
++17,
++0,
++0,
++240,
++12,
++0,
++128,
++144,
++13,
++0,
++8,
++255,
++227,
++3,
++0,
++244,
++192,
++51,
++0,
++0,
++8,
++255,
++35,
++4,
++0,
++180,
++192,
++51,
++0,
++0,
++111,
++3,
++32,
++246,
++192,
++11,
++1,
++16,
++32,
++246,
++2,
++140,
++47,
++240,
++32,
++247,
++35,
++141,
++63,
++178,
++64,
++254,
++35,
++141,
++2,
++68,
++192,
++243,
++128,
++11,
++32,
++255,
++35,
++240,
++58,
++226,
++192,
++243,
++188,
++10,
++0,
++254,
++0,
++141,
++4,
++4,
++0,
++240,
++128,
++10,
++88,
++246,
++35,
++141,
++3,
++68,
++240,
++246,
++35,
++141,
++48,
++66,
++0,
++247,
++227,
++143,
++52,
++242,
++32,
++247,
++227,
++142,
++52,
++178,
++90,
++0,
++161,
++3,
++6,
++64,
++23,
++64,
++96,
++8,
++70,
++98,
++97,
++8,
++70,
++98,
++98,
++8,
++70,
++98,
++99,
++8,
++70,
++98,
++100,
++8,
++70,
++98,
++101,
++8,
++70,
++98,
++255,
++159,
++244,
++249,
++23,
++102,
++7,
++106,
++112,
++30,
++33,
++3,
++};
+diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c
+new file mode 100644
+index 0000000000..00bd911a86
+--- /dev/null
++++ b/libavcodec/rpi_hevcdec.c
+@@ -0,0 +1,5630 @@
++/*
++ * HEVC video Decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2012 - 2013 Mickael Raulet
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2012 - 2013 Wassim Hamidouche
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/attributes.h"
++#include "libavutil/common.h"
++#include "libavutil/display.h"
++#include "libavutil/internal.h"
++#include "libavutil/mastering_display_metadata.h"
++#include "libavutil/md5.h"
++#include "libavutil/opt.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/stereo3d.h"
++
++#include "bswapdsp.h"
++#include "bytestream.h"
++#include "cabac_functions.h"
++#include "golomb.h"
++#include "hevc.h"
++#include "rpi_hevc_data.h"
++#include "rpi_hevc_parse.h"
++#include "rpi_hevcdec.h"
++#include "profiles.h"
++
++#include "rpi_qpu.h"
++#include "rpi_hevc_shader.h"
++#include "rpi_hevc_shader_cmd.h"
++#include "rpi_hevc_shader_template.h"
++#include "rpi_zc.h"
++#include "libavutil/rpi_sand_fns.h"
++
++#include "pthread.h"
++#include "libavutil/atomic.h"
++
++#define DEBUG_DECODE_N 0 // 0 = do all, n = decode only n frames from the IDR onwards
++
++#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff))
++
++#ifndef av_mod_uintp2
++static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
++{
++ return a & ((1 << p) - 1);
++}
++# define av_mod_uintp2 av_mod_uintp2_c
++#endif
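++// (av_mod_uintp2(a, p) == a mod 2^p, e.g. av_mod_uintp2(0x1234, 8) == 0x34)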
++
++const uint8_t ff_hevc_rpi_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
++static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first);
++
++#define MC_DUMMY_X (-32)
++#define MC_DUMMY_Y (-32)
++
++// UV & Y both have min 4x4 pred (no 2x2 chroma)
++// Allow for even spread +1 for setup, +1 for rounding
++// As we have load sharing this can (in theory) be exceeded so we have to
++// check after each CTU, but it is a good base size
++
++// Worst case (all 4x4) commands per CTU
++#define QPU_Y_CMD_PER_CTU_MAX (16 * 16)
++#define QPU_C_CMD_PER_CTU_MAX (8 * 8)
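++// (i.e. a 64x64 CTU split entirely into 4x4 blocks: (64/4)^2 = 256 for luma,
++// and (32/4)^2 = 64 for the half-sized 4:2:0 chroma plane)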
++
++#define QPU_C_COMMANDS (((HEVC_RPI_MAX_WIDTH * 64) / (4 * 4)) / 4 + 2 * QPU_N_MAX)
++#define QPU_Y_COMMANDS (((HEVC_RPI_MAX_WIDTH * 64) / (4 * 4)) + 2 * QPU_N_MAX)
++
++// The QPU code for UV blocks only works up to a block width of 8
++#define RPI_CHROMA_BLOCK_WIDTH 8
++
++#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
++
++
++// Actual filter goes -ve, +ve, +ve, -ve using these values
++static const uint32_t rpi_filter_coefs[8] = {
++ ENCODE_COEFFS( 0, 64, 0, 0),
++ ENCODE_COEFFS( 2, 58, 10, 2),
++ ENCODE_COEFFS( 4, 54, 16, 2),
++ ENCODE_COEFFS( 6, 46, 28, 4),
++ ENCODE_COEFFS( 4, 36, 36, 4),
++ ENCODE_COEFFS( 4, 28, 46, 6),
++ ENCODE_COEFFS( 2, 16, 54, 4),
++ ENCODE_COEFFS( 2, 10, 58, 2)
++};
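++
++// Editor's sketch (not used by the decoder): one plausible way the packed
++// words above could be unpacked and applied. The -ve,+ve,+ve,-ve sign pattern
++// makes every tap set sum to 64, so a >>6 renormalises; the +32 rounding term
++// and the helper itself are illustrative assumptions only.
++#if 0
++static inline int rpi_filter_apply(const uint32_t c, const uint8_t p[4])
++{
++    const int c0 = (c >>  0) & 0xff, c1 = (c >>  8) & 0xff;
++    const int c2 = (c >> 16) & 0xff, c3 = (c >> 24) & 0xff;
++    // e.g. ENCODE_COEFFS(2, 58, 10, 2): -2 + 58 + 10 - 2 == 64
++    return (-c0 * p[0] + c1 * p[1] + c2 * p[2] - c3 * p[3] + 32) >> 6;
++}
++#endif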
++
++// Function arrays by QPU
++
++static const int * const inter_pred_setup_c_qpu[12] = {
++ mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn,
++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn,
++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn
++};
++
++static const int * const inter_pred_setup_c10_qpu[12] = {
++ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn,
++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn,
++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn
++};
++
++static const int * const inter_pred_setup_y_qpu[12] = {
++ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn,
++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn,
++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn
++};
++
++static const int * const inter_pred_setup_y10_qpu[12] = {
++ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn,
++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn,
++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn
++};
++
++static const int * const inter_pred_sync_qpu[12] = {
++ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3,
++ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7,
++ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11
++};
++
++static const int * const inter_pred_sync10_qpu[12] = {
++ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3,
++ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7,
++ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11
++};
++
++static const int * const inter_pred_exit_c_qpu[12] = {
++ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn,
++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn,
++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn
++};
++
++static const int * const inter_pred_exit_c10_qpu[12] = {
++ mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn,
++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn,
++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn
++};
++
++static const int * const inter_pred_exit_y_qpu[12] = {
++ mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn,
++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn,
++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn
++};
++
++static const int * const inter_pred_exit_y10_qpu[12] = {
++ mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn,
++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn,
++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn
++};
++
++typedef struct ipe_chan_info_s
++{
++ const uint8_t bit_depth;
++ const uint8_t n;
++ const int * const * setup_fns;
++ const int * const * sync_fns;
++ const int * const * exit_fns;
++} ipe_chan_info_t;
++
++typedef struct ipe_init_info_s
++{
++ ipe_chan_info_t luma;
++ ipe_chan_info_t chroma;
++} ipe_init_info_t;
++
++static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16
++ { // 8
++ .luma = {8, QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu},
++ .chroma = {8, QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu}
++ },
++ { // 9
++ .luma = {0},
++ .chroma = {0}
++ },
++ { // 10
++ .luma = {10, QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu},
++ .chroma = {10, QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu}
++ }
++
++};
++
++static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici)
++{
++ const unsigned int n = ici->n;
++ const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3; // Round down to word
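++    // e.g. 12 QPUs sharing a 96kB gptr gives q1_size = 8192 bytes per queue;
++    // max_fill then leaves min_gap bytes of headroom at the end of each queue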
++
++ ipe->n = n;
++ ipe->max_fill = q1_size - ipe->min_gap;
++ for(unsigned int i = 0; i < n; i++) {
++ HEVCRpiInterPredQ * const q = ipe->q + i;
++ q->qpu_mc_curr = q->qpu_mc_base =
++ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size);
++ q->code_setup = qpu_fn(ici->setup_fns[i]);
++ q->code_sync = qpu_fn(ici->sync_fns[i]);
++ q->code_exit = qpu_fn(ici->exit_fns[i]);
++ }
++}
++
++static void rpi_hevc_qpu_set_fns(HEVCRpiContext * const s, const unsigned int bit_depth)
++{
++ av_assert0(bit_depth >= 8 && bit_depth <= 16);
++
++ rpi_hevc_qpu_init_fn(&s->qpu, bit_depth);
++}
++
++// Unsigned Trivial MOD
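++// (only valid for x < 2*n, which holds for the single-step increments used here)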
++static inline unsigned int utmod(const unsigned int x, const unsigned int n)
++{
++ return x >= n ? x - n : x;
++}
++
++// returns pq->job_n++
++static inline unsigned int pass_queue_inc_job_n(HEVCRpiPassQueue * const pq)
++{
++ unsigned int const x2 = pq->job_n;
++ pq->job_n = utmod(x2 + 1, RPI_MAX_JOBS);
++ return x2;
++}
++
++static void pass_queue_init(HEVCRpiPassQueue * const pq, HEVCRpiContext * const s, HEVCRpiWorkerFn * const worker, sem_t * const psem_out, const int n)
++{
++ pq->terminate = 0;
++ pq->job_n = 0;
++ pq->context = s;
++ pq->worker = worker;
++ pq->psem_out = psem_out;
++ pq->pass_n = n;
++ pq->started = 0;
++ sem_init(&pq->sem_in, 0, 0);
++}
++
++static void pass_queue_kill(HEVCRpiPassQueue * const pq)
++{
++ sem_destroy(&pq->sem_in);
++}
++
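++// Wait on a sem, restarting if interrupted by a signal (sem_wait can fail
++// with EINTR); any other failure is treated as fatal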
++static inline void rpi_sem_wait(sem_t * const sem)
++{
++ while (sem_wait(sem) != 0) {
++ av_assert0(errno == EINTR);
++ }
++}
++
++static void pass_queue_submit_job(HEVCRpiPassQueue * const pq)
++{
++ sem_post(&pq->sem_in);
++}
++
++static inline void pass_queue_do_all(HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++ // Do the various passes - common with the worker code
++ for (unsigned int i = 0; i != RPI_PASSES; ++i) {
++ s->passq[i].worker(s, jb);
++ }
++}
++
++
++#if 0
++static void dump_jbc(const HEVCRpiJobCtl *const jbc, const char * const func)
++{
++ int x;
++ sem_getvalue((sem_t *)&jbc->sem_out, &x);
++ printf("%s: jbc: in=%d, out=%d, sum=%d\n", func, jbc->offload_in, jbc->offload_out, x);
++}
++#endif
++
++
++static HEVCRpiJob * job_alloc(HEVCRpiJobCtl * const jbc, HEVCRpiLocalContext * const lc)
++{
++ HEVCRpiJob * jb;
++ HEVCRpiJobGlobal * const jbg = jbc->jbg;
++
++ pthread_mutex_lock(&jbg->lock);
++ // Check local 1st
++ if ((jb = jbc->jb1) != NULL)
++ {
++ // Only 1 - very easy :-)
++ jbc->jb1 = NULL;
++ }
++ else
++ {
++ // Now look for global free chain
++ if ((jb = jbg->free1) != NULL)
++ {
++ // Found one - unlink it
++ jbg->free1 = jb->next;
++ jb->next = NULL;
++ }
++ else
++ {
++ // Out of places to look - wait for one to become free - add to Qs
++
++ // Global
++ // If "good" lc then add after the last "good" el in the chain
++ // otherwise add to the tail
++ if (jbg->wait_tail == NULL || jbg->wait_tail->last_progress_good || !lc->last_progress_good)
++ {
++ // Add to end as we had to wait last time or wait Q empty
++ if ((lc->jw_prev = jbg->wait_tail) == NULL)
++ jbg->wait_head = lc;
++ else
++ lc->jw_prev->jw_next = lc;
++ lc->jw_next = NULL;
++ jbg->wait_tail = lc;
++ }
++ else
++ {
++ // This is a "good" lc that we need to poke into the middle
++ // of the Q
++ // We know that the Q isn't empty and there is at least one
++            // !last_progress_good el in it from the previous test
++
++ HEVCRpiLocalContext * const p = jbg->wait_good; // Insert after
++
++ if (p == NULL)
++ {
++ // No current good els - add to head
++ lc->jw_next = jbg->wait_head;
++ jbg->wait_head = lc;
++ }
++ else
++ {
++ lc->jw_next = p->jw_next;
++ p->jw_next = lc;
++ }
++
++ lc->jw_next->jw_prev = lc;
++ lc->jw_prev = p;
++ }
++
++ // If "good" then we are now the last good waiting el
++ if (lc->last_progress_good)
++ jbg->wait_good = lc;
++
++ // Local
++ if ((lc->ljw_prev = jbc->lcw_tail) == NULL)
++ jbc->lcw_head = lc;
++ else
++ lc->ljw_prev->ljw_next = lc;
++ lc->ljw_next = NULL;
++ jbc->lcw_tail = lc;
++ }
++ }
++
++ pthread_mutex_unlock(&jbg->lock);
++
++ if (jb == NULL) // Need to wait
++ {
++ rpi_sem_wait(&lc->jw_sem);
++ jb = lc->jw_job; // Set by free code
++ }
++
++ return jb;
++}
++
++
++static void job_free(HEVCRpiJobCtl * const jbc0, HEVCRpiJob * const jb)
++{
++ HEVCRpiJobGlobal * const jbg = jbc0->jbg; // This jbc only used to find jbg so we can get the lock
++ HEVCRpiJobCtl * jbc = jb->jbc_local;
++ HEVCRpiLocalContext * lc = NULL;
++
++ pthread_mutex_lock(&jbg->lock);
++
++ if (jbc != NULL)
++ {
++ av_assert1(jbc->jb1 == NULL);
++
++ // Release to Local if nothing waiting there
++ if ((lc = jbc->lcw_head) == NULL)
++ jbc->jb1 = jb;
++ }
++ else
++ {
++ // Release to global if nothing waiting there
++ if ((lc = jbg->wait_head) == NULL)
++ {
++ jb->next = jbg->free1;
++ jbg->free1 = jb;
++ }
++ else
++ {
++            // ? seems somehow mildly ugly...
++ jbc = lc->context->jbc;
++ }
++ }
++
++ if (lc != NULL)
++ {
++ // Something was waiting
++
++ // Unlink
++ // Global
++ if (lc->jw_next == NULL)
++ jbg->wait_tail = lc->jw_prev;
++ else
++ lc->jw_next->jw_prev = lc->jw_prev;
++
++ if (lc->jw_prev == NULL)
++ jbg->wait_head = lc->jw_next;
++ else
++ lc->jw_prev->jw_next = lc->jw_next;
++
++ // Local
++ if (lc->ljw_next == NULL)
++ jbc->lcw_tail = lc->ljw_prev;
++ else
++ lc->ljw_next->ljw_prev = lc->ljw_prev;
++
++ if (lc->ljw_prev == NULL)
++ jbc->lcw_head = lc->ljw_next;
++ else
++ lc->ljw_prev->ljw_next = lc->ljw_next;
++
++ // Update good if required
++ if (jbg->wait_good == lc)
++ jbg->wait_good = lc->jw_prev;
++
++ // Prod
++ lc->jw_job = jb;
++ sem_post(&lc->jw_sem);
++ }
++
++ pthread_mutex_unlock(&jbg->lock);
++}
++
++static void job_lc_kill(HEVCRpiLocalContext * const lc)
++{
++ sem_destroy(&lc->jw_sem);
++}
++
++static void job_lc_init(HEVCRpiLocalContext * const lc)
++{
++ lc->jw_next = NULL;
++ lc->jw_prev = NULL;
++ lc->ljw_next = NULL;
++ lc->ljw_prev = NULL;
++ lc->jw_job = NULL;
++ sem_init(&lc->jw_sem, 0, 0);
++}
++
++// Returns:
++// 0 if we have waited for MV or expect to wait for recon
++// 1 if we haven't waited for MV & do not need to wait for recon
++static int progress_good(const HEVCRpiContext *const s, const HEVCRpiJob * const jb)
++{
++ if (jb->waited) // reset by rpi_begin
++ return 0;
++ for (unsigned int i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i)
++ {
++ if (jb->progress_req[i] >= 0 && s->DPB[i].tf.progress != NULL &&
++ ((volatile int *)(s->DPB[i].tf.progress->data))[0] < jb->progress_req[i])
++ return 0;
++ }
++ return 1;
++}
++
++// Submit job if it is full (indicated by having ctu_ts_last set >= 0)
++static inline void worker_submit_job(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc)
++{
++ HEVCRpiJobCtl *const jbc = s->jbc;
++ HEVCRpiJob * const jb = lc->jb0;
++
++ av_assert1(jb != NULL);
++
++ if (jb->ctu_ts_last < 0) {
++ return;
++ }
++
++ lc->last_progress_good = progress_good(s, jb);
++ jb->waited = !lc->last_progress_good;
++ lc->jb0 = NULL;
++
++ if (s->offload_recon)
++ {
++ pthread_mutex_lock(&jbc->in_lock);
++ jbc->offloadq[jbc->offload_in] = jb;
++ jbc->offload_in = utmod(jbc->offload_in + 1, RPI_MAX_JOBS);
++ pthread_mutex_unlock(&jbc->in_lock);
++
++ pass_queue_submit_job(s->passq + 0); // Consumes job eventually
++ }
++ else
++ {
++ pass_queue_do_all(s, jb); // Consumes job before return
++ }
++}
++
++
++// Call worker_pass0_ready to wait until the s->pass0_job slot becomes
++// available to receive the next job.
++//
++// Now safe against multiple callers - needed for tiles
++// "normal" and WPP will only call here one at a time
++static inline void worker_pass0_ready(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++ HEVCRpiJobCtl * const jbc = s->jbc;
++
++ // It is legit for us to already have a job allocated - do nothing in this case
++ if (lc->jb0 != NULL)
++ return;
++
++ if (s->offload_recon)
++ rpi_sem_wait(&jbc->sem_out); // This sem will stop this frame grabbing too much
++
++ lc->jb0 = job_alloc(jbc, lc);
++
++ rpi_begin(s, lc->jb0, lc->ts);
++}
++
++// Free up a job without submission
++static void worker_free(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++ HEVCRpiJobCtl * const jbc = s->jbc;
++ HEVCRpiJob * const jb = lc->jb0;
++
++ if (jb == NULL) {
++ return;
++ }
++
++ lc->jb0 = NULL;
++
++ job_free(jbc, jb);
++
++ // If offload then poke sem_out too
++ if (s->offload_recon) {
++ sem_post(&jbc->sem_out);
++ }
++}
++
++
++// Call this to wait for all jobs to have completed at the end of a frame
++// Slightly icky as there is no clean way to wait for a sem to count up
++// Not reentrant - call on main thread only
++static void worker_wait(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++ HEVCRpiJobCtl * const jbc = s->jbc;
++ int i = 0;
++
++ // We shouldn't reach here with an unsubmitted job
++ av_assert1(lc->jb0 == NULL);
++
++ // If no offload then there can't be anything to wait for
++ if (!s->offload_recon) {
++ return;
++ }
++
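++    // sem_out holds one count per free job slot. If it is below RPI_MAX_JOBS
++    // then jobs are still in flight: claim every slot (blocking until the
++    // outstanding jobs complete and repost), then hand them all back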
++ if (sem_getvalue(&jbc->sem_out, &i) == 0 && i < RPI_MAX_JOBS)
++ {
++ for (i = 0; i != RPI_MAX_JOBS; ++i) {
++ rpi_sem_wait(&jbc->sem_out);
++ }
++ for (i = 0; i != RPI_MAX_JOBS; ++i) {
++ sem_post(&jbc->sem_out);
++ }
++ }
++}
++
++static void * pass_worker(void *arg)
++{
++ HEVCRpiPassQueue *const pq = (HEVCRpiPassQueue *)arg;
++ HEVCRpiContext *const s = pq->context;
++
++ for (;;)
++ {
++ rpi_sem_wait(&pq->sem_in);
++
++ if (pq->terminate)
++ break;
++
++ pq->worker(s, s->jbc->offloadq[pass_queue_inc_job_n(pq)]);
++ // * should really set jb->passes_done here
++
++ sem_post(pq->psem_out);
++ }
++ return NULL;
++}
++
++static void pass_queues_start_all(HEVCRpiContext *const s)
++{
++ unsigned int i;
++ HEVCRpiPassQueue * const pqs = s->passq;
++
++ for (i = 0; i != RPI_PASSES; ++i)
++ {
++ av_assert0(pthread_create(&pqs[i].thread, NULL, pass_worker, pqs + i) == 0);
++ pqs[i].started = 1;
++ }
++}
++
++static void pass_queues_term_all(HEVCRpiContext *const s)
++{
++ unsigned int i;
++ HEVCRpiPassQueue * const pqs = s->passq;
++
++ for (i = 0; i != RPI_PASSES; ++i)
++ pqs[i].terminate = 1;
++ for (i = 0; i != RPI_PASSES; ++i)
++ {
++ if (pqs[i].started)
++ sem_post(&pqs[i].sem_in);
++ }
++ for (i = 0; i != RPI_PASSES; ++i)
++ {
++ if (pqs[i].started) {
++ pthread_join(pqs[i].thread, NULL);
++ pqs[i].started = 0;
++ }
++ }
++}
++
++static void pass_queues_kill_all(HEVCRpiContext *const s)
++{
++ unsigned int i;
++ HEVCRpiPassQueue * const pqs = s->passq;
++
++ for (i = 0; i != RPI_PASSES; ++i)
++ pass_queue_kill(pqs + i);
++}
++
++
++static void worker_pic_free_one(HEVCRpiJob * const jb)
++{
++ // Free coeff stuff - allocation not the same for all buffers
++ HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
++
++ if (cf->s[0].buf != NULL)
++ av_freep(&cf->mptr);
++ if (cf->s[2].buf != NULL)
++ gpu_free(&cf->gptr);
++ memset(cf, 0, sizeof(*cf));
++}
++
++static int worker_pic_alloc_one(HEVCRpiJob * const jb, const unsigned int coeff_count)
++{
++ HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
++
++ if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0)
++ goto fail;
++ cf->s[2].buf = (int16_t *)cf->gptr.arm;
++ cf->s[3].buf = cf->s[2].buf + coeff_count;
++
++ // Must be 64 byte aligned for our zero zapping code so over-allocate &
++ // round
++ if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL)
++ goto fail;
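++    // Align up: adding 63 then clearing the low 6 bits rounds the pointer
++    // to the next 64-byte boundary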
++ cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63);
++ return 0;
++
++fail:
++ av_log(NULL, AV_LOG_ERROR, "%s: Allocation failed\n", __func__);
++ worker_pic_free_one(jb);
++ return -1;
++}
++
++static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf)
++{
++ unsigned int i;
++ for (i = 0; i != 4; ++i) {
++ cf->s[i].n = 0;
++ }
++}
++
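++// Claim n coefficients from buffer buf_no. Buffers 0-2 fill upwards from
++// their base; buffer 3 fills downwards from its base, which sits at the top
++// of the shared GPU allocation (see worker_pic_alloc_one)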
++int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n)
++{
++ HEVCRpiCoeffEnv *const cfe = jb->coeffs.s + buf_no;
++ int16_t * const coeffs = (buf_no != 3) ? cfe->buf + cfe->n : cfe->buf - (cfe->n + n);
++ cfe->n += n;
++ return coeffs;
++}
++
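++// Frame-progress protocol: a waiter that finds the target value not yet
++// reached queues a HEVCRpiFrameProgressWait (holding its request and a sem)
++// on the owning frame's state; ff_hevc_rpi_progress_signal_field() walks that
++// queue and posts the sem of every waiter whose request is now satisfied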
++void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++ const HEVCFrame * const ref, const int val, const int field)
++{
++ if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) {
++ HEVCRpiContext *const fs = ref->tf.owner[field]->priv_data;
++ HEVCRpiFrameProgressState * const pstate = fs->progress_states + field;
++ sem_t * sem = NULL;
++
++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
++ if (((volatile int *)ref->tf.progress->data)[field] < val) {
++ HEVCRpiFrameProgressWait * const pwait = &jb->progress_wait;
++
++ av_assert1(pwait->req == -1 && pwait->next == NULL);
++ jb->waited = 1; // Remember that we had to wait for later scheduling
++
++ pwait->req = val;
++ pwait->next = NULL;
++ if (pstate->first == NULL)
++ pstate->first = pwait;
++ else
++ pstate->last->next = pwait;
++ pstate->last = pwait;
++ sem = &pwait->sem;
++ }
++ pthread_mutex_unlock(&pstate->lock);
++
++ if (sem != NULL) {
++ rpi_sem_wait(sem);
++ }
++ }
++}
++
++void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field)
++{
++ HEVCRpiFrameProgressState *const pstate = s->progress_states + field;
++
++ ((int *)s->ref->tf.progress->data)[field] = val;
++
++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
++ {
++ HEVCRpiFrameProgressWait ** ppwait = &pstate->first;
++ HEVCRpiFrameProgressWait * pwait;
++
++ while ((pwait = *ppwait) != NULL) {
++ if (pwait->req > val)
++ {
++ ppwait = &pwait->next;
++ pstate->last = pwait;
++ }
++ else
++ {
++ *ppwait = pwait->next;
++ pwait->req = -1;
++ pwait->next = NULL;
++ sem_post(&pwait->sem);
++ }
++ }
++ }
++ pthread_mutex_unlock(&pstate->lock);
++}
++
++static void ff_hevc_rpi_progress_init_state(HEVCRpiFrameProgressState * const pstate)
++{
++ pstate->first = NULL;
++ pstate->last = NULL;
++ pthread_mutex_init(&pstate->lock, NULL);
++}
++
++static void ff_hevc_rpi_progress_init_wait(HEVCRpiFrameProgressWait * const pwait)
++{
++ pwait->req = -1;
++ pwait->next = NULL;
++ sem_init(&pwait->sem, 0, 0);
++}
++
++static void ff_hevc_rpi_progress_kill_state(HEVCRpiFrameProgressState * const pstate)
++{
++ av_assert1(pstate->first == NULL);
++ pthread_mutex_destroy(&pstate->lock);
++}
++
++static void ff_hevc_rpi_progress_kill_wait(HEVCRpiFrameProgressWait * const pwait)
++{
++ sem_destroy(&pwait->sem);
++}
++
++
++/**
++ * NOTE: Each function hls_foo corresponds to the function foo in the
++ * specification (HLS stands for High Level Syntax).
++ */
++
++/**
++ * Section 5.7
++ */
++
++/* free everything allocated by pic_arrays_init() */
++static void pic_arrays_free(HEVCRpiContext *s)
++{
++#ifdef RPI_DEBLOCK_VPU
++ {
++ int i;
++ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) {
++ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
++
++ if (dvq->vpu_cmds_arm) {
++ gpu_free(&dvq->deblock_vpu_gmem);
++ dvq->vpu_cmds_arm = 0;
++ }
++ }
++ }
++#endif
++ av_freep(&s->sao);
++ av_freep(&s->deblock);
++
++ av_freep(&s->skip_flag);
++ av_freep(&s->tab_ct_depth);
++
++ av_freep(&s->tab_ipm);
++ av_freep(&s->cbf_luma);
++ av_freep(&s->is_pcm);
++
++ av_freep(&s->qp_y_tab);
++ av_freep(&s->tab_slice_address);
++ av_freep(&s->filter_slice_edges);
++
++ av_freep(&s->horizontal_bs);
++ av_freep(&s->vertical_bs);
++
++ av_freep(&s->sh.entry_point_offset);
++ av_freep(&s->sh.size);
++ av_freep(&s->sh.offset);
++
++ av_buffer_pool_uninit(&s->tab_mvf_pool);
++ av_buffer_pool_uninit(&s->rpl_tab_pool);
++}
++
++/* allocate arrays that depend on frame dimensions */
++static int pic_arrays_init(HEVCRpiContext *s, const HEVCRpiSPS *sps)
++{
++ int log2_min_cb_size = sps->log2_min_cb_size;
++ int width = sps->width;
++ int height = sps->height;
++ int pic_size_in_ctb = ((width >> log2_min_cb_size) + 1) *
++ ((height >> log2_min_cb_size) + 1);
++ int ctb_count = sps->ctb_width * sps->ctb_height;
++ int min_pu_size = sps->min_pu_width * sps->min_pu_height;
++
++#ifdef RPI_DEBLOCK_VPU
++ {
++ int i;
++ s->enable_rpi_deblock = !sps->sao_enabled;
++ s->setup_width = (sps->width+15) / 16;
++ s->setup_height = (sps->height+15) / 16;
++ s->uv_setup_width = ( (sps->width >> ctx_hshift(s, 1)) + 15) / 16;
++ s->uv_setup_height = ( (sps->height >> ctx_vshift(s, 1)) + 15) / 16;
++
++ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i)
++ {
++ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
++ const unsigned int cmd_size = (sizeof(*dvq->vpu_cmds_arm) * 3 + 15) & ~15;
++ const unsigned int y_size = (sizeof(*dvq->y_setup_arm) * s->setup_width * s->setup_height + 15) & ~15;
++ const unsigned int uv_size = (sizeof(*dvq->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height + 15) & ~15;
++            const unsigned int total_size = cmd_size + y_size + uv_size;
++ int p_vc;
++ uint8_t * p_arm;
++#if RPI_VPU_DEBLOCK_CACHED
++ gpu_malloc_cached(total_size, &dvq->deblock_vpu_gmem);
++#else
++ gpu_malloc_uncached(total_size, &dvq->deblock_vpu_gmem);
++#endif
++ p_vc = dvq->deblock_vpu_gmem.vc;
++ p_arm = dvq->deblock_vpu_gmem.arm;
++
++ // Zap all
++ memset(p_arm, 0, dvq->deblock_vpu_gmem.numbytes);
++
++ // Subdivide
++ dvq->vpu_cmds_arm = (void*)p_arm;
++ dvq->vpu_cmds_vc = p_vc;
++
++ p_arm += cmd_size;
++ p_vc += cmd_size;
++
++ dvq->y_setup_arm = (void*)p_arm;
++ dvq->y_setup_vc = (void*)p_vc;
++
++ p_arm += y_size;
++ p_vc += y_size;
++
++ dvq->uv_setup_arm = (void*)p_arm;
++ dvq->uv_setup_vc = (void*)p_vc;
++ }
++
++ s->dvq_n = 0;
++ s->dvq = s->dvq_ents + s->dvq_n;
++ }
++#endif
++
++ s->bs_width = (width >> 2) + 1;
++ s->bs_height = (height >> 2) + 1;
++
++ s->sao = av_mallocz_array(ctb_count, sizeof(*s->sao));
++ s->deblock = av_mallocz_array(ctb_count, sizeof(*s->deblock));
++ if (!s->sao || !s->deblock)
++ goto fail;
++
++ s->skip_flag = av_malloc_array(sps->min_cb_height, sps->min_cb_width);
++ s->tab_ct_depth = av_malloc_array(sps->min_cb_height, sps->min_cb_width);
++ if (!s->skip_flag || !s->tab_ct_depth)
++ goto fail;
++
++ s->cbf_luma = av_malloc_array(sps->min_tb_width, sps->min_tb_height);
++ s->tab_ipm = av_mallocz(min_pu_size);
++ s->is_pcm = av_malloc_array(sps->min_pu_width + 1, sps->min_pu_height + 1);
++ if (!s->tab_ipm || !s->cbf_luma || !s->is_pcm)
++ goto fail;
++
++ s->filter_slice_edges = av_mallocz(ctb_count);
++ s->tab_slice_address = av_malloc_array(pic_size_in_ctb,
++ sizeof(*s->tab_slice_address));
++ s->qp_y_tab = av_malloc_array(pic_size_in_ctb,
++ sizeof(*s->qp_y_tab));
++ if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address)
++ goto fail;
++
++ s->horizontal_bs = av_mallocz_array(s->bs_width, s->bs_height);
++ s->vertical_bs = av_mallocz_array(s->bs_width, s->bs_height);
++ if (!s->horizontal_bs || !s->vertical_bs)
++ goto fail;
++
++ s->tab_mvf_pool = av_buffer_pool_init(min_pu_size * sizeof(MvField),
++ av_buffer_allocz);
++ s->rpl_tab_pool = av_buffer_pool_init(ctb_count * sizeof(RefPicListTab),
++ av_buffer_allocz);
++ if (!s->tab_mvf_pool || !s->rpl_tab_pool)
++ goto fail;
++
++ return 0;
++
++fail:
++ pic_arrays_free(s);
++ return AVERROR(ENOMEM);
++}
++
++static void default_pred_weight_table(HEVCRpiContext * const s)
++{
++ unsigned int i;
++ s->sh.luma_log2_weight_denom = 0;
++ s->sh.chroma_log2_weight_denom = 0;
++ for (i = 0; i < s->sh.nb_refs[L0]; i++) {
++ s->sh.luma_weight_l0[i] = 1;
++ s->sh.luma_offset_l0[i] = 0;
++ s->sh.chroma_weight_l0[i][0] = 1;
++ s->sh.chroma_offset_l0[i][0] = 0;
++ s->sh.chroma_weight_l0[i][1] = 1;
++ s->sh.chroma_offset_l0[i][1] = 0;
++ }
++ for (i = 0; i < s->sh.nb_refs[L1]; i++) {
++ s->sh.luma_weight_l1[i] = 1;
++ s->sh.luma_offset_l1[i] = 0;
++ s->sh.chroma_weight_l1[i][0] = 1;
++ s->sh.chroma_offset_l1[i][0] = 0;
++ s->sh.chroma_weight_l1[i][1] = 1;
++ s->sh.chroma_offset_l1[i][1] = 0;
++ }
++}
++
++static int pred_weight_table(HEVCRpiContext *s, GetBitContext *gb)
++{
++ int i = 0;
++ int j = 0;
++ uint8_t luma_weight_l0_flag[16];
++ uint8_t chroma_weight_l0_flag[16];
++ uint8_t luma_weight_l1_flag[16];
++ uint8_t chroma_weight_l1_flag[16];
++ int luma_log2_weight_denom;
++
++ luma_log2_weight_denom = get_ue_golomb_long(gb);
++ if (luma_log2_weight_denom < 0 || luma_log2_weight_denom > 7)
++ av_log(s->avctx, AV_LOG_ERROR, "luma_log2_weight_denom %d is invalid\n", luma_log2_weight_denom);
++ s->sh.luma_log2_weight_denom = av_clip_uintp2(luma_log2_weight_denom, 3);
++ if (ctx_cfmt(s) != 0) {
++ int delta = get_se_golomb(gb);
++ s->sh.chroma_log2_weight_denom = av_clip_uintp2(s->sh.luma_log2_weight_denom + delta, 3);
++ }
++
++ for (i = 0; i < s->sh.nb_refs[L0]; i++) {
++ luma_weight_l0_flag[i] = get_bits1(gb);
++ if (!luma_weight_l0_flag[i]) {
++ s->sh.luma_weight_l0[i] = 1 << s->sh.luma_log2_weight_denom;
++ s->sh.luma_offset_l0[i] = 0;
++ }
++ }
++ if (ctx_cfmt(s) != 0) {
++ for (i = 0; i < s->sh.nb_refs[L0]; i++)
++ chroma_weight_l0_flag[i] = get_bits1(gb);
++ } else {
++ for (i = 0; i < s->sh.nb_refs[L0]; i++)
++ chroma_weight_l0_flag[i] = 0;
++ }
++ for (i = 0; i < s->sh.nb_refs[L0]; i++) {
++ if (luma_weight_l0_flag[i]) {
++ int delta_luma_weight_l0 = get_se_golomb(gb);
++ s->sh.luma_weight_l0[i] = (1 << s->sh.luma_log2_weight_denom) + delta_luma_weight_l0;
++ s->sh.luma_offset_l0[i] = get_se_golomb(gb);
++ }
++ if (chroma_weight_l0_flag[i]) {
++ for (j = 0; j < 2; j++) {
++ int delta_chroma_weight_l0 = get_se_golomb(gb);
++ int delta_chroma_offset_l0 = get_se_golomb(gb);
++
++ if ( (int8_t)delta_chroma_weight_l0 != delta_chroma_weight_l0
++ || delta_chroma_offset_l0 < -(1<<17) || delta_chroma_offset_l0 > (1<<17)) {
++ return AVERROR_INVALIDDATA;
++ }
++
++ s->sh.chroma_weight_l0[i][j] = (1 << s->sh.chroma_log2_weight_denom) + delta_chroma_weight_l0;
++ s->sh.chroma_offset_l0[i][j] = av_clip((delta_chroma_offset_l0 - ((128 * s->sh.chroma_weight_l0[i][j])
++ >> s->sh.chroma_log2_weight_denom) + 128), -128, 127);
++ }
++ } else {
++ s->sh.chroma_weight_l0[i][0] = 1 << s->sh.chroma_log2_weight_denom;
++ s->sh.chroma_offset_l0[i][0] = 0;
++ s->sh.chroma_weight_l0[i][1] = 1 << s->sh.chroma_log2_weight_denom;
++ s->sh.chroma_offset_l0[i][1] = 0;
++ }
++ }
++ if (s->sh.slice_type == HEVC_SLICE_B) {
++ for (i = 0; i < s->sh.nb_refs[L1]; i++) {
++ luma_weight_l1_flag[i] = get_bits1(gb);
++ if (!luma_weight_l1_flag[i]) {
++ s->sh.luma_weight_l1[i] = 1 << s->sh.luma_log2_weight_denom;
++ s->sh.luma_offset_l1[i] = 0;
++ }
++ }
++ if (ctx_cfmt(s) != 0) {
++ for (i = 0; i < s->sh.nb_refs[L1]; i++)
++ chroma_weight_l1_flag[i] = get_bits1(gb);
++ } else {
++ for (i = 0; i < s->sh.nb_refs[L1]; i++)
++ chroma_weight_l1_flag[i] = 0;
++ }
++ for (i = 0; i < s->sh.nb_refs[L1]; i++) {
++ if (luma_weight_l1_flag[i]) {
++ int delta_luma_weight_l1 = get_se_golomb(gb);
++ s->sh.luma_weight_l1[i] = (1 << s->sh.luma_log2_weight_denom) + delta_luma_weight_l1;
++ s->sh.luma_offset_l1[i] = get_se_golomb(gb);
++ }
++ if (chroma_weight_l1_flag[i]) {
++ for (j = 0; j < 2; j++) {
++ int delta_chroma_weight_l1 = get_se_golomb(gb);
++ int delta_chroma_offset_l1 = get_se_golomb(gb);
++
++ if ( (int8_t)delta_chroma_weight_l1 != delta_chroma_weight_l1
++ || delta_chroma_offset_l1 < -(1<<17) || delta_chroma_offset_l1 > (1<<17)) {
++ return AVERROR_INVALIDDATA;
++ }
++
++ s->sh.chroma_weight_l1[i][j] = (1 << s->sh.chroma_log2_weight_denom) + delta_chroma_weight_l1;
++ s->sh.chroma_offset_l1[i][j] = av_clip((delta_chroma_offset_l1 - ((128 * s->sh.chroma_weight_l1[i][j])
++ >> s->sh.chroma_log2_weight_denom) + 128), -128, 127);
++ }
++ } else {
++ s->sh.chroma_weight_l1[i][0] = 1 << s->sh.chroma_log2_weight_denom;
++ s->sh.chroma_offset_l1[i][0] = 0;
++ s->sh.chroma_weight_l1[i][1] = 1 << s->sh.chroma_log2_weight_denom;
++ s->sh.chroma_offset_l1[i][1] = 0;
++ }
++ }
++ }
++ return 0;
++}
++
++static int decode_lt_rps(HEVCRpiContext *s, LongTermRPS *rps, GetBitContext *gb)
++{
++ const HEVCRpiSPS *sps = s->ps.sps;
++ int max_poc_lsb = 1 << sps->log2_max_poc_lsb;
++ int prev_delta_msb = 0;
++ unsigned int nb_sps = 0, nb_sh;
++ int i;
++
++ rps->nb_refs = 0;
++ if (!sps->long_term_ref_pics_present_flag)
++ return 0;
++
++ if (sps->num_long_term_ref_pics_sps > 0)
++ nb_sps = get_ue_golomb_long(gb);
++ nb_sh = get_ue_golomb_long(gb);
++
++ if (nb_sps > sps->num_long_term_ref_pics_sps)
++ return AVERROR_INVALIDDATA;
++ if (nb_sh + (uint64_t)nb_sps > FF_ARRAY_ELEMS(rps->poc))
++ return AVERROR_INVALIDDATA;
++
++ rps->nb_refs = nb_sh + nb_sps;
++
++ for (i = 0; i < rps->nb_refs; i++) {
++ uint8_t delta_poc_msb_present;
++
++ if (i < nb_sps) {
++ uint8_t lt_idx_sps = 0;
++
++ if (sps->num_long_term_ref_pics_sps > 1)
++ lt_idx_sps = get_bits(gb, av_ceil_log2(sps->num_long_term_ref_pics_sps));
++
++ rps->poc[i] = sps->lt_ref_pic_poc_lsb_sps[lt_idx_sps];
++ rps->used[i] = sps->used_by_curr_pic_lt_sps_flag[lt_idx_sps];
++ } else {
++ rps->poc[i] = get_bits(gb, sps->log2_max_poc_lsb);
++ rps->used[i] = get_bits1(gb);
++ }
++
++ delta_poc_msb_present = get_bits1(gb);
++ if (delta_poc_msb_present) {
++ int64_t delta = get_ue_golomb_long(gb);
++ int64_t poc;
++
++ if (i && i != nb_sps)
++ delta += prev_delta_msb;
++
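++            // Reconstruct the full POC from its LSB: the current picture's
++            // MSB part is (s->poc - pic_order_cnt_lsb), from which delta
++            // whole LSB cycles (delta * max_poc_lsb) are stepped back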
++ poc = rps->poc[i] + s->poc - delta * max_poc_lsb - s->sh.pic_order_cnt_lsb;
++ if (poc != (int32_t)poc)
++ return AVERROR_INVALIDDATA;
++ rps->poc[i] = poc;
++ prev_delta_msb = delta;
++ }
++ }
++
++ return 0;
++}
++
++static void export_stream_params(AVCodecContext *avctx, const HEVCRpiParamSets *ps,
++ const HEVCRpiSPS *sps)
++{
++ const HEVCRpiVPS *vps = (const HEVCRpiVPS*)ps->vps_list[sps->vps_id]->data;
++ const HEVCWindow *ow = &sps->output_window;
++ unsigned int num = 0, den = 0;
++
++ avctx->pix_fmt = sps->pix_fmt;
++ avctx->coded_width = sps->width;
++ avctx->coded_height = sps->height;
++ avctx->width = sps->width - ow->left_offset - ow->right_offset;
++ avctx->height = sps->height - ow->top_offset - ow->bottom_offset;
++ avctx->has_b_frames = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics;
++ avctx->profile = sps->ptl.general_ptl.profile_idc;
++ avctx->level = sps->ptl.general_ptl.level_idc;
++
++ ff_set_sar(avctx, sps->vui.sar);
++
++ if (sps->vui.video_signal_type_present_flag)
++ avctx->color_range = sps->vui.video_full_range_flag ? AVCOL_RANGE_JPEG
++ : AVCOL_RANGE_MPEG;
++ else
++ avctx->color_range = AVCOL_RANGE_MPEG;
++
++ if (sps->vui.colour_description_present_flag) {
++ avctx->color_primaries = sps->vui.colour_primaries;
++ avctx->color_trc = sps->vui.transfer_characteristic;
++ avctx->colorspace = sps->vui.matrix_coeffs;
++ } else {
++ avctx->color_primaries = AVCOL_PRI_UNSPECIFIED;
++ avctx->color_trc = AVCOL_TRC_UNSPECIFIED;
++ avctx->colorspace = AVCOL_SPC_UNSPECIFIED;
++ }
++
++ if (vps->vps_timing_info_present_flag) {
++ num = vps->vps_num_units_in_tick;
++ den = vps->vps_time_scale;
++ } else if (sps->vui.vui_timing_info_present_flag) {
++ num = sps->vui.vui_num_units_in_tick;
++ den = sps->vui.vui_time_scale;
++ }
++
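++    // e.g. num_units_in_tick = 1001, time_scale = 60000 gives a frame
++    // duration of 1001/60000 s, i.e. a framerate of 60000/1001 (~59.94 fps)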
++ if (num != 0 && den != 0)
++ av_reduce(&avctx->framerate.den, &avctx->framerate.num,
++ num, den, 1 << 30);
++}
++
++static enum AVPixelFormat get_format(HEVCRpiContext *s, const HEVCRpiSPS *sps)
++{
++ enum AVPixelFormat pix_fmts[4], *fmt = pix_fmts;
++
++ // Admit to no h/w formats
++
++ *fmt++ = sps->pix_fmt;
++ *fmt = AV_PIX_FMT_NONE;
++
++    return pix_fmts[0] == AV_PIX_FMT_NONE ? AV_PIX_FMT_NONE : ff_thread_get_format(s->avctx, pix_fmts);
++}
++
++static int is_sps_supported(const HEVCRpiSPS * const sps)
++{
++ return av_rpi_is_sand_format(sps->pix_fmt) &&
++ sps->width <= HEVC_RPI_MAX_WIDTH &&
++ sps->height <= HEVC_RPI_MAX_HEIGHT;
++}
++
++static int set_sps(HEVCRpiContext * const s, const HEVCRpiSPS * const sps,
++ const enum AVPixelFormat pix_fmt)
++{
++ int ret;
++
++ pic_arrays_free(s);
++ s->ps.sps = NULL;
++ s->ps.vps = NULL;
++
++ if (sps == NULL)
++ return 0;
++
++ if (!is_sps_supported(sps))
++ return AVERROR_DECODER_NOT_FOUND;
++
++ ret = pic_arrays_init(s, sps);
++ if (ret < 0)
++ goto fail;
++
++ export_stream_params(s->avctx, &s->ps, sps);
++
++ s->avctx->pix_fmt = pix_fmt;
++
++ ff_hevc_rpi_pred_init(&s->hpc, sps->bit_depth);
++ ff_hevc_rpi_dsp_init (&s->hevcdsp, sps->bit_depth);
++ ff_videodsp_init (&s->vdsp, sps->bit_depth);
++
++ // * We don't support cross_component_prediction_enabled_flag but as that
++ // must be 0 unless we have 4:4:4 there is no point testing for it as we
++ // only deal with sand which is never 4:4:4
++ // [support wouldn't be hard]
++
++ rpi_hevc_qpu_set_fns(s, sps->bit_depth);
++
++ av_freep(&s->sao_pixel_buffer_h[0]);
++ av_freep(&s->sao_pixel_buffer_v[0]);
++
++ if (sps->sao_enabled)
++ {
++ const unsigned int c_count = (ctx_cfmt(s) != 0) ? 3 : 1;
++ unsigned int c_idx;
++ size_t vsize[3] = {0};
++ size_t hsize[3] = {0};
++
++ for(c_idx = 0; c_idx < c_count; c_idx++) {
++ int w = sps->width >> ctx_hshift(s, c_idx);
++ int h = sps->height >> ctx_vshift(s, c_idx);
++            // ctb height & width are a min of 8 so this must be a multiple of 16
++ // so no point rounding up!
++ hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift;
++ vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift;
++ }
++
++ // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2]
++ // when we have plaited chroma
++ s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]);
++ s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]);
++ s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0];
++ s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1];
++ s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0];
++ s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1];
++ }
++
++ s->ps.sps = sps;
++ s->ps.vps = (HEVCRpiVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data;
++
++ return 0;
++
++fail:
++ pic_arrays_free(s);
++ s->ps.sps = NULL;
++ return ret;
++}
++
++static int hls_slice_header(HEVCRpiContext *s)
++{
++ GetBitContext *gb = &s->HEVClc->gb;
++ SliceHeader *sh = &s->sh;
++ int i, ret;
++
++ // Coded parameters
++ sh->first_slice_in_pic_flag = get_bits1(gb);
++ if ((IS_IDR(s) || IS_BLA(s)) && sh->first_slice_in_pic_flag) {
++ s->seq_decode = (s->seq_decode + 1) & 0xff;
++ s->max_ra = INT_MAX;
++ if (IS_IDR(s))
++ ff_hevc_rpi_clear_refs(s);
++ }
++ sh->no_output_of_prior_pics_flag = 0;
++ if (IS_IRAP(s))
++ sh->no_output_of_prior_pics_flag = get_bits1(gb);
++
++ sh->pps_id = get_ue_golomb_long(gb);
++ if (sh->pps_id >= HEVC_MAX_PPS_COUNT || !s->ps.pps_list[sh->pps_id]) {
++ av_log(s->avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", sh->pps_id);
++ return AVERROR_INVALIDDATA;
++ }
++ if (!sh->first_slice_in_pic_flag &&
++ s->ps.pps != (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data) {
++ av_log(s->avctx, AV_LOG_ERROR, "PPS changed between slices.\n");
++ return AVERROR_INVALIDDATA;
++ }
++ s->ps.pps = (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data;
++ if (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos == 1)
++ sh->no_output_of_prior_pics_flag = 1;
++
++ if (s->ps.sps != (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) {
++ const HEVCRpiSPS *sps = (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data;
++ const HEVCRpiSPS *last_sps = s->ps.sps;
++
++ if (last_sps && IS_IRAP(s) && s->nal_unit_type != HEVC_NAL_CRA_NUT) {
++ if (sps->width != last_sps->width || sps->height != last_sps->height ||
++ sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering !=
++ last_sps->temporal_layer[last_sps->max_sub_layers - 1].max_dec_pic_buffering)
++ sh->no_output_of_prior_pics_flag = 0;
++ }
++ ff_hevc_rpi_clear_refs(s);
++
++ ret = set_sps(s, sps, get_format(s, sps));
++ if (ret < 0)
++ return ret;
++
++ s->seq_decode = (s->seq_decode + 1) & 0xff;
++ s->max_ra = INT_MAX;
++ }
++
++ sh->dependent_slice_segment_flag = 0;
++ if (!sh->first_slice_in_pic_flag) {
++ int slice_address_length;
++
++ if (s->ps.pps->dependent_slice_segments_enabled_flag)
++ sh->dependent_slice_segment_flag = get_bits1(gb);
++
++ slice_address_length = av_ceil_log2(s->ps.sps->ctb_width *
++ s->ps.sps->ctb_height);
++ sh->slice_segment_addr = get_bitsz(gb, slice_address_length);
++ if (sh->slice_segment_addr >= s->ps.sps->ctb_width * s->ps.sps->ctb_height) {
++ av_log(s->avctx, AV_LOG_ERROR,
++ "Invalid slice segment address: %u.\n",
++ sh->slice_segment_addr);
++ return AVERROR_INVALIDDATA;
++ }
++
++ if (!sh->dependent_slice_segment_flag) {
++ sh->slice_addr = sh->slice_segment_addr;
++ s->slice_idx++;
++ }
++ } else {
++ sh->slice_segment_addr = sh->slice_addr = 0;
++ s->slice_idx = 0;
++ s->slice_initialized = 0;
++ }
++
++ if (!sh->dependent_slice_segment_flag) {
++ s->slice_initialized = 0;
++
++ for (i = 0; i < s->ps.pps->num_extra_slice_header_bits; i++)
++ skip_bits(gb, 1); // slice_reserved_undetermined_flag[]
++
++ sh->slice_type = get_ue_golomb_long(gb);
++ if (!(sh->slice_type == HEVC_SLICE_I ||
++ sh->slice_type == HEVC_SLICE_P ||
++ sh->slice_type == HEVC_SLICE_B)) {
++ av_log(s->avctx, AV_LOG_ERROR, "Unknown slice type: %d.\n",
++ sh->slice_type);
++ return AVERROR_INVALIDDATA;
++ }
++ if (IS_IRAP(s) && sh->slice_type != HEVC_SLICE_I) {
++ av_log(s->avctx, AV_LOG_ERROR, "Inter slices in an IRAP frame.\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ // when flag is not present, picture is inferred to be output
++ sh->pic_output_flag = 1;
++ if (s->ps.pps->output_flag_present_flag)
++ sh->pic_output_flag = get_bits1(gb);
++
++ if (s->ps.sps->separate_colour_plane_flag)
++ sh->colour_plane_id = get_bits(gb, 2);
++
++ if (!IS_IDR(s)) {
++ int poc, pos;
++
++ sh->pic_order_cnt_lsb = get_bits(gb, s->ps.sps->log2_max_poc_lsb);
++ poc = ff_hevc_rpi_compute_poc(s->ps.sps, s->pocTid0, sh->pic_order_cnt_lsb, s->nal_unit_type);
++ if (!sh->first_slice_in_pic_flag && poc != s->poc) {
++ av_log(s->avctx, AV_LOG_WARNING,
++ "Ignoring POC change between slices: %d -> %d\n", s->poc, poc);
++ if (s->avctx->err_recognition & AV_EF_EXPLODE)
++ return AVERROR_INVALIDDATA;
++ poc = s->poc;
++ }
++ s->poc = poc;
++
++ sh->short_term_ref_pic_set_sps_flag = get_bits1(gb);
++ pos = get_bits_left(gb);
++ if (!sh->short_term_ref_pic_set_sps_flag) {
++ ret = ff_hevc_rpi_decode_short_term_rps(gb, s->avctx, &sh->slice_rps, s->ps.sps, 1);
++ if (ret < 0)
++ return ret;
++
++ sh->short_term_rps = &sh->slice_rps;
++ } else {
++ int numbits, rps_idx;
++
++ if (!s->ps.sps->nb_st_rps) {
++ av_log(s->avctx, AV_LOG_ERROR, "No ref lists in the SPS.\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ numbits = av_ceil_log2(s->ps.sps->nb_st_rps);
++ rps_idx = numbits > 0 ? get_bits(gb, numbits) : 0;
++ sh->short_term_rps = &s->ps.sps->st_rps[rps_idx];
++ }
++ sh->short_term_ref_pic_set_size = pos - get_bits_left(gb);
++
++ pos = get_bits_left(gb);
++ ret = decode_lt_rps(s, &sh->long_term_rps, gb);
++ if (ret < 0) {
++ av_log(s->avctx, AV_LOG_WARNING, "Invalid long term RPS.\n");
++ if (s->avctx->err_recognition & AV_EF_EXPLODE)
++ return AVERROR_INVALIDDATA;
++ }
++ sh->long_term_ref_pic_set_size = pos - get_bits_left(gb);
++
++ if (s->ps.sps->sps_temporal_mvp_enabled_flag)
++ sh->slice_temporal_mvp_enabled_flag = get_bits1(gb);
++ else
++ sh->slice_temporal_mvp_enabled_flag = 0;
++ } else {
++ s->sh.short_term_rps = NULL;
++ s->poc = 0;
++ }
++
++ /* 8.3.1 */
++ if (sh->first_slice_in_pic_flag && s->temporal_id == 0 &&
++ s->nal_unit_type != HEVC_NAL_TRAIL_N &&
++ s->nal_unit_type != HEVC_NAL_TSA_N &&
++ s->nal_unit_type != HEVC_NAL_STSA_N &&
++ s->nal_unit_type != HEVC_NAL_RADL_N &&
++ s->nal_unit_type != HEVC_NAL_RADL_R &&
++ s->nal_unit_type != HEVC_NAL_RASL_N &&
++ s->nal_unit_type != HEVC_NAL_RASL_R)
++ s->pocTid0 = s->poc;
++
++ if (s->ps.sps->sao_enabled) {
++ sh->slice_sample_adaptive_offset_flag[0] = get_bits1(gb);
++ if (ctx_cfmt(s) != 0) {
++ sh->slice_sample_adaptive_offset_flag[1] =
++ sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb);
++ }
++ } else {
++ sh->slice_sample_adaptive_offset_flag[0] = 0;
++ sh->slice_sample_adaptive_offset_flag[1] = 0;
++ sh->slice_sample_adaptive_offset_flag[2] = 0;
++ }
++
++ sh->nb_refs[L0] = sh->nb_refs[L1] = 0;
++ if (sh->slice_type == HEVC_SLICE_P || sh->slice_type == HEVC_SLICE_B) {
++ int nb_refs;
++
++ sh->nb_refs[L0] = s->ps.pps->num_ref_idx_l0_default_active;
++ if (sh->slice_type == HEVC_SLICE_B)
++ sh->nb_refs[L1] = s->ps.pps->num_ref_idx_l1_default_active;
++
++ if (get_bits1(gb)) { // num_ref_idx_active_override_flag
++ sh->nb_refs[L0] = get_ue_golomb_long(gb) + 1;
++ if (sh->slice_type == HEVC_SLICE_B)
++ sh->nb_refs[L1] = get_ue_golomb_long(gb) + 1;
++ }
++ if (sh->nb_refs[L0] > HEVC_MAX_REFS || sh->nb_refs[L1] > HEVC_MAX_REFS) {
++ av_log(s->avctx, AV_LOG_ERROR, "Too many refs: %d/%d.\n",
++ sh->nb_refs[L0], sh->nb_refs[L1]);
++ return AVERROR_INVALIDDATA;
++ }
++
++ sh->rpl_modification_flag[0] = 0;
++ sh->rpl_modification_flag[1] = 0;
++ nb_refs = ff_hevc_rpi_frame_nb_refs(s);
++ if (!nb_refs) {
++ av_log(s->avctx, AV_LOG_ERROR, "Zero refs for a frame with P or B slices.\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ if (s->ps.pps->lists_modification_present_flag && nb_refs > 1) {
++ sh->rpl_modification_flag[0] = get_bits1(gb);
++ if (sh->rpl_modification_flag[0]) {
++ for (i = 0; i < sh->nb_refs[L0]; i++)
++ sh->list_entry_lx[0][i] = get_bits(gb, av_ceil_log2(nb_refs));
++ }
++
++ if (sh->slice_type == HEVC_SLICE_B) {
++ sh->rpl_modification_flag[1] = get_bits1(gb);
++ if (sh->rpl_modification_flag[1] == 1)
++ for (i = 0; i < sh->nb_refs[L1]; i++)
++ sh->list_entry_lx[1][i] = get_bits(gb, av_ceil_log2(nb_refs));
++ }
++ }
++
++ if (sh->slice_type == HEVC_SLICE_B)
++ sh->mvd_l1_zero_flag = get_bits1(gb);
++
++ if (s->ps.pps->cabac_init_present_flag)
++ sh->cabac_init_flag = get_bits1(gb);
++ else
++ sh->cabac_init_flag = 0;
++
++ sh->collocated_ref_idx = 0;
++ if (sh->slice_temporal_mvp_enabled_flag) {
++ sh->collocated_list = L0;
++ if (sh->slice_type == HEVC_SLICE_B)
++ sh->collocated_list = !get_bits1(gb);
++
++ if (sh->nb_refs[sh->collocated_list] > 1) {
++ sh->collocated_ref_idx = get_ue_golomb_long(gb);
++ if (sh->collocated_ref_idx >= sh->nb_refs[sh->collocated_list]) {
++ av_log(s->avctx, AV_LOG_ERROR,
++ "Invalid collocated_ref_idx: %d.\n",
++ sh->collocated_ref_idx);
++ return AVERROR_INVALIDDATA;
++ }
++ }
++ }
++
++ if ((s->ps.pps->weighted_pred_flag && sh->slice_type == HEVC_SLICE_P) ||
++ (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B)) {
++ int ret = pred_weight_table(s, gb);
++ if (ret < 0)
++ return ret;
++ }
++ else
++ {
++ // Give us unit weights
++ default_pred_weight_table(s);
++ }
++
++ sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
++ if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
++ av_log(s->avctx, AV_LOG_ERROR,
++ "Invalid number of merging MVP candidates: %d.\n",
++ sh->max_num_merge_cand);
++ return AVERROR_INVALIDDATA;
++ }
++ }
++
++ sh->slice_qp_delta = get_se_golomb(gb);
++
++ if (s->ps.pps->pic_slice_level_chroma_qp_offsets_present_flag) {
++ sh->slice_cb_qp_offset = get_se_golomb(gb);
++ sh->slice_cr_qp_offset = get_se_golomb(gb);
++ } else {
++ sh->slice_cb_qp_offset = 0;
++ sh->slice_cr_qp_offset = 0;
++ }
++
++ if (s->ps.pps->chroma_qp_offset_list_enabled_flag)
++ sh->cu_chroma_qp_offset_enabled_flag = get_bits1(gb);
++ else
++ sh->cu_chroma_qp_offset_enabled_flag = 0;
++
++ if (s->ps.pps->deblocking_filter_control_present_flag) {
++ int deblocking_filter_override_flag = 0;
++
++ if (s->ps.pps->deblocking_filter_override_enabled_flag)
++ deblocking_filter_override_flag = get_bits1(gb);
++
++ if (deblocking_filter_override_flag) {
++ sh->disable_deblocking_filter_flag = get_bits1(gb);
++ if (!sh->disable_deblocking_filter_flag) {
++ int beta_offset_div2 = get_se_golomb(gb);
++ int tc_offset_div2 = get_se_golomb(gb);
++ if (beta_offset_div2 < -6 || beta_offset_div2 > 6 ||
++ tc_offset_div2 < -6 || tc_offset_div2 > 6) {
++ av_log(s->avctx, AV_LOG_ERROR,
++ "Invalid deblock filter offsets: %d, %d\n",
++ beta_offset_div2, tc_offset_div2);
++ return AVERROR_INVALIDDATA;
++ }
++ sh->beta_offset = beta_offset_div2 * 2;
++ sh->tc_offset = tc_offset_div2 * 2;
++ }
++ } else {
++ sh->disable_deblocking_filter_flag = s->ps.pps->disable_dbf;
++ sh->beta_offset = s->ps.pps->beta_offset;
++ sh->tc_offset = s->ps.pps->tc_offset;
++ }
++ } else {
++ sh->disable_deblocking_filter_flag = 0;
++ sh->beta_offset = 0;
++ sh->tc_offset = 0;
++ }
++
++ if (s->ps.pps->seq_loop_filter_across_slices_enabled_flag &&
++ (sh->slice_sample_adaptive_offset_flag[0] ||
++ sh->slice_sample_adaptive_offset_flag[1] ||
++ !sh->disable_deblocking_filter_flag)) {
++ sh->slice_loop_filter_across_slices_enabled_flag = get_bits1(gb);
++ } else {
++ sh->slice_loop_filter_across_slices_enabled_flag = s->ps.pps->seq_loop_filter_across_slices_enabled_flag;
++ }
++ } else if (!s->slice_initialized) {
++ av_log(s->avctx, AV_LOG_ERROR, "Independent slice segment missing.\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ sh->num_entry_point_offsets = 0;
++ if (s->ps.pps->tiles_enabled_flag || s->ps.pps->entropy_coding_sync_enabled_flag) {
++ unsigned num_entry_point_offsets = get_ue_golomb_long(gb);
++ // It would be possible to bound this more tightly, but this is simpler
++ if (num_entry_point_offsets > get_bits_left(gb)) {
++ av_log(s->avctx, AV_LOG_ERROR, "num_entry_point_offsets %d is invalid\n", num_entry_point_offsets);
++ return AVERROR_INVALIDDATA;
++ }
++
++ sh->num_entry_point_offsets = num_entry_point_offsets;
++ if (sh->num_entry_point_offsets > 0) {
++ int offset_len = get_ue_golomb_long(gb) + 1;
++
++ if (offset_len < 1 || offset_len > 32) {
++ sh->num_entry_point_offsets = 0;
++ av_log(s->avctx, AV_LOG_ERROR, "offset_len %d is invalid\n", offset_len);
++ return AVERROR_INVALIDDATA;
++ }
++
++ av_freep(&sh->entry_point_offset);
++ av_freep(&sh->offset);
++ av_freep(&sh->size);
++ sh->entry_point_offset = av_malloc_array(sh->num_entry_point_offsets, sizeof(unsigned));
++ sh->offset = av_malloc_array(sh->num_entry_point_offsets, sizeof(int));
++ sh->size = av_malloc_array(sh->num_entry_point_offsets, sizeof(int));
++ if (!sh->entry_point_offset || !sh->offset || !sh->size) {
++ sh->num_entry_point_offsets = 0;
++ av_log(s->avctx, AV_LOG_ERROR, "Failed to allocate memory\n");
++ return AVERROR(ENOMEM);
++ }
++ for (i = 0; i < sh->num_entry_point_offsets; i++) {
++ unsigned val = get_bits_long(gb, offset_len);
++ sh->entry_point_offset[i] = val + 1; // +1 to get the size
++ }
++ if (s->threads_number > 1 && (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)) {
++ s->enable_parallel_tiles = 0; // TODO: you can enable tiles in parallel here
++ s->threads_number = 1;
++ } else
++ s->enable_parallel_tiles = 0;
++ } else
++ s->enable_parallel_tiles = 0;
++ }
++
++ if (s->ps.pps->slice_header_extension_present_flag) {
++ unsigned int length = get_ue_golomb_long(gb);
++ if (length*8LL > get_bits_left(gb)) {
++ av_log(s->avctx, AV_LOG_ERROR, "too many slice_header_extension_data_bytes\n");
++ return AVERROR_INVALIDDATA;
++ }
++ for (i = 0; i < length; i++)
++ skip_bits(gb, 8); // slice_header_extension_data_byte
++ }
++
++ // Inferred parameters
++ sh->slice_qp = 26U + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta;
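++ // qp_bd_offset is 6 * (bit_depth - 8), so the permitted range is
++ // [0, 51] for 8-bit streams and [-12, 51] for 10-bit streams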
++ if (sh->slice_qp > 51 ||
++ sh->slice_qp < -s->ps.sps->qp_bd_offset) {
++ av_log(s->avctx, AV_LOG_ERROR,
++ "The slice_qp %d is outside the valid range "
++ "[%d, 51].\n",
++ sh->slice_qp,
++ -s->ps.sps->qp_bd_offset);
++ return AVERROR_INVALIDDATA;
++ }
++
++ sh->slice_ctb_addr_rs = sh->slice_segment_addr;
++
++ if (!s->sh.slice_ctb_addr_rs && s->sh.dependent_slice_segment_flag) {
++ av_log(s->avctx, AV_LOG_ERROR, "Impossible slice segment.\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ if (get_bits_left(gb) < 0) {
++ av_log(s->avctx, AV_LOG_ERROR,
++ "Overread slice header by %d bits\n", -get_bits_left(gb));
++ return AVERROR_INVALIDDATA;
++ }
++
++ s->slice_initialized = 1;
++ return 0;
++}
++
++static void hls_sao_param(const HEVCRpiContext *s, HEVCRpiLocalContext * const lc, const int rx, const int ry)
++{
++ SAOParams * const sao = s->sao + rx + ry * s->ps.sps->ctb_width;
++ int c_idx, i;
++
++ if (s->sh.slice_sample_adaptive_offset_flag[0] ||
++ s->sh.slice_sample_adaptive_offset_flag[1]) {
++ if (lc->ctb_left_flag)
++ {
++ const int sao_merge_left_flag = ff_hevc_rpi_sao_merge_flag_decode(lc);
++ if (sao_merge_left_flag) {
++ *sao = sao[-1];
++ return;
++ }
++ }
++ if (lc->ctb_up_flag)
++ {
++ const int sao_merge_up_flag = ff_hevc_rpi_sao_merge_flag_decode(lc);
++ if (sao_merge_up_flag) {
++ *sao = sao[-(int)s->ps.sps->ctb_width];
++ return;
++ }
++ }
++ }
++
++ for (c_idx = 0; c_idx < (ctx_cfmt(s) != 0 ? 3 : 1); c_idx++) {
++ const unsigned int log2_sao_offset_scale = c_idx == 0 ? s->ps.pps->log2_sao_offset_scale_luma :
++ s->ps.pps->log2_sao_offset_scale_chroma;
++ int offset_abs[4];
++ char offset_sign[4] = {0};
++
++ if (!s->sh.slice_sample_adaptive_offset_flag[c_idx]) {
++ sao->type_idx[c_idx] = SAO_NOT_APPLIED;
++ continue;
++ }
++
++ if (c_idx == 2) {
++ sao->type_idx[2] = sao->type_idx[1];
++ sao->eo_class[2] = sao->eo_class[1];
++ } else {
++ sao->type_idx[c_idx] = ff_hevc_rpi_sao_type_idx_decode(lc);
++ }
++
++ // ** Could quite plausibly use BY22 here - this is all bypass stuff,
++ // though it is only per CTB so not very timing critical
++
++ if (sao->type_idx[c_idx] == SAO_NOT_APPLIED)
++ continue;
++
++ for (i = 0; i < 4; i++)
++ offset_abs[i] = ff_hevc_rpi_sao_offset_abs_decode(s, lc);
++
++ if (sao->type_idx[c_idx] == SAO_BAND) {
++ for (i = 0; i < 4; i++) {
++ if (offset_abs[i] != 0)
++ offset_sign[i] = ff_hevc_rpi_sao_offset_sign_decode(lc);
++ }
++ sao->band_position[c_idx] = ff_hevc_rpi_sao_band_position_decode(lc);
++ } else if (c_idx != 2) {
++ sao->eo_class[c_idx] = ff_hevc_rpi_sao_eo_class_decode(lc);
++ }
++
++ // Inferred parameters
++ sao->offset_val[c_idx][0] = 0;
++ for (i = 0; i < 4; i++) {
++ sao->offset_val[c_idx][i + 1] = offset_abs[i] << log2_sao_offset_scale;
++ if (sao->type_idx[c_idx] == SAO_EDGE) {
++ if (i > 1)
++ sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1];
++ } else if (offset_sign[i]) {
++ sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1];
++ }
++ }
++ }
++}
++
++
++static int hls_cross_component_pred(HEVCRpiLocalContext * const lc, const int idx) {
++ int log2_res_scale_abs_plus1 = ff_hevc_rpi_log2_res_scale_abs(lc, idx);
++
++ if (log2_res_scale_abs_plus1 != 0) {
++ int res_scale_sign_flag = ff_hevc_rpi_res_scale_sign_flag(lc, idx);
++ lc->tu.res_scale_val = (1 << (log2_res_scale_abs_plus1 - 1)) *
++ (1 - 2 * res_scale_sign_flag);
++ } else {
++ lc->tu.res_scale_val = 0;
++ }
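++ // e.g. log2_res_scale_abs_plus1 == 3 with res_scale_sign_flag == 1 gives
++ // res_scale_val = (1 << 2) * (1 - 2) == -4; the chroma residual is then
++ // predicted as res_scale_val/8 of the luma residual (the ">> 3" in the
++ // cross-component paths of hls_transform_unit below)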
++
++
++ return 0;
++}
++
++static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCRpiJob * const jb)
++{
++ return jb->intra.cmds + jb->intra.n++;
++}
++
++static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx)
++{
++ // If rpi is enabled the frame is in sand format - U & V are both done on the U call
++ if (c_idx <= 1)
++ {
++ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0);
++ cmd->type = RPI_PRED_INTRA;
++ cmd->size = log2_trafo_size;
++ cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right;
++ cmd->c_idx = c_idx;
++ cmd->i_pred.x = x0;
++ cmd->i_pred.y = y0;
++ cmd->i_pred.mode = c_idx ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode;
++ }
++}
++
++static int hls_transform_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0,
++ int xBase, int yBase, int cb_xBase, int cb_yBase,
++ int log2_cb_size, int log2_trafo_size,
++ int blk_idx, int cbf_luma, int *cbf_cb, int *cbf_cr)
++{
++// const int log2_trafo_size_c = log2_trafo_size - s->ps.sps->hshift[1];
++ const int log2_trafo_size_c = log2_trafo_size - ctx_hshift(s, 1);
++ int i;
++
++ if (lc->cu.pred_mode == MODE_INTRA) {
++ int trafo_size = 1 << log2_trafo_size;
++ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0, trafo_size, trafo_size);
++ do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0);
++ }
++
++ if (cbf_luma || cbf_cb[0] || cbf_cr[0] ||
++ (ctx_cfmt(s) == 2 && (cbf_cb[1] || cbf_cr[1]))) {
++ int scan_idx = SCAN_DIAG;
++ int scan_idx_c = SCAN_DIAG;
++ int cbf_chroma = cbf_cb[0] || cbf_cr[0] ||
++ (ctx_cfmt(s) == 2 &&
++ (cbf_cb[1] || cbf_cr[1]));
++
++ if (s->ps.pps->cu_qp_delta_enabled_flag && !lc->tu.is_cu_qp_delta_coded) {
++ lc->tu.cu_qp_delta = ff_hevc_rpi_cu_qp_delta_abs(lc);
++ if (lc->tu.cu_qp_delta != 0)
++ if (ff_hevc_rpi_cu_qp_delta_sign_flag(lc) == 1)
++ lc->tu.cu_qp_delta = -lc->tu.cu_qp_delta;
++ lc->tu.is_cu_qp_delta_coded = 1;
++
++ if (lc->tu.cu_qp_delta < -(26 + s->ps.sps->qp_bd_offset / 2) ||
++ lc->tu.cu_qp_delta > (25 + s->ps.sps->qp_bd_offset / 2)) {
++ av_log(s->avctx, AV_LOG_ERROR,
++ "The cu_qp_delta %d is outside the valid range "
++ "[%d, %d].\n",
++ lc->tu.cu_qp_delta,
++ -(26 + s->ps.sps->qp_bd_offset / 2),
++ (25 + s->ps.sps->qp_bd_offset / 2));
++ return AVERROR_INVALIDDATA;
++ }
++
++ ff_hevc_rpi_set_qPy(s, lc, cb_xBase, cb_yBase, log2_cb_size);
++ }
++
++ if (!lc->tu.is_cu_chroma_qp_offset_coded && cbf_chroma &&
++ !lc->cu.cu_transquant_bypass_flag) {
++ int cu_chroma_qp_offset_flag = ff_hevc_rpi_cu_chroma_qp_offset_flag(lc);
++ if (cu_chroma_qp_offset_flag) {
++ int cu_chroma_qp_offset_idx = 0;
++ if (s->ps.pps->chroma_qp_offset_list_len_minus1 > 0) {
++ cu_chroma_qp_offset_idx = ff_hevc_rpi_cu_chroma_qp_offset_idx(s, lc);
++ av_log(s->avctx, AV_LOG_ERROR,
++ "cu_chroma_qp_offset_idx not yet tested.\n");
++ }
++ lc->tu.cu_qp_offset_cb = s->ps.pps->cb_qp_offset_list[cu_chroma_qp_offset_idx];
++ lc->tu.cu_qp_offset_cr = s->ps.pps->cr_qp_offset_list[cu_chroma_qp_offset_idx];
++ }
++ lc->tu.is_cu_chroma_qp_offset_coded = 1;
++ }
++
++ if (lc->cu.pred_mode == MODE_INTRA && log2_trafo_size < 4) {
++ if (lc->tu.intra_pred_mode >= 6 &&
++ lc->tu.intra_pred_mode <= 14) {
++ scan_idx = SCAN_VERT;
++ } else if (lc->tu.intra_pred_mode >= 22 &&
++ lc->tu.intra_pred_mode <= 30) {
++ scan_idx = SCAN_HORIZ;
++ }
++
++ if (lc->tu.intra_pred_mode_c >= 6 &&
++ lc->tu.intra_pred_mode_c <= 14) {
++ scan_idx_c = SCAN_VERT;
++ } else if (lc->tu.intra_pred_mode_c >= 22 &&
++ lc->tu.intra_pred_mode_c <= 30) {
++ scan_idx_c = SCAN_HORIZ;
++ }
++ }
++
++ lc->tu.cross_pf = 0;
++
++ if (cbf_luma)
++ ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0, log2_trafo_size, scan_idx, 0);
++ if (ctx_cfmt(s) != 0 && (log2_trafo_size > 2 || ctx_cfmt(s) == 3)) {
++ const int trafo_size_h = 1 << (log2_trafo_size_c + ctx_hshift(s, 1));
++ const int trafo_size_v = 1 << (log2_trafo_size_c + ctx_vshift(s, 1));
++ lc->tu.cross_pf = (s->ps.pps->cross_component_prediction_enabled_flag && cbf_luma &&
++ (lc->cu.pred_mode == MODE_INTER ||
++ (lc->tu.chroma_mode_c == 4)));
++
++ if (lc->tu.cross_pf) {
++ hls_cross_component_pred(lc, 0);
++ }
++ for (i = 0; i < (ctx_cfmt(s) == 2 ? 2 : 1); i++) {
++ if (lc->cu.pred_mode == MODE_INTRA) {
++ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1);
++ }
++ if (cbf_cb[i])
++ ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0 + (i << log2_trafo_size_c),
++ log2_trafo_size_c, scan_idx_c, 1);
++ else
++ if (lc->tu.cross_pf) {
++ const ptrdiff_t stride = frame_stride1(s->frame, 1);
++ const int hshift = ctx_hshift(s, 1);
++ const int vshift = ctx_vshift(s, 1);
++ int16_t * const coeffs_y = (int16_t*)lc->edge_emu_buffer;
++ int16_t * const coeffs = (int16_t*)lc->edge_emu_buffer2;
++ int size = 1 << log2_trafo_size_c;
++
++ uint8_t *dst = &s->frame->data[1][(y0 >> vshift) * stride +
++ ((x0 >> hshift) << s->ps.sps->pixel_shift)];
++ for (i = 0; i < (size * size); i++) {
++ coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
++ }
++ s->hevcdsp.add_residual[log2_trafo_size_c-2](dst, coeffs, stride);
++ }
++ }
++
++ if (lc->tu.cross_pf) {
++ hls_cross_component_pred(lc, 1);
++ }
++ for (i = 0; i < (ctx_cfmt(s) == 2 ? 2 : 1); i++) {
++ if (lc->cu.pred_mode == MODE_INTRA) {
++ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2);
++ }
++ if (cbf_cr[i])
++ ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0 + (i << log2_trafo_size_c),
++ log2_trafo_size_c, scan_idx_c, 2);
++ else
++ if (lc->tu.cross_pf) {
++ ptrdiff_t stride = frame_stride1(s->frame, 2);
++ const int hshift = ctx_hshift(s, 2);
++ const int vshift = ctx_vshift(s, 2);
++ int16_t *coeffs_y = (int16_t*)lc->edge_emu_buffer;
++ int16_t *coeffs = (int16_t*)lc->edge_emu_buffer2;
++ const int size = 1 << log2_trafo_size_c;
++
++ uint8_t *dst = &s->frame->data[2][(y0 >> vshift) * stride +
++ ((x0 >> hshift) << s->ps.sps->pixel_shift)];
++ for (i = 0; i < (size * size); i++) {
++ coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
++ }
++ s->hevcdsp.add_residual[log2_trafo_size_c-2](dst, coeffs, stride);
++ }
++ }
++ } else if (ctx_cfmt(s) != 0 && blk_idx == 3) {
++ int trafo_size_h = 1 << (log2_trafo_size + 1);
++ int trafo_size_v = 1 << (log2_trafo_size + ctx_vshift(s, 1));
++ for (i = 0; i < (ctx_cfmt(s) == 2 ? 2 : 1); i++) {
++ if (lc->cu.pred_mode == MODE_INTRA) {
++ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase + (i << log2_trafo_size),
++ trafo_size_h, trafo_size_v);
++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1);
++ }
++ if (cbf_cb[i])
++ ff_hevc_rpi_hls_residual_coding(s, lc, xBase, yBase + (i << log2_trafo_size),
++ log2_trafo_size, scan_idx_c, 1);
++ }
++ for (i = 0; i < (ctx_cfmt(s) == 2 ? 2 : 1); i++) {
++ if (lc->cu.pred_mode == MODE_INTRA) {
++ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase + (i << log2_trafo_size),
++ trafo_size_h, trafo_size_v);
++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2);
++ }
++ if (cbf_cr[i])
++ ff_hevc_rpi_hls_residual_coding(s, lc, xBase, yBase + (i << log2_trafo_size),
++ log2_trafo_size, scan_idx_c, 2);
++ }
++ }
++ } else if (ctx_cfmt(s) != 0 && lc->cu.pred_mode == MODE_INTRA) {
++ if (log2_trafo_size > 2 || ctx_cfmt(s) == 3) {
++ int trafo_size_h = 1 << (log2_trafo_size_c + ctx_hshift(s, 1));
++ int trafo_size_v = 1 << (log2_trafo_size_c + ctx_vshift(s, 1));
++ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0, trafo_size_h, trafo_size_v);
++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0, 1);
++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0, 2);
++ if (ctx_cfmt(s) == 2) {
++ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0 + (1 << log2_trafo_size_c),
++ trafo_size_h, trafo_size_v);
++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1);
++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2);
++ }
++ } else if (blk_idx == 3) {
++ int trafo_size_h = 1 << (log2_trafo_size + 1);
++ int trafo_size_v = 1 << (log2_trafo_size + ctx_vshift(s, 1));
++ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase,
++ trafo_size_h, trafo_size_v);
++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase, 1);
++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase, 2);
++ if (ctx_cfmt(s) == 2) {
++ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase + (1 << (log2_trafo_size)),
++ trafo_size_h, trafo_size_v);
++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1);
++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2);
++ }
++ }
++ }
++
++ return 0;
++}
++
++static void set_deblocking_bypass(const HEVCRpiContext * const s, const int x0, const int y0, const int log2_cb_size)
++{
++ int cb_size = 1 << log2_cb_size;
++ int log2_min_pu_size = s->ps.sps->log2_min_pu_size;
++
++ int min_pu_width = s->ps.sps->min_pu_width;
++ int x_end = FFMIN(x0 + cb_size, s->ps.sps->width);
++ int y_end = FFMIN(y0 + cb_size, s->ps.sps->height);
++ int i, j;
++
++ for (j = (y0 >> log2_min_pu_size); j < (y_end >> log2_min_pu_size); j++)
++ for (i = (x0 >> log2_min_pu_size); i < (x_end >> log2_min_pu_size); i++)
++ s->is_pcm[i + j * min_pu_width] = 2;
++}
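++// Marks the area in is_pcm so deblocking can skip it; this path is used both
++// for transquant-bypass CUs and for PCM blocks with the loop filter disabled.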
++
++static int hls_transform_tree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0,
++ int xBase, int yBase, int cb_xBase, int cb_yBase,
++ int log2_cb_size, int log2_trafo_size,
++ int trafo_depth, int blk_idx,
++ const int *base_cbf_cb, const int *base_cbf_cr)
++{
++ uint8_t split_transform_flag;
++ int cbf_cb[2];
++ int cbf_cr[2];
++ int ret;
++
++ cbf_cb[0] = base_cbf_cb[0];
++ cbf_cb[1] = base_cbf_cb[1];
++ cbf_cr[0] = base_cbf_cr[0];
++ cbf_cr[1] = base_cbf_cr[1];
++
++ if (lc->cu.intra_split_flag) {
++ if (trafo_depth == 1) {
++ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[blk_idx];
++ if (ctx_cfmt(s) == 3) {
++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[blk_idx];
++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[blk_idx];
++ } else {
++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0];
++ }
++ }
++ } else {
++ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[0];
++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0];
++ }
++
++ if (log2_trafo_size <= s->ps.sps->log2_max_trafo_size &&
++ log2_trafo_size > s->ps.sps->log2_min_tb_size &&
++ trafo_depth < lc->cu.max_trafo_depth &&
++ !(lc->cu.intra_split_flag && trafo_depth == 0)) {
++ split_transform_flag = ff_hevc_rpi_split_transform_flag_decode(lc, log2_trafo_size);
++ } else {
++ int inter_split = s->ps.sps->max_transform_hierarchy_depth_inter == 0 &&
++ lc->cu.pred_mode == MODE_INTER &&
++ lc->cu.part_mode != PART_2Nx2N &&
++ trafo_depth == 0;
++
++ split_transform_flag = log2_trafo_size > s->ps.sps->log2_max_trafo_size ||
++ (lc->cu.intra_split_flag && trafo_depth == 0) ||
++ inter_split;
++ }
++
++ if (ctx_cfmt(s) != 0 && (log2_trafo_size > 2 || ctx_cfmt(s) == 3)) {
++ if (trafo_depth == 0 || cbf_cb[0]) {
++ cbf_cb[0] = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth);
++ if (ctx_cfmt(s) == 2 && (!split_transform_flag || log2_trafo_size == 3)) {
++ cbf_cb[1] = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth);
++ }
++ }
++
++ if (trafo_depth == 0 || cbf_cr[0]) {
++ cbf_cr[0] = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth);
++ if (ctx_cfmt(s) == 2 && (!split_transform_flag || log2_trafo_size == 3)) {
++ cbf_cr[1] = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth);
++ }
++ }
++ }
++
++ if (split_transform_flag) {
++ const int trafo_size_split = 1 << (log2_trafo_size - 1);
++ const int x1 = x0 + trafo_size_split;
++ const int y1 = y0 + trafo_size_split;
++
++#define SUBDIVIDE(x, y, idx) \
++do { \
++ ret = hls_transform_tree(s, lc, x, y, x0, y0, cb_xBase, cb_yBase, log2_cb_size, \
++ log2_trafo_size - 1, trafo_depth + 1, idx, \
++ cbf_cb, cbf_cr); \
++ if (ret < 0) \
++ return ret; \
++} while (0)
++
++ SUBDIVIDE(x0, y0, 0);
++ SUBDIVIDE(x1, y0, 1);
++ SUBDIVIDE(x0, y1, 2);
++ SUBDIVIDE(x1, y1, 3);
++
++#undef SUBDIVIDE
++ } else {
++ int min_tu_size = 1 << s->ps.sps->log2_min_tb_size;
++ int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
++ int min_tu_width = s->ps.sps->min_tb_width;
++ int cbf_luma = 1;
++
++ if (lc->cu.pred_mode == MODE_INTRA || trafo_depth != 0 ||
++ cbf_cb[0] || cbf_cr[0] ||
++ (ctx_cfmt(s) == 2 && (cbf_cb[1] || cbf_cr[1]))) {
++ cbf_luma = ff_hevc_rpi_cbf_luma_decode(lc, trafo_depth);
++ }
++
++ ret = hls_transform_unit(s, lc, x0, y0, xBase, yBase, cb_xBase, cb_yBase,
++ log2_cb_size, log2_trafo_size,
++ blk_idx, cbf_luma, cbf_cb, cbf_cr);
++ if (ret < 0)
++ return ret;
++ // TODO: store cbf_luma somewhere else
++ if (cbf_luma) {
++ int i, j;
++ for (i = 0; i < (1 << log2_trafo_size); i += min_tu_size)
++ for (j = 0; j < (1 << log2_trafo_size); j += min_tu_size) {
++ int x_tu = (x0 + j) >> log2_min_tu_size;
++ int y_tu = (y0 + i) >> log2_min_tu_size;
++ s->cbf_luma[y_tu * min_tu_width + x_tu] = 1;
++ }
++ }
++ if (!s->sh.disable_deblocking_filter_flag) {
++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_trafo_size);
++ if (s->ps.pps->transquant_bypass_enable_flag &&
++ lc->cu.cu_transquant_bypass_flag)
++ set_deblocking_bypass(s, x0, y0, log2_trafo_size);
++ }
++ }
++ return 0;
++}
++
++
++static int pcm_extract(const HEVCRpiContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size)
++{
++ GetBitContext gb;
++ int ret;
++
++ ret = init_get_bits(&gb, pcm, length);
++ if (ret < 0)
++ return ret;
++
++ s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0),
++ frame_stride1(s->frame, 0),
++ cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth);
++
++ s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> ctx_hshift(s, 1), y0 >> ctx_vshift(s, 1)),
++ s->frame->linesize[1],
++ cb_size >> ctx_hshift(s, 1),
++ cb_size >> ctx_vshift(s, 1),
++ &gb, s->ps.sps->pcm.bit_depth_chroma);
++
++ return 0;
++}
++
++
++// x * 2^(y*2)
++static inline unsigned int xyexp2(const unsigned int x, const unsigned int y)
++{
++ return x << (y * 2);
++}
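++// e.g. xyexp2(8, 3) == 8 << 6 == 512 - the number of bits in an 8x8 block
++// of 8-bit samples, as used for the PCM length calculation below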
++
++static int hls_pcm_sample(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, unsigned int log2_cb_size)
++{
++ // Length in bits
++ const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) +
++ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 1)) +
++ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 2));
++
++ const uint8_t * const pcm = skip_bytes(&lc->cc, (length + 7) >> 3);
++
++ if (!s->sh.disable_deblocking_filter_flag)
++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size);
++
++ // Copy coeffs
++ {
++ const int blen = (length + 7) >> 3;
++ // Round allocated bytes up to nearest 32 to avoid alignment confusion
++ // Allocation is in units of int16_t
++ // As we are only using 1 byte per sample and the coeff buffer allows 2 per
++ // sample this rounding doesn't affect the total size we need to allocate for
++ // the coeff buffer
++ int16_t * const coeffs = rpi_alloc_coeff_buf(lc->jb0, 0, ((blen + 31) & ~31) >> 1);
++ memcpy(coeffs, pcm, blen);
++
++ // Our coeff stash assumes that any partially allocated 64-byte lump
++ // is zeroed so make that true.
++ {
++ uint8_t * const eopcm = (uint8_t *)coeffs + blen;
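++ // (-(intptr_t)eopcm & 63) is the byte count up to the next 64-byte boundary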
++ if ((-(intptr_t)eopcm & 63) != 0)
++ memset(eopcm, 0, -(intptr_t)eopcm & 63);
++ }
++
++ // Add command
++ {
++ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0);
++ cmd->type = RPI_PRED_I_PCM;
++ cmd->size = log2_cb_size;
++ cmd->i_pcm.src = coeffs;
++ cmd->i_pcm.x = x0;
++ cmd->i_pcm.y = y0;
++ cmd->i_pcm.src_len = length;
++ }
++ return 0;
++ }
++}
++
++
++static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCFrame * const ref,
++ const Mv * const mv, const int y0, const int height)
++{
++ if (s->threads_type == FF_THREAD_FRAME) {
++ const int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9);
++
++ // Progress has to be attached to current job as the actual wait
++ // is in worker_core which can't use lc
++ int16_t *const pr = lc->jb0->progress_req + ref->dpb_no;
++ if (*pr < y) {
++ *pr = y;
++ }
++ }
++}
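++// The "+ 9" above presumably covers the worst-case sub-pel interpolation
++// read-ahead below the block (the 8-tap luma filter reads up to 4 rows
++// beyond it) plus a safety margin.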
++
++static void hevc_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const int x0, const int y0, const int nPbW,
++ const int nPbH, const int log2_cb_size, const int part_idx,
++ const int merge_idx, MvField * const mv)
++{
++ enum InterPredIdc inter_pred_idc = PRED_L0;
++ int mvp_flag;
++
++ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0, nPbW, nPbH);
++ mv->pred_flag = 0;
++ if (s->sh.slice_type == HEVC_SLICE_B)
++ inter_pred_idc = ff_hevc_rpi_inter_pred_idc_decode(lc, nPbW, nPbH);
++
++ if (inter_pred_idc != PRED_L1) {
++ if (s->sh.nb_refs[L0])
++ mv->ref_idx[0]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L0]);
++
++ mv->pred_flag = PF_L0;
++ ff_hevc_rpi_hls_mvd_coding(lc);
++ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc);
++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size,
++ part_idx, merge_idx, mv, mvp_flag, 0);
++ mv->mv[0].x += lc->pu.mvd.x;
++ mv->mv[0].y += lc->pu.mvd.y;
++ }
++
++ if (inter_pred_idc != PRED_L0) {
++ if (s->sh.nb_refs[L1])
++ mv->ref_idx[1]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L1]);
++
++ if (s->sh.mvd_l1_zero_flag == 1 && inter_pred_idc == PRED_BI) {
++ AV_ZERO32(&lc->pu.mvd);
++ } else {
++ ff_hevc_rpi_hls_mvd_coding(lc);
++ }
++
++ mv->pred_flag += PF_L1;
++ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc);
++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size,
++ part_idx, merge_idx, mv, mvp_flag, 1);
++ mv->mv[1].x += lc->pu.mvd.x;
++ mv->mv[1].y += lc->pu.mvd.y;
++ }
++}
++
++
++static HEVCRpiInterPredQ *
++rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn)
++{
++ HEVCRpiInterPredQ * yp = ipe->q + ipe->curr;
++ HEVCRpiInterPredQ * ypt = yp + 1;
++ for (unsigned int i = 1; i != ipe->n_grp; ++i, ++ypt) {
++ if (ypt->load < yp->load)
++ yp = ypt;
++ }
++
++ yp->load += load_val;
++ ipe->used_grp = 1;
++ yp->qpu_mc_curr->data[-1] = fn; // Link is always the last element of the previous cmd
++
++ return yp;
++}
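++// Note: the queue chosen above is the least loaded within the current group;
++// load_val is an approximate cost (roughly source rows) for the queued block.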
++
++
++static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe)
++{
++ for (unsigned int i = 0; i != ipe->n; ++i) {
++ HEVCRpiInterPredQ * const q = ipe->q + i;
++ const unsigned int qfill = (char *)q->qpu_mc_curr - (char *)q->qpu_mc_base;
++
++ q->qpu_mc_curr->data[-1] = q->code_sync;
++ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(q->qpu_mc_curr->data + 1);
++ q->load = (qfill >> 7); // Have a mild preference for emptier Qs to balance memory usage
++ }
++}
++
++// Returns 0 on success, -1 if Q is dangerously full
++static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe)
++{
++ if (!ipe->used_grp)
++ return 0;
++
++ if ((ipe->curr += ipe->n_grp) >= ipe->n)
++ {
++ ipe->curr = 0;
++ rpi_inter_pred_sync(ipe);
++ }
++ ipe->used = 1;
++ ipe->used_grp = 0;
++
++ for (unsigned int i = 0; i != ipe->n_grp; ++i) {
++ HEVCRpiInterPredQ * const q = ipe->q + i + ipe->curr;
++ if ((char *)q->qpu_mc_curr - (char *)q->qpu_mc_base > ipe->max_fill) {
++ return -1;
++ }
++ }
++ return 0;
++}
++
++static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe)
++{
++ unsigned int i;
++
++ ipe->curr = 0;
++ ipe->used = 0;
++ ipe->used_grp = 0;
++ for (i = 0; i != ipe->n; ++i) {
++ HEVCRpiInterPredQ * const q = ipe->q + i;
++ q->qpu_mc_curr = q->qpu_mc_base;
++ q->load = 0;
++ q->last_l0 = NULL;
++ q->last_l1 = NULL;
++ }
++}
++
++static void rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe,
++ const unsigned int n_max, const unsigned int n_grp,
++ const unsigned int total_size, const unsigned int min_gap)
++{
++ memset(ipe, 0, sizeof(*ipe));
++ av_assert0((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) != NULL);
++ ipe->n_grp = n_grp;
++ ipe->min_gap = min_gap;
++
++ gpu_malloc_cached(total_size, &ipe->gptr);
++}
++
++
++#if RPI_QPU_EMU_Y
++#define get_mc_address_y(f) ((f)->data[0])
++#else
++#define get_mc_address_y(f) get_vc_address_y(f)
++#endif
++#if RPI_QPU_EMU_C
++#define get_mc_address_u(f) ((f)->data[1])
++#else
++#define get_mc_address_u(f) get_vc_address_u(f)
++#endif
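++// In QPU-emulation builds the ARM-side frame pointers are used directly;
++// otherwise the VideoCore bus addresses are handed to the QPU.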
++
++static inline int offset_depth_adj(const HEVCRpiContext *const s, const int wt)
++{
++ return s->ps.sps->high_precision_offsets_enabled_flag ? wt :
++ wt << (s->ps.sps->bit_depth - 8);
++}
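++// e.g. at bit_depth == 10 with high_precision_offsets disabled, an offset
++// of 3 becomes 3 << 2 == 12, scaling it up to full sample precision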
++
++static void
++rpi_pred_y(const HEVCRpiContext *const s, HEVCRpiJob * const jb,
++ const int x0, const int y0,
++ const int nPbW, const int nPbH,
++ const Mv *const mv,
++ const int weight_mul,
++ const int weight_offset,
++ AVFrame *const src_frame)
++{
++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
++ const unsigned int mx = mv->x & 3;
++ const unsigned int my = mv->y & 3;
++ const unsigned int my_mx = (my << 8) | mx;
++ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx;
++ const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame);
++ qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off;
++ const uint32_t wo = PACK2(offset_depth_adj(s, weight_offset) * 2 + 1, weight_mul);
++ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip;
++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
++
++ if (my_mx == 0)
++ {
++ const int x1 = x0 + (mv->x >> 2);
++ const int y1 = y0 + (mv->y >> 2);
++ const int bh = nPbH;
++
++ for (int start_x = 0; start_x < nPbW; start_x += 16)
++ {
++ const int bw = FFMIN(nPbW - start_x, 16);
++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00);
++ qpu_mc_src_t *const src1 = yp->last_l0;
++ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00;
++
++#if RPI_TSTATS
++ {
++ HEVCRpiStats *const ts = &s->tstats;
++ ++ts->y_pred1_x0y0;
++
++ if (nPbW > 8)
++ ++ts->y_pred1_wgt8;
++ else
++ ++ts->y_pred1_wle8;
++
++ if (nPbH > 16)
++ ++ts->y_pred1_hgt16;
++ else
++ ++ts->y_pred1_hle16;
++ }
++#endif
++
++ src1->x = x1 + start_x;
++ src1->y = y1;
++ src1->base = src_vc_address_y;
++ cmd_y->w = bw;
++ cmd_y->h = bh;
++ cmd_y->wo1 = wo;
++ cmd_y->dst_addr = dst_addr + (start_x << xshl);
++ yp->last_l0 = &cmd_y->next_src1;
++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
++ }
++ }
++ else
++ {
++ const int x1_m3 = x0 + (mv->x >> 2) - 3;
++ const int y1_m3 = y0 + (mv->y >> 2) - 3;
++ const unsigned int bh = nPbH;
++ int start_x = 0;
++
++#if 1
++ // As Y-pred operates on two independent 8-wide src blocks we can merge
++ // this pred with the previous one if the previous one is 8 pel wide,
++ // the same height as the current block, immediately to the left of our
++ // current dest block and mono-pred.
++
++ qpu_mc_pred_y_p_t *const last_y8_p = jb->last_y8_p;
++ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr)
++ {
++ const int bw = FFMIN(nPbW, 8);
++ qpu_mc_src_t *const last_y8_src2 = jb->last_y8_l1;
++
++ last_y8_src2->x = x1_m3;
++ last_y8_src2->y = y1_m3;
++ last_y8_src2->base = src_vc_address_y;
++ last_y8_p->w += bw;
++ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21);
++ last_y8_p->wo2 = wo;
++
++ jb->last_y8_p = NULL;
++ jb->last_y8_l1 = NULL;
++ start_x = bw;
++#if RPI_TSTATS
++ ++s->tstats.y_pred1_y8_merge;
++#endif
++ }
++#endif
++
++ for (; start_x < nPbW; start_x += 16)
++ {
++ const int bw = FFMIN(nPbW - start_x, 16);
++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx);
++ qpu_mc_src_t *const src1 = yp->last_l0;
++ qpu_mc_src_t *const src2 = yp->last_l1;
++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
++#if RPI_TSTATS
++ {
++ HEVCRpiStats *const ts = &s->tstats;
++ if (mx == 0 && my == 0)
++ ++ts->y_pred1_x0y0;
++ else if (mx == 0)
++ ++ts->y_pred1_x0;
++ else if (my == 0)
++ ++ts->y_pred1_y0;
++ else
++ ++ts->y_pred1_xy;
++
++ if (nPbW > 8)
++ ++ts->y_pred1_wgt8;
++ else
++ ++ts->y_pred1_wle8;
++
++ if (nPbH > 16)
++ ++ts->y_pred1_hgt16;
++ else
++ ++ts->y_pred1_hle16;
++ }
++#endif
++ src1->x = x1_m3 + start_x;
++ src1->y = y1_m3;
++ src1->base = src_vc_address_y;
++ if (bw <= 8)
++ {
++ src2->x = MC_DUMMY_X;
++ src2->y = MC_DUMMY_Y;
++#if RPI_QPU_EMU_Y
++ src2->base = s->qpu_dummy_frame_emu;
++#else
++ src2->base = s->qpu_dummy_frame_qpu;
++#endif
++ }
++ else
++ {
++ src2->x = x1_m3 + start_x + 8;
++ src2->y = y1_m3;
++ src2->base = src_vc_address_y;
++ }
++ cmd_y->w = bw;
++ cmd_y->h = bh;
++ cmd_y->mymx21 = my2_mx2_my_mx;
++ cmd_y->wo1 = wo;
++ cmd_y->wo2 = wo;
++ cmd_y->dst_addr = dst_addr + (start_x << xshl);
++ yp->last_l0 = &cmd_y->next_src1;
++ yp->last_l1 = &cmd_y->next_src2;
++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
++
++ if (bw == 8) {
++ jb->last_y8_l1 = src2;
++ jb->last_y8_p = cmd_y;
++ }
++ }
++ }
++}
++
++static void
++rpi_pred_y_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++ const int x0, const int y0,
++ const int nPbW, const int nPbH,
++ const struct MvField *const mv_field,
++ const AVFrame *const src_frame,
++ const AVFrame *const src_frame2)
++{
++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
++ const Mv * const mv = mv_field->mv + 0;
++ const Mv * const mv2 = mv_field->mv + 1;
++
++ const unsigned int mx = mv->x & 3;
++ const unsigned int my = mv->y & 3;
++ const unsigned int my_mx = (my<<8) | mx;
++ const unsigned int mx2 = mv2->x & 3;
++ const unsigned int my2 = mv2->y & 3;
++ const unsigned int my2_mx2 = (my2<<8) | mx2;
++ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx;
++ const unsigned int ref_idx0 = mv_field->ref_idx[0];
++ const unsigned int ref_idx1 = mv_field->ref_idx[1];
++ const uint32_t wt_offset =
++ offset_depth_adj(s, s->sh.luma_offset_l0[ref_idx0] + s->sh.luma_offset_l1[ref_idx1]) + 1;
++ const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]);
++ const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]);
++
++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
++ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off;
++ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame);
++ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2);
++ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip;
++
++ if (my2_mx2_my_mx == 0)
++ {
++ const int x1 = x0 + (mv->x >> 2);
++ const int y1 = y0 + (mv->y >> 2);
++ const int x2 = x0 + (mv2->x >> 2);
++ const int y2 = y0 + (mv2->y >> 2);
++ const int bh = nPbH;
++
++ // Can do chunks a full 16 wide if we don't want the H filter
++ for (int start_x=0; start_x < nPbW; start_x += 16)
++ {
++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00);
++ qpu_mc_src_t *const src1 = yp->last_l0;
++ qpu_mc_src_t *const src2 = yp->last_l1;
++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
++#if RPI_TSTATS
++ {
++ HEVCRpiStats *const ts = &s->tstats;
++ ++ts->y_pred2_x0y0;
++
++ if (nPbH > 16)
++ ++ts->y_pred2_hgt16;
++ else
++ ++ts->y_pred2_hle16;
++ }
++#endif
++ src1->x = x1 + start_x;
++ src1->y = y1;
++ src1->base = src1_base;
++ src2->x = x2 + start_x;
++ src2->y = y2;
++ src2->base = src2_base;
++ cmd_y->w = FFMIN(nPbW - start_x, 16);
++ cmd_y->h = bh;
++ cmd_y->mymx21 = 0;
++ cmd_y->wo1 = wo1;
++ cmd_y->wo2 = wo2;
++ cmd_y->dst_addr = dst + (start_x << xshl);
++ yp->last_l0 = &cmd_y->next_src1;
++ yp->last_l1 = &cmd_y->next_src2;
++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
++ }
++ }
++ else
++ {
++ // Filter requires a run-up of 3
++ const int x1 = x0 + (mv->x >> 2) - 3;
++ const int y1 = y0 + (mv->y >> 2) - 3;
++ const int x2 = x0 + (mv2->x >> 2) - 3;
++ const int y2 = y0 + (mv2->y >> 2) - 3;
++ const int bh = nPbH;
++
++ for (int start_x=0; start_x < nPbW; start_x += 8)
++ { // B blocks work 8 at a time
++ // B weights aren't doubled as the QPU code does the same
++ // amount of work as it does for P
++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx);
++ qpu_mc_src_t *const src1 = yp->last_l0;
++ qpu_mc_src_t *const src2 = yp->last_l1;
++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
++#if RPI_TSTATS
++ {
++ HEVCRpiStats *const ts = &s->tstats;
++ const unsigned int mmx = mx | mx2;
++ const unsigned int mmy = my | my2;
++ if (mmx == 0 && mmy == 0)
++ ++ts->y_pred2_x0y0;
++ else if (mmx == 0)
++ ++ts->y_pred2_x0;
++ else if (mmy == 0)
++ ++ts->y_pred2_y0;
++ else
++ ++ts->y_pred2_xy;
++
++ if (nPbH > 16)
++ ++ts->y_pred2_hgt16;
++ else
++ ++ts->y_pred2_hle16;
++ }
++#endif
++ src1->x = x1 + start_x;
++ src1->y = y1;
++ src1->base = src1_base;
++ src2->x = x2 + start_x;
++ src2->y = y2;
++ src2->base = src2_base;
++ cmd_y->w = FFMIN(nPbW - start_x, 8);
++ cmd_y->h = bh;
++ cmd_y->mymx21 = my2_mx2_my_mx;
++ cmd_y->wo1 = wo1;
++ cmd_y->wo2 = wo2;
++ cmd_y->dst_addr = dst + (start_x << xshl);
++ yp->last_l0 = &cmd_y->next_src1;
++ yp->last_l1 = &cmd_y->next_src2;
++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
++ }
++ }
++}
++
++// h/v shifts fixed at one as that is all the qasm copes with
++static void
++rpi_pred_c(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++ const unsigned int lx, const int x0_c, const int y0_c,
++ const int nPbW_c, const int nPbH_c,
++ const Mv * const mv,
++ const int16_t * const c_weights,
++ const int16_t * const c_offsets,
++ AVFrame * const src_frame)
++{
++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
++ const int hshift = 1; // = s->ps.sps->hshift[1];
++ const int vshift = 1; // = s->ps.sps->vshift[1];
++
++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1;
++ const int y1_c = y0_c + (mv->y >> (2 + vshift)) - 1;
++ const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame);
++ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)];
++ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)];
++ const uint32_t wo_u = PACK2(offset_depth_adj(s, c_offsets[0]) * 2 + 1, c_weights[0]);
++ const uint32_t wo_v = PACK2(offset_depth_adj(s, c_offsets[1]) * 2 + 1, c_weights[1]);
++ qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
++ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip;
++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1;
++ const unsigned int bh = nPbH_c;
++ const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1;
++
++ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH)
++ {
++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn);
++ qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p;
++ qpu_mc_src_t ** const plast_lx = (lx == 0) ? &cp->last_l0 : &cp->last_l1;
++ qpu_mc_src_t * const last_lx = *plast_lx;
++ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
++
++ last_lx->x = x1_c + start_x;
++ last_lx->y = y1_c;
++ last_lx->base = src_base_u;
++ cmd_c->h = bh;
++ cmd_c->w = bw;
++ cmd_c->coeffs_x = x_coeffs;
++ cmd_c->coeffs_y = y_coeffs;
++ cmd_c->wo_u = wo_u;
++ cmd_c->wo_v = wo_v;
++ cmd_c->dst_addr_c = dst_base_u + (start_x << xshl);
++ *plast_lx = &cmd_c->next_src;
++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1);
++ }
++}
++
++// h/v shifts fixed at one as that is all the qasm copes with
++static void
++rpi_pred_c_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++ const int x0_c, const int y0_c,
++ const int nPbW_c, const int nPbH_c,
++ const struct MvField * const mv_field,
++ const int16_t * const c_weights,
++ const int16_t * const c_offsets,
++ const int16_t * const c_weights2,
++ const int16_t * const c_offsets2,
++ AVFrame * const src_frame,
++ AVFrame * const src_frame2)
++{
++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
++ const int hshift = 1; // s->ps.sps->hshift[1];
++ const int vshift = 1; // s->ps.sps->vshift[1];
++ const Mv * const mv = mv_field->mv + 0;
++ const Mv * const mv2 = mv_field->mv + 1;
++
++ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift);
++ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift);
++ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)];
++ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector
++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1;
++ const int y1_c = y0_c + (mv->y >> (2 + vshift)) - 1;
++
++ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift);
++ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift);
++ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)];
++ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector
++
++ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1;
++ const int y2_c = y0_c + (mv2->y >> (2 + vshift)) - 1;
++
++ const uint32_t wo_u2 = PACK2(offset_depth_adj(s, c_offsets[0] + c_offsets2[0]) + 1, c_weights2[0]);
++ const uint32_t wo_v2 = PACK2(offset_depth_adj(s, c_offsets[1] + c_offsets2[1]) + 1, c_weights2[1]);
++
++ const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
++ const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame);
++ const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2);
++ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip;
++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1;
++ const unsigned int bh = nPbH_c;
++
++ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH)
++ {
++ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
++
++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx);
++ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b;
++ qpu_mc_src_t * const src_l0 = cp->last_l0;
++ qpu_mc_src_t * const src_l1 = cp->last_l1;
++
++ src_l0->x = x1_c + start_x;
++ src_l0->y = y1_c;
++ src_l0->base = src1_base;
++ src_l1->x = x2_c + start_x;
++ src_l1->y = y2_c;
++ src_l1->base = src2_base;
++
++ u[0].h = bh;
++ u[0].w = bw;
++ u[0].coeffs_x1 = coefs0_x;
++ u[0].coeffs_y1 = coefs0_y;
++ u[0].weight_u1 = c_weights[0]; // Weight L0 U
++ u[0].weight_v1 = c_weights[1]; // Weight L0 V
++ u[0].coeffs_x2 = coefs1_x;
++ u[0].coeffs_y2 = coefs1_y;
++ u[0].wo_u2 = wo_u2;
++ u[0].wo_v2 = wo_v2;
++ u[0].dst_addr_c = dst_base_u + (start_x << xshl);
++
++ cp->last_l0 = &u[0].next_src1;
++ cp->last_l1 = &u[0].next_src2;
++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1);
++ }
++}
++
++
++static void hls_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const int x0, const int y0,
++ const int nPbW, const int nPbH,
++ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
++{
++ HEVCRpiJob * const jb = lc->jb0;
++
++ int merge_idx = 0;
++ struct MvField current_mv = {{{ 0 }}};
++
++ int min_pu_width = s->ps.sps->min_pu_width;
++
++ MvField * const tab_mvf = s->ref->tab_mvf;
++ const RefPicList *const refPicList = s->ref->refPicList;
++ const HEVCFrame *ref0 = NULL, *ref1 = NULL;
++ int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
++ int min_cb_width = s->ps.sps->min_cb_width;
++ int x_cb = x0 >> log2_min_cb_size;
++ int y_cb = y0 >> log2_min_cb_size;
++ int x_pu, y_pu;
++ int i, j;
++ const int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
++
++ if (!skip_flag)
++ lc->pu.merge_flag = ff_hevc_rpi_merge_flag_decode(lc);
++
++ if (skip_flag || lc->pu.merge_flag) {
++ if (s->sh.max_num_merge_cand > 1)
++ merge_idx = ff_hevc_rpi_merge_idx_decode(s, lc);
++ else
++ merge_idx = 0;
++
++ ff_hevc_rpi_luma_mv_merge_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size,
++ partIdx, merge_idx, &current_mv);
++ } else {
++ hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size,
++ partIdx, merge_idx, &current_mv);
++ }
++
++ x_pu = x0 >> s->ps.sps->log2_min_pu_size;
++ y_pu = y0 >> s->ps.sps->log2_min_pu_size;
++
++ for (j = 0; j < nPbH >> s->ps.sps->log2_min_pu_size; j++)
++ for (i = 0; i < nPbW >> s->ps.sps->log2_min_pu_size; i++)
++ tab_mvf[(y_pu + j) * min_pu_width + x_pu + i] = current_mv;
++
++ if (current_mv.pred_flag & PF_L0) {
++ ref0 = refPicList[0].ref[current_mv.ref_idx[0]];
++ if (!ref0)
++ return;
++ hevc_await_progress(s, lc, ref0, &current_mv.mv[0], y0, nPbH);
++ }
++ if (current_mv.pred_flag & PF_L1) {
++ ref1 = refPicList[1].ref[current_mv.ref_idx[1]];
++ if (!ref1)
++ return;
++ hevc_await_progress(s, lc, ref1, &current_mv.mv[1], y0, nPbH);
++ }
++
++ if (current_mv.pred_flag == PF_L0) {
++ const int x0_c = x0 >> ctx_hshift(s, 1);
++ const int y0_c = y0 >> ctx_vshift(s, 1);
++ const int nPbW_c = nPbW >> ctx_hshift(s, 1);
++ const int nPbH_c = nPbH >> ctx_vshift(s, 1);
++
++ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.mv + 0,
++ s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]],
++ ref0->frame);
++
++ if (ctx_cfmt(s) != 0) {
++ rpi_pred_c(s, jb, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0,
++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
++ ref0->frame);
++ return;
++ }
++ } else if (current_mv.pred_flag == PF_L1) {
++ const int x0_c = x0 >> ctx_hshift(s, 1);
++ const int y0_c = y0 >> ctx_vshift(s, 1);
++ const int nPbW_c = nPbW >> ctx_hshift(s, 1);
++ const int nPbH_c = nPbH >> ctx_vshift(s, 1);
++
++ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.mv + 1,
++ s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]],
++ ref1->frame);
++
++ if (ctx_cfmt(s) != 0) {
++ rpi_pred_c(s, jb, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1,
++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
++ ref1->frame);
++ return;
++ }
++ } else if (current_mv.pred_flag == PF_BI) {
++ const int x0_c = x0 >> ctx_hshift(s, 1);
++ const int y0_c = y0 >> ctx_vshift(s, 1);
++ const int nPbW_c = nPbW >> ctx_hshift(s, 1);
++ const int nPbH_c = nPbH >> ctx_vshift(s, 1);
++
++ rpi_pred_y_b(s, jb, x0, y0, nPbW, nPbH, &current_mv, ref0->frame, ref1->frame);
++
++ if (ctx_cfmt(s) != 0) {
++ rpi_pred_c_b(s, jb, x0_c, y0_c, nPbW_c, nPbH_c,
++ &current_mv,
++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]],
++ s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]],
++ s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
++ ref0->frame,
++ ref1->frame);
++ return;
++ }
++ }
++}
++
++/**
++ * 8.4.1
++ */
++static int luma_intra_pred_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int pu_size,
++ int prev_intra_luma_pred_flag)
++{
++ int x_pu = x0 >> s->ps.sps->log2_min_pu_size;
++ int y_pu = y0 >> s->ps.sps->log2_min_pu_size;
++ int min_pu_width = s->ps.sps->min_pu_width;
++ int size_in_pus = pu_size >> s->ps.sps->log2_min_pu_size;
++ int x0b = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size);
++ int y0b = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size);
++
++ int cand_up = (lc->ctb_up_flag || y0b) ?
++ s->tab_ipm[(y_pu - 1) * min_pu_width + x_pu] : INTRA_DC;
++ int cand_left = (lc->ctb_left_flag || x0b) ?
++ s->tab_ipm[y_pu * min_pu_width + x_pu - 1] : INTRA_DC;
++
++ int y_ctb = (y0 >> (s->ps.sps->log2_ctb_size)) << (s->ps.sps->log2_ctb_size);
++
++ MvField *tab_mvf = s->ref->tab_mvf;
++ int intra_pred_mode;
++ int candidate[3];
++ int i, j;
++
++ // intra_pred_mode prediction does not cross vertical CTB boundaries
++ if ((y0 - 1) < y_ctb)
++ cand_up = INTRA_DC;
++
++ if (cand_left == cand_up) {
++ if (cand_left < 2) {
++ candidate[0] = INTRA_PLANAR;
++ candidate[1] = INTRA_DC;
++ candidate[2] = INTRA_ANGULAR_26;
++ } else {
++ candidate[0] = cand_left;
++ candidate[1] = 2 + ((cand_left - 2 - 1 + 32) & 31);
++ candidate[2] = 2 + ((cand_left - 2 + 1) & 31);
++ }
++ } else {
++ candidate[0] = cand_left;
++ candidate[1] = cand_up;
++ if (candidate[0] != INTRA_PLANAR && candidate[1] != INTRA_PLANAR) {
++ candidate[2] = INTRA_PLANAR;
++ } else if (candidate[0] != INTRA_DC && candidate[1] != INTRA_DC) {
++ candidate[2] = INTRA_DC;
++ } else {
++ candidate[2] = INTRA_ANGULAR_26;
++ }
++ }
++
++ if (prev_intra_luma_pred_flag) {
++ intra_pred_mode = candidate[lc->pu.mpm_idx];
++ } else {
++ if (candidate[0] > candidate[1])
++ FFSWAP(uint8_t, candidate[0], candidate[1]);
++ if (candidate[0] > candidate[2])
++ FFSWAP(uint8_t, candidate[0], candidate[2]);
++ if (candidate[1] > candidate[2])
++ FFSWAP(uint8_t, candidate[1], candidate[2]);
++
++ intra_pred_mode = lc->pu.rem_intra_luma_pred_mode;
++ for (i = 0; i < 3; i++)
++ if (intra_pred_mode >= candidate[i])
++ intra_pred_mode++;
++ }
++
++ /* write the intra prediction units into the mv array */
++ if (!size_in_pus)
++ size_in_pus = 1;
++ for (i = 0; i < size_in_pus; i++) {
++ memset(&s->tab_ipm[(y_pu + i) * min_pu_width + x_pu],
++ intra_pred_mode, size_in_pus);
++
++ for (j = 0; j < size_in_pus; j++) {
++ tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].pred_flag = PF_INTRA;
++ }
++ }
++
++ return intra_pred_mode;
++}
++
++static av_always_inline void set_ct_depth(const HEVCRpiContext * const s, int x0, int y0,
++ int log2_cb_size, int ct_depth)
++{
++ int length = (1 << log2_cb_size) >> s->ps.sps->log2_min_cb_size;
++ int x_cb = x0 >> s->ps.sps->log2_min_cb_size;
++ int y_cb = y0 >> s->ps.sps->log2_min_cb_size;
++ int y;
++
++ for (y = 0; y < length; y++)
++ memset(&s->tab_ct_depth[(y_cb + y) * s->ps.sps->min_cb_width + x_cb],
++ ct_depth, length);
++}
++
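++// Maps a 4:4:4 intra chroma mode onto the reduced mode set used for 4:2:2
++// (the chroma mode derivation table from the HEVC spec)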
++static const uint8_t tab_mode_idx[] = {
++ 0, 1, 2, 2, 2, 2, 3, 5, 7, 8, 10, 12, 13, 15, 17, 18, 19, 20,
++ 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31};
++
++static void intra_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0,
++ const int log2_cb_size)
++{
++ static const uint8_t intra_chroma_table[4] = { 0, 26, 10, 1 };
++ uint8_t prev_intra_luma_pred_flag[4];
++ int split = lc->cu.part_mode == PART_NxN;
++ int pb_size = (1 << log2_cb_size) >> split;
++ int side = split + 1;
++ int chroma_mode;
++ int i, j;
++
++ for (i = 0; i < side; i++)
++ for (j = 0; j < side; j++)
++ prev_intra_luma_pred_flag[2 * i + j] = ff_hevc_rpi_prev_intra_luma_pred_flag_decode(lc);
++
++ for (i = 0; i < side; i++) {
++ for (j = 0; j < side; j++) {
++ if (prev_intra_luma_pred_flag[2 * i + j])
++ lc->pu.mpm_idx = ff_hevc_rpi_mpm_idx_decode(lc);
++ else
++ lc->pu.rem_intra_luma_pred_mode = ff_hevc_rpi_rem_intra_luma_pred_mode_decode(lc);
++
++ lc->pu.intra_pred_mode[2 * i + j] =
++ luma_intra_pred_mode(s, lc, x0 + pb_size * j, y0 + pb_size * i, pb_size,
++ prev_intra_luma_pred_flag[2 * i + j]);
++ }
++ }
++
++ if (ctx_cfmt(s) == 3) {
++ for (i = 0; i < side; i++) {
++ for (j = 0; j < side; j++) {
++ lc->pu.chroma_mode_c[2 * i + j] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
++ if (chroma_mode != 4) {
++ if (lc->pu.intra_pred_mode[2 * i + j] == intra_chroma_table[chroma_mode])
++ lc->pu.intra_pred_mode_c[2 * i + j] = 34;
++ else
++ lc->pu.intra_pred_mode_c[2 * i + j] = intra_chroma_table[chroma_mode];
++ } else {
++ lc->pu.intra_pred_mode_c[2 * i + j] = lc->pu.intra_pred_mode[2 * i + j];
++ }
++ }
++ }
++ } else if (ctx_cfmt(s) == 2) {
++ int mode_idx;
++ lc->pu.chroma_mode_c[0] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
++ if (chroma_mode != 4) {
++ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
++ mode_idx = 34;
++ else
++ mode_idx = intra_chroma_table[chroma_mode];
++ } else {
++ mode_idx = lc->pu.intra_pred_mode[0];
++ }
++ lc->pu.intra_pred_mode_c[0] = tab_mode_idx[mode_idx];
++ } else if (ctx_cfmt(s) != 0) {
++ chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
++ if (chroma_mode != 4) {
++ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
++ lc->pu.intra_pred_mode_c[0] = 34;
++ else
++ lc->pu.intra_pred_mode_c[0] = intra_chroma_table[chroma_mode];
++ } else {
++ lc->pu.intra_pred_mode_c[0] = lc->pu.intra_pred_mode[0];
++ }
++ }
++}
++
++static void intra_prediction_unit_default_value(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ int x0, int y0,
++ int log2_cb_size)
++{
++ int pb_size = 1 << log2_cb_size;
++ int size_in_pus = pb_size >> s->ps.sps->log2_min_pu_size;
++ int min_pu_width = s->ps.sps->min_pu_width;
++ MvField *tab_mvf = s->ref->tab_mvf;
++ int x_pu = x0 >> s->ps.sps->log2_min_pu_size;
++ int y_pu = y0 >> s->ps.sps->log2_min_pu_size;
++ int j, k;
++
++ if (size_in_pus == 0)
++ size_in_pus = 1;
++ for (j = 0; j < size_in_pus; j++)
++ memset(&s->tab_ipm[(y_pu + j) * min_pu_width + x_pu], INTRA_DC, size_in_pus);
++ if (lc->cu.pred_mode == MODE_INTRA)
++ for (j = 0; j < size_in_pus; j++)
++ for (k = 0; k < size_in_pus; k++)
++ tab_mvf[(y_pu + j) * min_pu_width + x_pu + k].pred_flag = PF_INTRA;
++}
++
++static int hls_coding_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int log2_cb_size)
++{
++ int cb_size = 1 << log2_cb_size;
++ int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
++ int length = cb_size >> log2_min_cb_size;
++ int min_cb_width = s->ps.sps->min_cb_width;
++ int x_cb = x0 >> log2_min_cb_size;
++ int y_cb = y0 >> log2_min_cb_size;
++ int idx = log2_cb_size - 2;
++ int qp_block_mask = (1<<(s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_qp_delta_depth)) - 1;
++ int x, y, ret;
++
++ lc->cu.x = x0;
++ lc->cu.y = y0;
++ lc->cu.pred_mode = MODE_INTRA;
++ lc->cu.part_mode = PART_2Nx2N;
++ lc->cu.intra_split_flag = 0;
++
++ SAMPLE_CTB(s->skip_flag, x_cb, y_cb) = 0;
++ for (x = 0; x < 4; x++)
++ lc->pu.intra_pred_mode[x] = 1;
++ if (s->ps.pps->transquant_bypass_enable_flag) {
++ lc->cu.cu_transquant_bypass_flag = ff_hevc_rpi_cu_transquant_bypass_flag_decode(lc);
++ if (lc->cu.cu_transquant_bypass_flag)
++ set_deblocking_bypass(s, x0, y0, log2_cb_size);
++ } else
++ lc->cu.cu_transquant_bypass_flag = 0;
++
++ if (s->sh.slice_type != HEVC_SLICE_I) {
++ uint8_t skip_flag = ff_hevc_rpi_skip_flag_decode(s, lc, x0, y0, x_cb, y_cb);
++
++ x = y_cb * min_cb_width + x_cb;
++ for (y = 0; y < length; y++) {
++ memset(&s->skip_flag[x], skip_flag, length);
++ x += min_cb_width;
++ }
++ lc->cu.pred_mode = skip_flag ? MODE_SKIP : MODE_INTER;
++ } else {
++ x = y_cb * min_cb_width + x_cb;
++ for (y = 0; y < length; y++) {
++ memset(&s->skip_flag[x], 0, length);
++ x += min_cb_width;
++ }
++ }
++
++ if (SAMPLE_CTB(s->skip_flag, x_cb, y_cb)) {
++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
++
++ if (!s->sh.disable_deblocking_filter_flag)
++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size);
++ } else {
++ int pcm_flag = 0;
++
++ if (s->sh.slice_type != HEVC_SLICE_I)
++ lc->cu.pred_mode = ff_hevc_rpi_pred_mode_decode(lc);
++ if (lc->cu.pred_mode != MODE_INTRA ||
++ log2_cb_size == s->ps.sps->log2_min_cb_size) {
++ lc->cu.part_mode = ff_hevc_rpi_part_mode_decode(s, lc, log2_cb_size);
++ lc->cu.intra_split_flag = lc->cu.part_mode == PART_NxN &&
++ lc->cu.pred_mode == MODE_INTRA;
++ }
++
++ if (lc->cu.pred_mode == MODE_INTRA) {
++ if (lc->cu.part_mode == PART_2Nx2N && s->ps.sps->pcm_enabled_flag &&
++ log2_cb_size >= s->ps.sps->pcm.log2_min_pcm_cb_size &&
++ log2_cb_size <= s->ps.sps->pcm.log2_max_pcm_cb_size) {
++ pcm_flag = ff_hevc_rpi_pcm_flag_decode(lc);
++ }
++ if (pcm_flag) {
++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
++ ret = hls_pcm_sample(s, lc, x0, y0, log2_cb_size);
++ if (s->ps.sps->pcm.loop_filter_disable_flag)
++ {
++ set_deblocking_bypass(s, x0, y0, log2_cb_size);
++ }
++
++ if (ret < 0)
++ return ret;
++ } else {
++ intra_prediction_unit(s, lc, x0, y0, log2_cb_size);
++ }
++ } else {
++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
++ switch (lc->cu.part_mode) {
++ case PART_2Nx2N:
++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
++ break;
++ case PART_2NxN:
++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 2, log2_cb_size, 0, idx);
++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1, idx);
++ break;
++ case PART_Nx2N:
++ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size, log2_cb_size, 0, idx - 1);
++ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1, idx - 1);
++ break;
++ case PART_2NxnU:
++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4, log2_cb_size, 0, idx);
++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4, cb_size, cb_size * 3 / 4, log2_cb_size, 1, idx);
++ break;
++ case PART_2NxnD:
++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size * 3 / 4, log2_cb_size, 0, idx);
++ hls_prediction_unit(s, lc, x0, y0 + cb_size * 3 / 4, cb_size, cb_size / 4, log2_cb_size, 1, idx);
++ break;
++ case PART_nLx2N:
++ hls_prediction_unit(s, lc, x0, y0, cb_size / 4, cb_size, log2_cb_size, 0, idx - 2);
++ hls_prediction_unit(s, lc, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1, idx - 2);
++ break;
++ case PART_nRx2N:
++ hls_prediction_unit(s, lc, x0, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 0, idx - 2);
++ hls_prediction_unit(s, lc, x0 + cb_size * 3 / 4, y0, cb_size / 4, cb_size, log2_cb_size, 1, idx - 2);
++ break;
++ case PART_NxN:
++ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size / 2, log2_cb_size, 0, idx - 1);
++ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size / 2, log2_cb_size, 1, idx - 1);
++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2, idx - 1);
++ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3, idx - 1);
++ break;
++ }
++ }
++
++ if (!pcm_flag) {
++ int rqt_root_cbf = 1;
++
++ if (lc->cu.pred_mode != MODE_INTRA &&
++ !(lc->cu.part_mode == PART_2Nx2N && lc->pu.merge_flag)) {
++ rqt_root_cbf = ff_hevc_rpi_no_residual_syntax_flag_decode(lc);
++ }
++ if (rqt_root_cbf) {
++ static const int cbf[2] = { 0 };
++ lc->cu.max_trafo_depth = lc->cu.pred_mode == MODE_INTRA ?
++ s->ps.sps->max_transform_hierarchy_depth_intra + lc->cu.intra_split_flag :
++ s->ps.sps->max_transform_hierarchy_depth_inter;
++ ret = hls_transform_tree(s, lc, x0, y0, x0, y0, x0, y0,
++ log2_cb_size,
++ log2_cb_size, 0, 0, cbf, cbf);
++ if (ret < 0)
++ return ret;
++ } else {
++ if (!s->sh.disable_deblocking_filter_flag)
++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size);
++ }
++ }
++ }
++
++ if (s->ps.pps->cu_qp_delta_enabled_flag && lc->tu.is_cu_qp_delta_coded == 0)
++ ff_hevc_rpi_set_qPy(s, lc, x0, y0, log2_cb_size);
++
++ x = y_cb * min_cb_width + x_cb;
++ for (y = 0; y < length; y++) {
++ memset(&s->qp_y_tab[x], lc->qp_y, length);
++ x += min_cb_width;
++ }
++
++ if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 &&
++ ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0) {
++ lc->qPy_pred = lc->qp_y;
++ }
++
++ set_ct_depth(s, x0, y0, log2_cb_size, lc->ct_depth);
++
++ return 0;
++}
++
++// Returns:
++// < 0 Error
++// 0 More data wanted
++// 1 EoSlice / EoPicture
++static int hls_coding_quadtree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0,
++ const int log2_cb_size, const int cb_depth)
++{
++ const int cb_size = 1 << log2_cb_size;
++ int ret;
++ int split_cu;
++
++ lc->ct_depth = cb_depth;
++ if (x0 + cb_size <= s->ps.sps->width &&
++ y0 + cb_size <= s->ps.sps->height &&
++ log2_cb_size > s->ps.sps->log2_min_cb_size) {
++ split_cu = ff_hevc_rpi_split_coding_unit_flag_decode(s, lc, cb_depth, x0, y0);
++ } else {
++ split_cu = (log2_cb_size > s->ps.sps->log2_min_cb_size);
++ }
++ if (s->ps.pps->cu_qp_delta_enabled_flag &&
++ log2_cb_size >= s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_qp_delta_depth) {
++ lc->tu.is_cu_qp_delta_coded = 0;
++ lc->tu.cu_qp_delta = 0;
++ }
++
++ lc->tu.is_cu_chroma_qp_offset_coded = !(s->sh.cu_chroma_qp_offset_enabled_flag &&
++ log2_cb_size >= s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_chroma_qp_offset_depth);
++ lc->tu.cu_qp_offset_cb = 0;
++ lc->tu.cu_qp_offset_cr = 0;
++
++ if (split_cu) {
++ int qp_block_mask = (1<<(s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_qp_delta_depth)) - 1;
++ const int cb_size_split = cb_size >> 1;
++ const int x1 = x0 + cb_size_split;
++ const int y1 = y0 + cb_size_split;
++
++ int more_data = 0;
++
++ more_data = hls_coding_quadtree(s, lc, x0, y0, log2_cb_size - 1, cb_depth + 1);
++ if (more_data < 0)
++ return more_data;
++
++ if (more_data && x1 < s->ps.sps->width) {
++ more_data = hls_coding_quadtree(s, lc, x1, y0, log2_cb_size - 1, cb_depth + 1);
++ if (more_data < 0)
++ return more_data;
++ }
++ if (more_data && y1 < s->ps.sps->height) {
++ more_data = hls_coding_quadtree(s, lc, x0, y1, log2_cb_size - 1, cb_depth + 1);
++ if (more_data < 0)
++ return more_data;
++ }
++ if (more_data && x1 < s->ps.sps->width &&
++ y1 < s->ps.sps->height) {
++ more_data = hls_coding_quadtree(s, lc, x1, y1, log2_cb_size - 1, cb_depth + 1);
++ if (more_data < 0)
++ return more_data;
++ }
++
++ if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 &&
++ ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0)
++ lc->qPy_pred = lc->qp_y;
++
++ if (more_data)
++ return ((x1 + cb_size_split) < s->ps.sps->width ||
++ (y1 + cb_size_split) < s->ps.sps->height);
++ else
++ return 0;
++ } else {
++ ret = hls_coding_unit(s, lc, x0, y0, log2_cb_size);
++ if (ret < 0)
++ return ret;
++ if ((!((x0 + cb_size) %
++ (1 << (s->ps.sps->log2_ctb_size))) ||
++ (x0 + cb_size >= s->ps.sps->width)) &&
++ (!((y0 + cb_size) %
++ (1 << (s->ps.sps->log2_ctb_size))) ||
++ (y0 + cb_size >= s->ps.sps->height))) {
++ int end_of_slice_flag = ff_hevc_rpi_end_of_slice_flag_decode(lc);
++ return !end_of_slice_flag;
++ } else {
++ return 1;
++ }
++ }
++
++ return 0; // NEVER
++}
++
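++// Per-CTB neighbour setup: record the slice address for this CTB, work out
++// the tile/slice boundary flags and hence which neighbour CTBs (left, up,
++// up-left, up-right) are available, and reset qPy_pred at the start of a
++// tile or (with WPP) at the start of a CTB row.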
++static void hls_decode_neighbour(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const int x_ctb, const int y_ctb, const int ctb_addr_ts)
++{
++ const int ctb_size = 1 << s->ps.sps->log2_ctb_size;
++ const int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++ const int ctb_addr_in_slice = ctb_addr_rs - s->sh.slice_addr; // slice_addr = RS addr of start of slice
++ const int idxX = s->ps.pps->col_idxX[x_ctb >> s->ps.sps->log2_ctb_size];
++
++ s->tab_slice_address[ctb_addr_rs] = s->sh.slice_addr;
++
++ lc->end_of_tiles_x = idxX + 1 >= s->ps.pps->num_tile_columns ? s->ps.sps->width :
++ (s->ps.pps->col_bd[idxX + 1] << s->ps.sps->log2_ctb_size);
++
++ if (ctb_addr_ts == 0 || s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[ctb_addr_ts - 1] ||
++ (s->ps.pps->entropy_coding_sync_enabled_flag && (x_ctb >> s->ps.sps->log2_ctb_size) == s->ps.pps->col_bd[idxX]))
++ {
++// lc->first_qp_group = 1;
++ lc->qPy_pred = s->sh.slice_qp;
++ }
++
++ lc->end_of_tiles_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height);
++
++ lc->boundary_flags = 0;
++
++ if (x_ctb <= 0 || s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - 1]])
++ lc->boundary_flags |= BOUNDARY_LEFT_TILE;
++ if (x_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - 1])
++ lc->boundary_flags |= BOUNDARY_LEFT_SLICE;
++ if (y_ctb <= 0 || s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]])
++ lc->boundary_flags |= BOUNDARY_UPPER_TILE;
++ if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - s->ps.sps->ctb_width])
++ lc->boundary_flags |= BOUNDARY_UPPER_SLICE;
++
++ lc->ctb_left_flag = (lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0;
++ lc->ctb_up_flag = (lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0;
++ lc->ctb_up_left_flag = (lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 &&
++ (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width);
++
++ lc->ctb_up_right_flag = ((y_ctb > 0) && (x_ctb + ctb_size) < lc->end_of_tiles_x &&
++ (ctb_addr_in_slice+1 >= s->ps.sps->ctb_width) &&
++ (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1 - s->ps.sps->ctb_width]]));
++}
++
++
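++// Filter pass for a completed job: runs the HLS filter over the job's CTBs,
++// held back one CTB from the right/bottom edges unless at the frame edge,
++// writes the filtered region back out of the ARM cache (for SAO), signals
++// recon progress to any waiting frame threads, then frees the job.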
++static void rpi_execute_dblk_cmds(HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size;
++ const unsigned int x0 = FFMAX(jb->bounds.x, ctb_size) - ctb_size;
++ const unsigned int y0 = FFMAX(jb->bounds.y, ctb_size) - ctb_size;
++ const unsigned int bound_r = jb->bounds.x + jb->bounds.w;
++ const unsigned int bound_b = jb->bounds.y + jb->bounds.h;
++ const int x_end = (bound_r >= s->ps.sps->width);
++ const int y_end = (bound_b >= s->ps.sps->height);
++ const unsigned int xr = bound_r - (x_end ? 0 : ctb_size);
++ const unsigned int yb = bound_b - (y_end ? 0 : ctb_size);
++ unsigned int x, y;
++
++ for (y = y0; y < yb; y += ctb_size ) {
++ for (x = x0; x < xr; x += ctb_size ) {
++ ff_hevc_rpi_hls_filter(s, x, y, ctb_size);
++ }
++ }
++
++ // Flush (SAO)
++ if (y > y0) {
++ const int tile_end = y_end ||
++ s->ps.pps->tile_id[jb->ctu_ts_last] != s->ps.pps->tile_id[jb->ctu_ts_last + 1];
++ const unsigned int xl = x0 > ctb_size ? x0 - ctb_size : 0;
++ const unsigned int yt = y0 > ctb_size ? y0 - ctb_size : 0;
++ const unsigned int yb = tile_end ? bound_b : y - ctb_size;
++
++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
++ xl, yt, bound_r - xl, yb - yt,
++ ctx_vshift(s, 1), 1, 1);
++ rpi_cache_flush_finish(rfe);
++ }
++
++ // Signal
++ if (s->threads_type == FF_THREAD_FRAME && x_end && y0 > 0) {
++ ff_hevc_rpi_progress_signal_recon(s, y_end ? INT_MAX : y0 - 1);
++ }
++
++ // Job done now
++ // ? Move outside this fn
++ job_free(s->jbc, jb);
++}
++
++
++// I-pred, transform_and_add for all blocks types done here
++// All ARM
++static void rpi_execute_pred_cmds(HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++ unsigned int i;
++ HEVCRpiIntraPredEnv * const iap = &jb->intra;
++ const HEVCPredCmd *cmd = iap->cmds;
++
++ for (i = iap->n; i > 0; i--, cmd++)
++ {
++ switch (cmd->type)
++ {
++ case RPI_PRED_INTRA:
++ {
++ HEVCRpiLocalContextIntra lci; // Abbreviated local context
++ HEVCRpiLocalContext * const lc = (HEVCRpiLocalContext *)&lci;
++ lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->i_pred.mode;
++ lc->na.cand_bottom_left = (cmd->na >> 4) & 1;
++ lc->na.cand_left = (cmd->na >> 3) & 1;
++ lc->na.cand_up_left = (cmd->na >> 2) & 1;
++ lc->na.cand_up = (cmd->na >> 1) & 1;
++ lc->na.cand_up_right = (cmd->na >> 0) & 1;
++ if (cmd->c_idx == 0)
++ s->hpc.intra_pred[cmd->size - 2](s, lc, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx);
++ else
++ s->hpc.intra_pred_c[cmd->size - 2](s, lc, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx);
++ break;
++ }
++
++ case RPI_PRED_ADD_RESIDUAL:
++ s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
++ break;
++ case RPI_PRED_ADD_DC:
++ s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
++ break;
++ case RPI_PRED_ADD_RESIDUAL_U:
++ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
++ break;
++ case RPI_PRED_ADD_RESIDUAL_V:
++ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
++ break;
++ case RPI_PRED_ADD_RESIDUAL_C:
++ s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
++ break;
++ case RPI_PRED_ADD_DC_U:
++ case RPI_PRED_ADD_DC_V:
++ s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
++ break;
++
++ case RPI_PRED_I_PCM:
++ pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size);
++ break;
++
++ default:
++ av_log(NULL, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type);
++ abort();
++ }
++ }
++
++ // Mark done
++ iap->n = 0;
++}
++
++
++// Set initial uniform job values & zero ctu_count
++static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first)
++{
++ unsigned int i;
++ HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip;
++ HEVCRpiInterPredEnv *const yipe = &jb->luma_ip;
++ const HEVCRpiSPS * const sps = s->ps.sps;
++
++ const uint16_t pic_width_y = sps->width;
++ const uint16_t pic_height_y = sps->height;
++
++ const uint16_t pic_width_c = sps->width >> ctx_hshift(s, 1);
++ const uint16_t pic_height_c = sps->height >> ctx_vshift(s, 1);
++
++ // We expect the pointer to change if we use another sps
++ if (sps != jb->sps)
++ {
++ worker_pic_free_one(jb);
++
++ set_ipe_from_ici(cipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].chroma);
++ set_ipe_from_ici(yipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].luma);
++
++ {
++ const int coefs_per_luma = HEVC_MAX_CTB_SIZE * HEVC_RPI_MAX_WIDTH;
++ const int coefs_per_chroma = (coefs_per_luma * 2) >> (ctx_vshift(s, 1) + ctx_hshift(s, 1));
++ worker_pic_alloc_one(jb, coefs_per_luma + coefs_per_chroma);
++ }
++
++ jb->sps = sps;
++ }
++
++ jb->waited = 0;
++ jb->ctu_ts_first = ctu_ts_first;
++ jb->ctu_ts_last = -1;
++
++ rpi_inter_pred_reset(cipe);
++ for (i = 0; i < cipe->n; i++) {
++ HEVCRpiInterPredQ * const cp = cipe->q + i;
++ qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s;
++
++ u->next_src1.x = 0;
++ u->next_src1.y = 0;
++ u->next_src1.base = 0;
++ u->pic_cw = pic_width_c;
++ u->pic_ch = pic_height_c;
++ u->stride2 = av_rpi_sand_frame_stride2(s->frame);
++ u->stride1 = av_rpi_sand_frame_stride1(s->frame);
++ u->wdenom = s->sh.chroma_log2_weight_denom;
++ cp->last_l0 = &u->next_src1;
++
++ u->next_fn = 0;
++ u->next_src2.x = 0;
++ u->next_src2.y = 0;
++ u->next_src2.base = 0;
++ cp->last_l1 = &u->next_src2;
++
++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1);
++ }
++
++ rpi_inter_pred_reset(yipe);
++ for (i = 0; i < yipe->n; i++) {
++ HEVCRpiInterPredQ * const yp = yipe->q + i;
++ qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s;
++
++ y->next_src1.x = 0;
++ y->next_src1.y = 0;
++ y->next_src1.base = 0;
++ y->next_src2.x = 0;
++ y->next_src2.y = 0;
++ y->next_src2.base = 0;
++ y->pic_h = pic_height_y;
++ y->pic_w = pic_width_y;
++ y->stride2 = av_rpi_sand_frame_stride2(s->frame);
++ y->stride1 = av_rpi_sand_frame_stride1(s->frame);
++ y->wdenom = s->sh.luma_log2_weight_denom;
++ y->next_fn = 0;
++ yp->last_l0 = &y->next_src1;
++ yp->last_l1 = &y->next_src2;
++
++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1);
++ }
++
++ jb->last_y8_p = NULL;
++ jb->last_y8_l1 = NULL;
++
++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) {
++ jb->progress_req[i] = -1;
++ }
++
++ worker_pic_reset(&jb->coeffs);
++}
++
++
++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
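++// Finish off one inter-pred command queue set for submission: append each
++// QPU's exit code, point the trailing L0/L1 fetches at a dummy frame (they
++// are still prefetched, so must be valid), write the command/uniform Qs
++// back to memory and add the QPU start addresses to the mailbox list.
++// Returns 0 if the prediction env was unused, else 1.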
++static unsigned int mc_terminate_add_qpu(const HEVCRpiContext * const s,
++ const vpu_qpu_job_h vqj,
++ rpi_cache_flush_env_t * const rfe,
++ HEVCRpiInterPredEnv * const ipe)
++{
++ unsigned int i;
++ uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS];
++ unsigned int max_block = 0;
++
++ if (!ipe->used) {
++ return 0;
++ }
++
++ if (ipe->curr != 0) {
++ rpi_inter_pred_sync(ipe);
++ }
++
++ // Add final commands to Q
++ for(i = 0; i != ipe->n; ++i) {
++ HEVCRpiInterPredQ * const yp = ipe->q + i;
++ qpu_mc_src_t *const p0 = yp->last_l0;
++ qpu_mc_src_t *const p1 = yp->last_l1;
++ const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base;
++
++ if (block_size > max_block)
++ max_block = block_size;
++
++ yp->qpu_mc_curr->data[-1] = yp->code_exit;
++
++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
++ p0->x = MC_DUMMY_X;
++ p0->y = MC_DUMMY_Y;
++ p0->base = s->qpu_dummy_frame_qpu;
++ p1->x = MC_DUMMY_X;
++ p1->y = MC_DUMMY_Y;
++ p1->base = s->qpu_dummy_frame_qpu;
++
++ yp->last_l0 = NULL;
++ yp->last_l1 = NULL;
++
++ // Add to mailbox list
++ mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm);
++ mail[i][1] = yp->code_setup;
++ }
++
++ // We don't need invalidate here as the uniforms aren't changed by the QPU
++ // and leaving them in ARM cache avoids (pointless) pre-reads when writing
++ // new values which seems to give us a small performance advantage
++ //
++ // In most cases we will not have a completely packed set of uniforms and as
++ // we have a 2d invalidate we writeback all uniform Qs to the depth of the
++ // fullest
++ rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK,
++ (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block,
++ ipe->n, ipe->max_fill + ipe->min_gap);
++ vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail);
++
++ return 1;
++}
++#endif
++
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++static unsigned int mc_terminate_add_emu(const HEVCRpiContext * const s,
++ const vpu_qpu_job_h vqj,
++ rpi_cache_flush_env_t * const rfe,
++ HEVCRpiInterPredEnv * const ipe)
++{
++ unsigned int i;
++ if (!ipe->used) {
++ return 0;
++ }
++
++ if (ipe->curr != 0) {
++ rpi_inter_pred_sync(ipe);
++ }
++
++ // Add final commands to Q
++ for(i = 0; i != ipe->n; ++i) {
++ HEVCRpiInterPredQ * const yp = ipe->q + i;
++ qpu_mc_src_t *const p0 = yp->last_l0;
++ qpu_mc_src_t *const p1 = yp->last_l1;
++
++ yp->qpu_mc_curr->data[-1] = yp->code_exit;
++
++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
++ p0->x = MC_DUMMY_X;
++ p0->y = MC_DUMMY_Y;
++ p0->base = s->qpu_dummy_frame_emu;
++ p1->x = MC_DUMMY_X;
++ p1->y = MC_DUMMY_Y;
++ p1->base = s->qpu_dummy_frame_emu;
++
++ yp->last_l0 = NULL;
++ yp->last_l1 = NULL;
++ }
++
++ return 1;
++}
++#endif
++
++
++#if RPI_QPU_EMU_Y
++#define mc_terminate_add_y mc_terminate_add_emu
++#else
++#define mc_terminate_add_y mc_terminate_add_qpu
++#endif
++#if RPI_QPU_EMU_C
++#define mc_terminate_add_c mc_terminate_add_emu
++#else
++#define mc_terminate_add_c mc_terminate_add_qpu
++#endif
++
++
++static void flush_frame(HEVCRpiContext *s,AVFrame *frame)
++{
++ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init();
++ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
++ rpi_cache_flush_finish(rfe);
++}
++
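++// Compute the job's bounding rectangle in pels from its first & last CTB
++// raster addresses.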
++static void job_gen_bounds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++ const unsigned int rs0 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_first];
++ const unsigned int rs1 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_last];
++ const unsigned int ctb_width = s->ps.sps->ctb_width;
++ RpiBlk *const bounds = &jb->bounds;
++ av_assert1(jb->ctu_ts_first <= jb->ctu_ts_last);
++ bounds->x = (rs0 % ctb_width) << s->ps.sps->log2_ctb_size;
++ bounds->y = (rs0 / ctb_width) << s->ps.sps->log2_ctb_size;
++ bounds->w = ((rs1 - rs0) % ctb_width + 1) << s->ps.sps->log2_ctb_size;
++ bounds->h = ((rs1 - rs0) / ctb_width + 1) << s->ps.sps->log2_ctb_size;
++}
++
++#if RPI_PASSES == 2
++static void worker_core2(HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++ // Perform intra prediction and residual reconstruction
++ rpi_execute_pred_cmds(s, jb);
++
++ // Perform deblocking for CTBs in this row
++ rpi_execute_dblk_cmds(s, jb);
++}
++#endif
++
++
++// Core execution tasks
++static void worker_core(HEVCRpiContext * const s0, HEVCRpiJob * const jb)
++{
++ const HEVCRpiContext * const s = s0;
++ vpu_qpu_wait_h sync_y;
++ int pred_y, pred_c;
++ const vpu_qpu_job_h vqj = vpu_qpu_job_new();
++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
++
++ {
++ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
++ if (cf->s[3].n + cf->s[2].n != 0)
++ {
++ const unsigned int csize = sizeof(cf->s[3].buf[0]);
++ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize;
++ vpu_qpu_job_add_vpu(vqj,
++ vpu_get_fn(s->ps.sps->bit_depth),
++ vpu_get_constants(),
++ cf->gptr.vc,
++ cf->s[2].n >> 8,
++ cf->gptr.vc + offset32,
++ cf->s[3].n >> 10,
++ 0);
++
++ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize);
++ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize);
++ }
++ }
++
++ pred_c = mc_terminate_add_c(s, vqj, rfe, &jb->chroma_ip);
++
++// We could take a sync here and try to locally overlap QPU processing with ARM
++// but testing showed a slightly negative benefit with noticeable extra complexity
++
++ pred_y = mc_terminate_add_y(s, vqj, rfe, &jb->luma_ip);
++
++ vpu_qpu_job_add_sync_this(vqj, &sync_y);
++
++ rpi_cache_flush_execute(rfe);
++
++ // Await progress as required
++ // jb->waited will only be clear if we have already tested the progress values
++ // (in worker_submit_job) and found we don't have to wait
++ if (jb->waited)
++ {
++ unsigned int i;
++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) {
++ if (jb->progress_req[i] >= 0) {
++ ff_hevc_rpi_progress_wait_recon(s, jb, s->DPB + i, jb->progress_req[i]);
++ }
++ }
++ }
++
++ vpu_qpu_job_finish(vqj);
++
++ // We always work on a rectangular block
++ if (pred_y || pred_c)
++ {
++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE,
++ jb->bounds.x, jb->bounds.y, jb->bounds.w, jb->bounds.h,
++ ctx_vshift(s, 1), pred_y, pred_c);
++ }
++
++ // If we have emulated VPU ops - do it here
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++ if (av_rpi_is_sand8_frame(s->frame))
++ {
++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
++ ff_hevc_rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip);
++#elif RPI_QPU_EMU_Y
++ ff_hevc_rpi_shader_c8(s, &jb->luma_ip, NULL);
++#else
++ ff_hevc_rpi_shader_c8(s, NULL, &jb->chroma_ip);
++#endif
++ }
++ else
++ {
++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
++ ff_hevc_rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip);
++#elif RPI_QPU_EMU_Y
++ ff_hevc_rpi_shader_c16(s, &jb->luma_ip, NULL);
++#else
++ ff_hevc_rpi_shader_c16(s, NULL, &jb->chroma_ip);
++#endif
++ }
++#endif
++
++ // Wait for transform completion
++ // ? Could/should be moved to next pass which would let us add more jobs
++ // to the VPU Q on this thread but when I tried that it all went a bit slower
++ vpu_qpu_wait(&sync_y);
++
++ rpi_cache_flush_finish(rfe);
++}
++
++
++static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe)
++{
++ av_freep(&ipe->q);
++ gpu_free(&ipe->gptr);
++}
++
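++// Allocate & init a job: progress wait state, intra pred command array and
++// the luma & chroma QPU command buffers.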
++static HEVCRpiJob * job_new(void)
++{
++ HEVCRpiJob * const jb = av_mallocz(sizeof(HEVCRpiJob));
++
++ ff_hevc_rpi_progress_init_wait(&jb->progress_wait);
++
++ jb->intra.n = 0;
++ jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS);
++
++ // * Sizeof the union structure might be overkill but at the moment it
++ // is correct (it certainly isn't going to be too small)
++ // *** really should add per ctu sync words to be accurate
++
++ rpi_inter_pred_alloc(&jb->chroma_ip,
++ QPU_N_MAX, QPU_N_GRP,
++ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t),
++ QPU_C_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_c_t));
++ rpi_inter_pred_alloc(&jb->luma_ip,
++ QPU_N_MAX, QPU_N_GRP,
++ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t),
++ QPU_Y_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_y_t));
++
++ return jb;
++}
++
++static void job_delete(HEVCRpiJob * const jb)
++{
++ worker_pic_free_one(jb);
++ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait);
++ av_freep(&jb->intra.cmds);
++ rpi_free_inter_pred(&jb->chroma_ip);
++ rpi_free_inter_pred(&jb->luma_ip);
++}
++
++static void jbg_delete(HEVCRpiJobGlobal * const jbg)
++{
++ HEVCRpiJob * jb;
++
++ if (jbg == NULL)
++ return;
++
++ jb = jbg->free1;
++ while (jb != NULL)
++ {
++ HEVCRpiJob * const jb2 = jb;
++ jb = jb2->next;
++ job_delete(jb2);
++ }
++
++ pthread_mutex_destroy(&jbg->lock);
++ av_free(jbg);
++}
++
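++// Allocate the global job pool with job_count jobs pre-built on the free list.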
++static HEVCRpiJobGlobal * jbg_new(unsigned int job_count)
++{
++ HEVCRpiJobGlobal * const jbg = av_mallocz(sizeof(HEVCRpiJobGlobal));
++ if (jbg == NULL)
++ return NULL;
++
++ pthread_mutex_init(&jbg->lock, NULL);
++
++ while (job_count-- != 0)
++ {
++ HEVCRpiJob * const jb = job_new();
++ if (jb == NULL)
++ goto fail;
++
++ jb->next = jbg->free1;
++ jbg->free1 = jb;
++ }
++
++ return jbg;
++
++fail:
++ jbg_delete(jbg);
++ return NULL;
++}
++
++static void rpi_job_ctl_delete(HEVCRpiJobCtl * const jbc)
++{
++ HEVCRpiJobGlobal * jbg;
++
++ if (jbc == NULL)
++ return;
++
++ jbg = jbc->jbg;
++
++ if (jbc->jb1 != NULL)
++ job_delete(jbc->jb1);
++
++ pthread_mutex_destroy(&jbc->in_lock);
++ sem_destroy(&jbc->sem_out);
++ av_free(jbc);
++
++ // Deref the global job context
++ if (jbg != NULL && atomic_fetch_add(&jbg->ref_count, -1) == 1)
++ jbg_delete(jbg);
++}
++
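++// Per-stream job control: takes a ref on the global job pool and
++// pre-allocates one local job (jb1).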
++static HEVCRpiJobCtl * rpi_job_ctl_new(HEVCRpiJobGlobal *const jbg)
++{
++ HEVCRpiJobCtl * const jbc = av_mallocz(sizeof(HEVCRpiJobCtl));
++
++ if (jbc == NULL)
++ return NULL;
++
++ jbc->jbg = jbg;
++ atomic_fetch_add(&jbg->ref_count, 1);
++
++ sem_init(&jbc->sem_out, 0, RPI_MAX_JOBS);
++ pthread_mutex_init(&jbc->in_lock, NULL);
++
++ if ((jbc->jb1 = job_new()) == NULL)
++ goto fail;
++ jbc->jb1->jbc_local = jbc;
++
++ return jbc;
++
++fail:
++ rpi_job_ctl_delete(jbc);
++ return NULL;
++}
++
++
++
++static av_cold void hevc_init_worker(HEVCRpiContext * const s)
++{
++#if RPI_PASSES == 2
++ pass_queue_init(s->passq + 1, s, worker_core2, &s->jbc->sem_out, 1);
++#elif RPI_PASSES == 3
++ pass_queue_init(s->passq + 2, s, rpi_execute_dblk_cmds, &s->jbc->sem_out, 2);
++ pass_queue_init(s->passq + 1, s, rpi_execute_pred_cmds, &s->passq[2].sem_in, 1);
++#else
++#error Passes confused
++#endif
++ pass_queue_init(s->passq + 0, s, worker_core, &s->passq[1].sem_in, 0);
++
++ pass_queues_start_all(s);
++}
++
++static av_cold void hevc_exit_worker(HEVCRpiContext *s)
++{
++ pass_queues_term_all(s);
++
++ pass_queues_kill_all(s);
++
++ rpi_job_ctl_delete(s->jbc);
++ s->jbc = NULL;
++}
++
++
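++// Validate the slice segment start (dependent segment / entry point / tile
++// alignment checks) and set up the local context for decode.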
++static int slice_start(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc)
++{
++ const int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
++ const int tiles = s->ps.pps->num_tile_rows * s->ps.pps->num_tile_columns;
++
++ // Check for obvious disasters
++ if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
++ av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ if (s->sh.dependent_slice_segment_flag) {
++ int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1];
++ if (s->tab_slice_address[prev_rs] != s->sh.slice_addr) {
++ av_log(s->avctx, AV_LOG_ERROR, "Previous slice segment missing\n");
++ return AVERROR_INVALIDDATA;
++ }
++ }
++
++ if (!s->ps.pps->entropy_coding_sync_enabled_flag &&
++ s->ps.pps->tile_id[ctb_addr_ts] + s->sh.num_entry_point_offsets >= tiles)
++ {
++ av_log(s->avctx, AV_LOG_ERROR, "Entry points exceed tiles\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ // Tiled stuff must start at start of tile if it has multiple entry points
++ if (!s->ps.pps->entropy_coding_sync_enabled_flag &&
++ s->sh.num_entry_point_offsets != 0 &&
++ s->sh.slice_ctb_addr_rs != s->ps.pps->tile_pos_rs[s->ps.pps->tile_id[ctb_addr_ts]])
++ {
++ av_log(s->avctx, AV_LOG_ERROR, "Multiple tiles in slice; slice start != tile start\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ // Setup any required decode vars
++ if (!s->sh.dependent_slice_segment_flag)
++ lc->qPy_pred = s->sh.slice_qp;
++
++ lc->qp_y = s->sh.slice_qp;
++
++ // General setup
++ lc->wpp_init = 0;
++ lc->bt_line_no = 0;
++ lc->ts = ctb_addr_ts;
++ return 0;
++}
++
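++// Turn the slice header entry point offsets, which count bytes of the raw
++// (unstripped) stream, into offset/size pairs into the stripped NAL data by
++// discounting the emulation prevention bytes recorded in skipped_bytes_pos.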
++static int gen_entry_points(HEVCRpiContext * const s, const H2645NAL * const nal)
++{
++ const GetBitContext * const gb = &s->HEVClc->gb;
++ int i, j;
++
++ const unsigned int length = nal->size;
++ unsigned int offset = ((gb->index) >> 3) + 1; // We have a bit & align still to come = +1 byte
++ unsigned int cmpt;
++ unsigned int startheader;
++
++ if (s->sh.num_entry_point_offsets == 0) {
++ return 0;
++ }
++
++ for (j = 0, cmpt = 0, startheader = offset + s->sh.entry_point_offset[0]; j < nal->skipped_bytes; j++) {
++ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
++ startheader--;
++ cmpt++;
++ }
++ }
++
++ for (i = 1; i < s->sh.num_entry_point_offsets; i++) {
++ offset += (s->sh.entry_point_offset[i - 1] - cmpt);
++ for (j = 0, cmpt = 0, startheader = offset
++ + s->sh.entry_point_offset[i]; j < nal->skipped_bytes; j++) {
++ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
++ startheader--;
++ cmpt++;
++ }
++ }
++ s->sh.size[i - 1] = s->sh.entry_point_offset[i] - cmpt;
++ s->sh.offset[i - 1] = offset;
++ }
++ if (s->sh.num_entry_point_offsets != 0) {
++ offset += s->sh.entry_point_offset[s->sh.num_entry_point_offsets - 1] - cmpt;
++ if (length < offset) {
++ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n");
++ return AVERROR_INVALIDDATA;
++ }
++ s->sh.size[s->sh.num_entry_point_offsets - 1] = length - offset;
++ s->sh.offset[s->sh.num_entry_point_offsets - 1] = offset;
++ }
++ s->data = nal->data;
++ return 0;
++}
++
++
++// Return
++// < 0 Error
++// 0 OK
++//
++// jb->ctu_ts_last < 0 Job still filling
++// jb->ctu_ts_last >= 0 Job ready
++
++static int fill_job(HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, unsigned int max_blocks)
++{
++ const int ctb_size = (1 << s->ps.sps->log2_ctb_size);
++ HEVCRpiJob * const jb = lc->jb0;
++ int more_data = 1;
++ int ctb_addr_ts = lc->ts;
++
++ lc->unit_done = 0;
++ while (more_data && ctb_addr_ts < s->ps.sps->ctb_size)
++ {
++ const int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++ const int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size;
++ const int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size;
++ int q_full;
++
++ hls_decode_neighbour(s, lc, x_ctb, y_ctb, ctb_addr_ts);
++
++ ff_hevc_rpi_cabac_init(s, lc, ctb_addr_ts);
++
++ hls_sao_param(s, lc, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size);
++
++ s->deblock[ctb_addr_rs].beta_offset = s->sh.beta_offset;
++ s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset;
++ s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag;
++
++ more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
++
++ if (more_data < 0) {
++ s->tab_slice_address[ctb_addr_rs] = -1;
++ return more_data;
++ }
++
++ // Inc TS to next.
++ // N.B. None of the other position vars have changed
++ ctb_addr_ts++;
++ ff_hevc_rpi_save_states(s, lc, ctb_addr_ts);
++
++ // Report progress so we can use our MVs in other frames
++ if (s->threads_type == FF_THREAD_FRAME && x_ctb + ctb_size >= s->ps.sps->width) {
++ ff_hevc_rpi_progress_signal_mv(s, y_ctb + ctb_size - 1);
++ }
++
++ // End of line || End of tile line || End of tile
++ // (EoL covers end of frame for our purposes here)
++ q_full = x_ctb + ctb_size >= s->ps.sps->width ||
++ s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts] != ctb_addr_rs + 1 ||
++ s->ps.pps->tile_id[ctb_addr_ts - 1] != s->ps.pps->tile_id[ctb_addr_ts];
++
++ // Allocate QPU chunks on fixed-size 64 pel boundaries rather than
++ // whatever ctb_size is today.
++ // * We might quite like to continue to 64 pel vertical too but that
++ // currently confuses WPP
++ if (((x_ctb + ctb_size) & 63) == 0 || q_full)
++ {
++ int overflow = 0;
++ if (rpi_inter_pred_next_ctu(&jb->luma_ip) != 0)
++ overflow = 1;
++ if (rpi_inter_pred_next_ctu(&jb->chroma_ip) != 0)
++ overflow = 1;
++ if (overflow)
++ {
++ // * This is very annoying (and slow) to cope with in WPP so
++ // we treat it as an error there (no known stream triggers this
++ // with the current buffer sizes). Non-wpp should cope fine.
++ av_log(s, AV_LOG_WARNING, "%s: Q full before EoL\n", __func__);
++ q_full = 1;
++ }
++ }
++
++ if (q_full)
++ {
++ // Do job
++ // Prep for submission
++ jb->ctu_ts_last = ctb_addr_ts - 1; // Was pre-incremented above
++ job_gen_bounds(s, jb);
++ break;
++ }
++
++ // If max_blocks started as 0 then this will never be true
++ if (--max_blocks == 0)
++ break;
++ }
++
++ lc->unit_done = (more_data <= 0);
++ lc->ts = ctb_addr_ts;
++ return 0;
++}
++
++static void bt_lc_init(HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const unsigned int n)
++{
++ lc->context = s;
++ lc->jb0 = NULL;
++ lc->lc_n = n;
++ lc->bt_terminate = 0;
++ lc->bt_psem_out = NULL;
++ sem_init(&lc->bt_sem_in, 0, 0);
++}
++
++#define TRACE_WPP 0
++#if RPI_EXTRA_BIT_THREADS > 0
++static inline unsigned int line_ts_width(const HEVCRpiContext * const s, unsigned int ts)
++{
++ unsigned int rs = s->ps.pps->ctb_addr_ts_to_rs[ts];
++ return s->ps.pps->column_width[s->ps.pps->col_idxX[rs % s->ps.sps->ctb_width]];
++}
++
++// Move local context parameters from an aux bit thread back to the main
++// thread at the end of a slice as processing is going to continue there.
++static void movlc(HEVCRpiLocalContext *const dst_lc, HEVCRpiLocalContext *const src_lc, const int is_dep)
++{
++ if (src_lc == dst_lc) {
++ return;
++ }
++
++ // Move the job
++ // We will still have an active job if the final line terminates early
++ // Dest should always be null by now
++ av_assert1(dst_lc->jb0 == NULL);
++ dst_lc->jb0 = src_lc->jb0;
++ src_lc->jb0 = NULL;
++
++ // Always need to store where we are in the bitstream
++ dst_lc->ts = src_lc->ts;
++ dst_lc->gb = src_lc->gb;
++ // Need to store context if we might have a dependent seg
++ if (is_dep)
++ {
++ dst_lc->qPy_pred = src_lc->qPy_pred;
++ memcpy(dst_lc->cabac_state, src_lc->cabac_state, sizeof(src_lc->cabac_state));
++ memcpy(dst_lc->stat_coeff, src_lc->stat_coeff, sizeof(src_lc->stat_coeff));
++ }
++}
++
++static inline int wait_bt_sem_in(HEVCRpiLocalContext * const lc)
++{
++ rpi_sem_wait(&lc->bt_sem_in);
++ return lc->bt_terminate;
++}
++
++// Do one WPP line
++// Will not work correctly over horizontal tile boundaries - vertical should be OK
++static int rpi_run_one_line(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc, const int is_first)
++{
++ const int is_tile = lc->bt_is_tile;
++ const unsigned int tile_id = s->ps.pps->tile_id[lc->ts];
++ const unsigned int line = lc->bt_line_no;
++ const unsigned int line_inc = lc->bt_line_inc;
++ const int is_last = (line >= lc->bt_last_line);
++
++ const unsigned int ts_eol = lc->ts + (is_tile ? s->ps.pps->tile_size[tile_id] : lc->bt_line_width);
++ const unsigned int ts_next =
++ line + line_inc > (unsigned int)s->sh.num_entry_point_offsets ?
++ INT_MAX :
++ is_tile ?
++ s->ps.pps->ctb_addr_rs_to_ts[s->ps.pps->tile_pos_rs[tile_id + line_inc]] :
++ lc->ts + lc->bt_line_width * line_inc;
++ // Tile wants line, WPP a few CTUs (must be >= 2 for cabac context to work)
++ const unsigned int partial_size = is_tile ? line_ts_width(s, lc->ts) : 2;
++ unsigned int ts_prev;
++ int loop_n = 0;
++ int err = 0;
++
++ av_assert1(line <= s->sh.num_entry_point_offsets);
++
++#if TRACE_WPP
++ printf("%s[%d]: Start %s: tile=%d, line=%d/%d/%d, ts=%d/%d/%d, width=%d, jb=%p\n", __func__,
++ lc->lc_n, is_tile ? "Tile" : "WPP", tile_id,
++ line, lc->bt_last_line, s->sh.num_entry_point_offsets,
++ lc->ts, ts_eol, ts_next, partial_size, lc->jb0);
++#endif
++ if (line != 0)
++ {
++ const uint8_t * const data = s->data + s->sh.offset[line - 1];
++ const unsigned int len = s->sh.size[line - 1];
++ if ((err = init_get_bits8(&lc->gb, data, len)) < 0)
++ return err;
++
++ ff_init_cabac_decoder(&lc->cc, data, len);
++
++ lc->wpp_init = 1; // Stop ff_hevc_rpi_cabac_init trying to read non-existent termination bits
++ }
++
++ // We should never be processing a dependent slice here so reset is good
++ // ?? These probably shouldn't be needed (as they should be set by later
++ // logic) but do seem to be required
++ lc->qPy_pred = s->sh.slice_qp;
++ lc->qp_y = s->sh.slice_qp;
++
++ do
++ {
++ if (!is_last && loop_n > 1) {
++#if TRACE_WPP
++ printf("%s[%d]: %sPoke %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", lc->bt_psem_out);
++#endif
++ sem_post(lc->bt_psem_out);
++ }
++ if (!is_first && loop_n != 0)
++ {
++#if TRACE_WPP
++ printf("%s[%d]: %sWait %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", &lc->bt_sem_in);
++#endif
++ if (wait_bt_sem_in(lc) != 0)
++ return AVERROR_EXIT;
++ }
++
++#if TRACE_WPP
++ {
++ int n;
++ sem_getvalue(&lc->bt_sem_in, &n);
++ printf("%s[%d]: ts=%d, sem=%d %p\n", __func__, lc->lc_n, lc->ts, n, &lc->bt_sem_in);
++ }
++#endif
++
++ ts_prev = lc->ts;
++
++ // If we have had an error - do no further decode but do continue
++ // moving signals around so the other threads continue to operate
++ // correctly (or at least as correctly as they can with this line missing)
++ //
++ // Errors in WPP/Tile are less fatal than normal as we have a good idea
++ // of how to restart on the next line so there is no need to give up totally
++ if (err != 0)
++ {
++ lc->unit_done = 0;
++ lc->ts += partial_size;
++ }
++ else
++ {
++ worker_pass0_ready(s, lc);
++
++ if ((err = fill_job(s, lc, partial_size)) < 0 ||
++ (lc->ts < ts_eol && !is_last && (lc->ts != ts_prev + partial_size || lc->unit_done)))
++ {
++ if (err == 0) {
++ av_log(s, AV_LOG_ERROR, "Unexpected end of tile/wpp section\n");
++ err = AVERROR_INVALIDDATA;
++ }
++ worker_free(s, lc);
++ lc->ts = ts_prev + partial_size; // Pretend we did all that
++ lc->unit_done = 0;
++ }
++ else if (is_tile)
++ {
++ worker_submit_job(s, lc);
++ }
++ }
++
++ ++loop_n;
++ } while (lc->ts < ts_eol && !lc->unit_done);
++
++ // If we are on the last line & we didn't get a whole line we must wait for
++ // and sink the sem_posts from the line above / tile to the left.
++ while ((ts_prev += partial_size) < ts_eol)
++ {
++#if TRACE_WPP
++ printf("%s[%d]: EOL Wait: ts=%d %p\n", __func__, lc->lc_n, ts_prev, &lc->bt_sem_in);
++#endif
++ if (wait_bt_sem_in(lc) != 0)
++ return AVERROR_EXIT;
++ }
++
++ lc->bt_line_no += line_inc;
++
++ if (!is_tile && err == 0)
++ worker_submit_job(s, lc);
++
++ if (!is_last) {
++ lc->ts = ts_next;
++
++#if TRACE_WPP
++ printf("%s[%d]: Poke post submit %p\n", __func__, lc->lc_n, lc->bt_psem_out);
++#endif
++ sem_post(lc->bt_psem_out);
++ if (loop_n > 1) {
++#if TRACE_WPP
++ printf("%s[%d]: Poke post submit2 %p\n", __func__, lc->lc_n, lc->bt_psem_out);
++#endif
++ sem_post(lc->bt_psem_out);
++ }
++ }
++ else
++ {
++ movlc(s->HEVClcList[0], lc, s->ps.pps->dependent_slice_segments_enabled_flag);
++
++ // When all done poke the thread 0 sem_in one final time
++#if TRACE_WPP
++ printf("%s[%d]: Poke final %p\n", __func__, lc->lc_n, &s->HEVClcList[0]->bt_sem_in);
++#endif
++ sem_post(&s->HEVClcList[0]->bt_sem_in);
++ }
++
++#if TRACE_WPP
++ printf("%s[%d]: End. dep=%d\n", __func__, lc->lc_n, s->ps.pps->dependent_slice_segments_enabled_flag);
++#endif
++ return err;
++}
++
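++// Hand one WPP CTB line to each bit thread local context, interleaved with
++// stride RPI_BIT_THREADS, starting at the slice segment address.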
++static void wpp_setup_lcs(HEVCRpiContext * const s)
++{
++ unsigned int ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
++ const unsigned int line_width = line_ts_width(s, ts);
++
++ for (int i = 0; i <= s->sh.num_entry_point_offsets && i < RPI_BIT_THREADS; ++i)
++ {
++ HEVCRpiLocalContext * const lc = s->HEVClcList[i];
++ lc->ts = ts;
++ lc->bt_is_tile = 0;
++ lc->bt_line_no = i;
++ lc->bt_line_width = line_width;
++ lc->bt_last_line = s->sh.num_entry_point_offsets;
++ lc->bt_line_inc = RPI_BIT_THREADS;
++ ts += line_width;
++ }
++}
++
++
++// Can only process tile single row at once
++static void tile_one_row_setup_lcs(HEVCRpiContext * const s, unsigned int slice_row)
++{
++ const HEVCRpiPPS * const pps = s->ps.pps;
++ const unsigned int ts0 = pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
++ const unsigned int tile0 = pps->tile_id[ts0];
++ const unsigned int col0 = tile0 % pps->num_tile_columns;
++
++ const unsigned int col = (slice_row == 0) ? col0 : 0;
++ unsigned int line = slice_row * pps->num_tile_columns - col0 + col;
++ const unsigned int last_line = FFMIN(
++ line + pps->num_tile_columns - 1 - col, s->sh.num_entry_point_offsets);
++
++ const unsigned int par =
++ FFMIN(RPI_BIT_THREADS, last_line + 1 - line);
++#if TRACE_WPP
++ printf("ts0=%d, ents=%d, row=%d, tiles=%dx%d, col=%d, par=%d, line=%d/%d\n", ts0, s->sh.num_entry_point_offsets, slice_row,
++ pps->num_tile_columns, pps->num_tile_rows, col, par, line, last_line);
++#endif
++ for (unsigned int i = 0; i != par; ++i, ++line)
++ {
++ HEVCRpiLocalContext * const lc = s->HEVClcList[i];
++ const unsigned int tile = tile0 + line;
++
++ lc->ts = pps->ctb_addr_rs_to_ts[pps->tile_pos_rs[tile]];
++ lc->bt_line_no = line;
++ lc->bt_is_tile = 1;
++ lc->bt_line_width = line_ts_width(s, lc->ts);
++ lc->bt_last_line = last_line;
++ lc->bt_line_inc = par;
++ }
++}
++
++
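++// Aux bit thread main loop: sleep on bt_sem_in until poked, decode one
++// WPP/tile line, repeat until told to terminate.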
++static void * bit_thread(void * v)
++{
++ HEVCRpiLocalContext * const lc = v;
++ HEVCRpiContext *const s = lc->context;
++
++ while (wait_bt_sem_in(lc) == 0)
++ {
++ int err;
++
++ if ((err = rpi_run_one_line(s, lc, 0)) < 0) { // Never first tile/wpp
++ if (lc->bt_terminate) {
++ av_log(s, AV_LOG_ERROR, "%s: Unexpected termination\n", __func__);
++ break;
++ }
++ av_log(s, AV_LOG_WARNING, "%s: Decode failure: %d\n", __func__, err);
++ }
++ }
++
++ return NULL;
++}
++
++static int bit_threads_start(HEVCRpiContext * const s)
++{
++ if (s->bt_started)
++ return 0;
++
++ for (int i = 1; i < RPI_BIT_THREADS; ++i)
++ {
++ // lc[0] belongs to the main thread - this sets up lc[1..RPI_BIT_THREADS]
++ if (s->HEVClcList[i] == NULL) {
++ if ((s->HEVClcList[i] = av_mallocz(sizeof(*s->HEVClcList[0]))) == NULL)
++ return -1;
++ }
++
++ bt_lc_init(s, s->HEVClcList[i], i);
++ job_lc_init(s->HEVClcList[i]);
++ }
++
++ // Link the sems in a circle
++ for (int i = 0; i < RPI_BIT_THREADS - 1; ++i)
++ s->HEVClcList[i]->bt_psem_out = &s->HEVClcList[i + 1]->bt_sem_in;
++ s->HEVClcList[RPI_BIT_THREADS - 1]->bt_psem_out = &s->HEVClcList[0]->bt_sem_in;
++
++ // Init all lc before starting any threads
++ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i)
++ {
++ if (pthread_create(s->bit_threads + i, NULL, bit_thread, s->HEVClcList[i + 1]) < 0)
++ return -1;
++ }
++
++ s->bt_started = 1;
++ return 0;
++}
++
++static int bit_threads_kill(HEVCRpiContext * const s)
++{
++ if (!s->bt_started)
++ return 0;
++ s->bt_started = 0;
++
++ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i)
++ {
++ HEVCRpiLocalContext *const lc = s->HEVClcList[i + 1];
++ if (lc == NULL)
++ break;
++
++ lc->bt_terminate = 1;
++ sem_post(&lc->bt_sem_in);
++ pthread_join(s->bit_threads[i], NULL);
++
++ sem_destroy(&lc->bt_sem_in);
++ job_lc_kill(lc);
++ }
++ return 0;
++}
++#endif
++
++
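++// Slice decode entry point (run on the main decode thread). Chooses between
++// the multi-row tile path, the WPP path and the plain single-threaded path,
++// then waits for the worker to drain at end of frame. Returns the final CTB
++// TS address on success or < 0 on error.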
++static int rpi_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++{
++ HEVCRpiContext * const s = avctxt->priv_data;
++ HEVCRpiLocalContext * const lc = s->HEVClc;
++ int err;
++
++ // Start of slice
++ if ((err = slice_start(s, lc)) != 0)
++ return err;
++
++#if RPI_EXTRA_BIT_THREADS > 0
++
++ if (s->sh.num_entry_point_offsets != 0 &&
++ s->ps.pps->num_tile_columns > 1)
++ {
++ unsigned int slice_row = 0;
++
++#if TRACE_WPP
++ printf("%s: Do Tiles\n", __func__);
++#endif
++ // Generate & start extra bit threads if they aren't already running
++ bit_threads_start(s);
++
++ do
++ {
++ // Reset lc lines etc.
++ tile_one_row_setup_lcs(s, slice_row);
++
++#if TRACE_WPP
++ printf("%s: Row %d: Do 1st: line=%d/%d/%d\n",
++ __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
++#endif
++
++ rpi_run_one_line(s, lc, 1); // Kicks off the other threads
++#if TRACE_WPP
++ printf("%s: Row %d: Done 1st: line=%d/%d/%d\n",
++ __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
++#endif
++
++ while (lc->bt_line_no <= lc->bt_last_line) {
++ rpi_sem_wait(&lc->bt_sem_in);
++ rpi_run_one_line(s, lc, 0);
++ }
++#if TRACE_WPP
++ printf("%s: Done body\n", __func__);
++#endif
++
++ // Wait for everything else to finish
++ rpi_sem_wait(&lc->bt_sem_in);
++
++ ++slice_row;
++ } while (lc->bt_last_line < s->sh.num_entry_point_offsets);
++
++
++#if TRACE_WPP
++ printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
++#endif
++ }
++ else
++
++ // * We only cope with WPP in a single column
++ // Probably want to deal with that case as tiles rather than WPP anyway
++ // ?? Not actually sure that the main code deals with WPP + multi-col correctly
++ if (s->ps.pps->entropy_coding_sync_enabled_flag &&
++ s->ps.pps->num_tile_columns == 1 &&
++ s->sh.num_entry_point_offsets != 0)
++ {
++#if TRACE_WPP
++ printf("%s: Do WPP\n", __func__);
++#endif
++ // Generate & start extra bit threads if they aren't already running
++ bit_threads_start(s);
++
++ // Reset lc lines etc.
++ wpp_setup_lcs(s);
++
++ rpi_run_one_line(s, lc, 1); // Kicks off the other threads
++#if TRACE_WPP
++ printf("%s: Done 1st\n", __func__);
++#endif
++
++ while (lc->bt_line_no <= s->sh.num_entry_point_offsets) {
++ rpi_sem_wait(&lc->bt_sem_in);
++ rpi_run_one_line(s, lc, 0);
++ }
++#if TRACE_WPP
++ printf("%s: Done body\n", __func__);
++#endif
++
++ // Wait for everything else to finish
++ rpi_sem_wait(&lc->bt_sem_in);
++
++#if TRACE_WPP
++ printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
++#endif
++ }
++ else
++#endif
++ {
++#if TRACE_WPP
++ printf("%s: Single start: ts=%d\n", __func__, lc->ts);
++#endif
++ // Single bit thread
++ do {
++ // Make sure we have space to prepare the next job
++ worker_pass0_ready(s, lc);
++
++ if ((err = fill_job(s, lc, 0)) < 0)
++ goto fail;
++
++ worker_submit_job(s, lc);
++ } while (!lc->unit_done);
++
++#if TRACE_WPP
++ printf("%s: Single end: ts=%d\n", __func__, lc->ts);
++#endif
++ }
++
++ // If we have reached the end of the frame then wait for the worker to finish all its jobs
++ if (lc->ts >= s->ps.sps->ctb_size) {
++ worker_wait(s, lc);
++ }
++
++#if RPI_TSTATS
++ {
++ HEVCRpiStats *const ts = &s->tstats;
++
++ printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n",
++ ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0,
++ ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge,
++ ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0,
++ ts->y_pred2_hgt16, ts->y_pred2_hle16);
++ memset(ts, 0, sizeof(*ts));
++ }
++#endif
++
++ return lc->ts;
++
++fail:
++ // Cleanup
++ av_log(s, AV_LOG_ERROR, "%s failed: err=%d\n", __func__, err);
++ // Free our job & wait for termination
++ worker_free(s, lc);
++ worker_wait(s, lc);
++ return err;
++}
++
++
++static int hls_slice_data(HEVCRpiContext * const s, const H2645NAL * const nal)
++{
++ int err;
++ if ((err = gen_entry_points(s, nal)) < 0)
++ return err;
++
++ return rpi_decode_entry(s->avctx, NULL);
++}
++
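++// Export SEI-derived side data to the output frame: stereo 3D frame packing,
++// display orientation, mastering display & content light level metadata,
++// A53 closed captions and alternative transfer characteristics.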
++static int set_side_data(HEVCRpiContext *s)
++{
++ AVFrame *out = s->ref->frame;
++
++ if (s->sei.frame_packing.present &&
++ s->sei.frame_packing.arrangement_type >= 3 &&
++ s->sei.frame_packing.arrangement_type <= 5 &&
++ s->sei.frame_packing.content_interpretation_type > 0 &&
++ s->sei.frame_packing.content_interpretation_type < 3) {
++ AVStereo3D *stereo = av_stereo3d_create_side_data(out);
++ if (!stereo)
++ return AVERROR(ENOMEM);
++
++ switch (s->sei.frame_packing.arrangement_type) {
++ case 3:
++ if (s->sei.frame_packing.quincunx_subsampling)
++ stereo->type = AV_STEREO3D_SIDEBYSIDE_QUINCUNX;
++ else
++ stereo->type = AV_STEREO3D_SIDEBYSIDE;
++ break;
++ case 4:
++ stereo->type = AV_STEREO3D_TOPBOTTOM;
++ break;
++ case 5:
++ stereo->type = AV_STEREO3D_FRAMESEQUENCE;
++ break;
++ }
++
++ if (s->sei.frame_packing.content_interpretation_type == 2)
++ stereo->flags = AV_STEREO3D_FLAG_INVERT;
++ }
++
++ if (s->sei.display_orientation.present &&
++ (s->sei.display_orientation.anticlockwise_rotation ||
++ s->sei.display_orientation.hflip || s->sei.display_orientation.vflip)) {
++ double angle = s->sei.display_orientation.anticlockwise_rotation * 360 / (double) (1 << 16);
++ AVFrameSideData *rotation = av_frame_new_side_data(out,
++ AV_FRAME_DATA_DISPLAYMATRIX,
++ sizeof(int32_t) * 9);
++ if (!rotation)
++ return AVERROR(ENOMEM);
++
++ av_display_rotation_set((int32_t *)rotation->data, angle);
++ av_display_matrix_flip((int32_t *)rotation->data,
++ s->sei.display_orientation.hflip,
++ s->sei.display_orientation.vflip);
++ }
++
++ // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1
++ // so the side data persists for the entire coded video sequence.
++ if (s->sei.mastering_display.present > 0 &&
++ IS_IRAP(s) && s->no_rasl_output_flag) {
++ s->sei.mastering_display.present--;
++ }
++ if (s->sei.mastering_display.present) {
++ // HEVC uses a g,b,r ordering, which we convert to a more natural r,g,b
++ const int mapping[3] = {2, 0, 1};
++ const int chroma_den = 50000;
++ const int luma_den = 10000;
++ int i;
++ AVMasteringDisplayMetadata *metadata =
++ av_mastering_display_metadata_create_side_data(out);
++ if (!metadata)
++ return AVERROR(ENOMEM);
++
++ for (i = 0; i < 3; i++) {
++ const int j = mapping[i];
++ metadata->display_primaries[i][0].num = s->sei.mastering_display.display_primaries[j][0];
++ metadata->display_primaries[i][0].den = chroma_den;
++ metadata->display_primaries[i][1].num = s->sei.mastering_display.display_primaries[j][1];
++ metadata->display_primaries[i][1].den = chroma_den;
++ }
++ metadata->white_point[0].num = s->sei.mastering_display.white_point[0];
++ metadata->white_point[0].den = chroma_den;
++ metadata->white_point[1].num = s->sei.mastering_display.white_point[1];
++ metadata->white_point[1].den = chroma_den;
++
++ metadata->max_luminance.num = s->sei.mastering_display.max_luminance;
++ metadata->max_luminance.den = luma_den;
++ metadata->min_luminance.num = s->sei.mastering_display.min_luminance;
++ metadata->min_luminance.den = luma_den;
++ metadata->has_luminance = 1;
++ metadata->has_primaries = 1;
++
++ av_log(s->avctx, AV_LOG_DEBUG, "Mastering Display Metadata:\n");
++ av_log(s->avctx, AV_LOG_DEBUG,
++ "r(%5.4f,%5.4f) g(%5.4f,%5.4f) b(%5.4f %5.4f) wp(%5.4f, %5.4f)\n",
++ av_q2d(metadata->display_primaries[0][0]),
++ av_q2d(metadata->display_primaries[0][1]),
++ av_q2d(metadata->display_primaries[1][0]),
++ av_q2d(metadata->display_primaries[1][1]),
++ av_q2d(metadata->display_primaries[2][0]),
++ av_q2d(metadata->display_primaries[2][1]),
++ av_q2d(metadata->white_point[0]), av_q2d(metadata->white_point[1]));
++ av_log(s->avctx, AV_LOG_DEBUG,
++ "min_luminance=%f, max_luminance=%f\n",
++ av_q2d(metadata->min_luminance), av_q2d(metadata->max_luminance));
++ }
++ // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1
++ // so the side data persists for the entire coded video sequence.
++ if (s->sei.content_light.present > 0 &&
++ IS_IRAP(s) && s->no_rasl_output_flag) {
++ s->sei.content_light.present--;
++ }
++ if (s->sei.content_light.present) {
++ AVContentLightMetadata *metadata =
++ av_content_light_metadata_create_side_data(out);
++ if (!metadata)
++ return AVERROR(ENOMEM);
++ metadata->MaxCLL = s->sei.content_light.max_content_light_level;
++ metadata->MaxFALL = s->sei.content_light.max_pic_average_light_level;
++
++ av_log(s->avctx, AV_LOG_DEBUG, "Content Light Level Metadata:\n");
++ av_log(s->avctx, AV_LOG_DEBUG, "MaxCLL=%d, MaxFALL=%d\n",
++ metadata->MaxCLL, metadata->MaxFALL);
++ }
++
++ if (s->sei.a53_caption.a53_caption) {
++ AVFrameSideData* sd = av_frame_new_side_data(out,
++ AV_FRAME_DATA_A53_CC,
++ s->sei.a53_caption.a53_caption_size);
++ if (sd)
++ memcpy(sd->data, s->sei.a53_caption.a53_caption, s->sei.a53_caption.a53_caption_size);
++ av_freep(&s->sei.a53_caption.a53_caption);
++ s->sei.a53_caption.a53_caption_size = 0;
++ s->avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
++ }
++
++ if (s->sei.alternative_transfer.present &&
++ av_color_transfer_name(s->sei.alternative_transfer.preferred_transfer_characteristics) &&
++ s->sei.alternative_transfer.preferred_transfer_characteristics != AVCOL_TRC_UNSPECIFIED) {
++ s->avctx->color_trc = out->color_trc = s->sei.alternative_transfer.preferred_transfer_characteristics;
++ }
++
++ return 0;
++}
++
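++// Start-of-frame setup: clear the per-picture maps, allocate the new ref
++// frame, build the RPS, attach side data, bump the DPB and fetch any frame
++// ready for output before signalling setup complete to the frame threads.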
++static int hevc_frame_start(HEVCRpiContext * const s)
++{
++ int pic_size_in_ctb = ((s->ps.sps->width >> s->ps.sps->log2_min_cb_size) + 1) *
++ ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1);
++ int ret;
++
++ memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
++ memset(s->vertical_bs, 0, s->bs_width * s->bs_height);
++ memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
++ memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
++ memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
++
++ s->is_decoded = 0;
++ s->first_nal_type = s->nal_unit_type;
++
++ s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos);
++
++ ret = ff_hevc_rpi_set_new_ref(s, &s->frame, s->poc);
++ if (ret < 0)
++ goto fail;
++
++ ret = ff_hevc_rpi_frame_rps(s);
++ if (ret < 0) {
++ av_log(s->avctx, AV_LOG_ERROR, "Error constructing the frame RPS.\n");
++ goto fail;
++ }
++
++ s->ref->frame->key_frame = IS_IRAP(s);
++
++ ret = set_side_data(s);
++ if (ret < 0)
++ goto fail;
++
++ s->frame->pict_type = 3 - s->sh.slice_type;
++
++ if (!IS_IRAP(s))
++ ff_hevc_rpi_bump_frame(s);
++
++ av_frame_unref(s->output_frame);
++ ret = ff_hevc_rpi_output_frame(s, s->output_frame, 0);
++ if (ret < 0)
++ goto fail;
++
++ ff_thread_finish_setup(s->avctx);
++
++ return 0;
++
++fail:
++ if (s->ref)
++ ff_hevc_rpi_unref_frame(s, s->ref, ~0);
++ s->ref = NULL;
++ return ret;
++}
++
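++// Dispatch a single NAL unit: VPS/SPS/PPS/SEI are parsed in place; VCL NALs
++// run the slice header, frame start (for the first slice) and slice data
++// decode. Most errors are suppressed unless AV_EF_EXPLODE is set.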
++static int decode_nal_unit(HEVCRpiContext *s, const H2645NAL *nal)
++{
++ GetBitContext * const gb = &s->HEVClc->gb;
++ int ctb_addr_ts, ret;
++
++ *gb = nal->gb;
++ s->nal_unit_type = nal->type;
++ s->temporal_id = nal->temporal_id;
++
++ switch (s->nal_unit_type) {
++ case HEVC_NAL_VPS:
++ ret = ff_hevc_rpi_decode_nal_vps(gb, s->avctx, &s->ps);
++ if (ret < 0)
++ goto fail;
++ break;
++ case HEVC_NAL_SPS:
++ ret = ff_hevc_rpi_decode_nal_sps(gb, s->avctx, &s->ps,
++ s->apply_defdispwin);
++ if (ret < 0)
++ goto fail;
++ break;
++ case HEVC_NAL_PPS:
++ ret = ff_hevc_rpi_decode_nal_pps(gb, s->avctx, &s->ps);
++ if (ret < 0)
++ goto fail;
++ break;
++ case HEVC_NAL_SEI_PREFIX:
++ case HEVC_NAL_SEI_SUFFIX:
++ ret = ff_hevc_rpi_decode_nal_sei(gb, s->avctx, &s->sei, &s->ps, s->nal_unit_type);
++ if (ret < 0)
++ goto fail;
++ break;
++ case HEVC_NAL_TRAIL_R:
++ case HEVC_NAL_TRAIL_N:
++ case HEVC_NAL_TSA_N:
++ case HEVC_NAL_TSA_R:
++ case HEVC_NAL_STSA_N:
++ case HEVC_NAL_STSA_R:
++ case HEVC_NAL_BLA_W_LP:
++ case HEVC_NAL_BLA_W_RADL:
++ case HEVC_NAL_BLA_N_LP:
++ case HEVC_NAL_IDR_W_RADL:
++ case HEVC_NAL_IDR_N_LP:
++ case HEVC_NAL_CRA_NUT:
++ case HEVC_NAL_RADL_N:
++ case HEVC_NAL_RADL_R:
++ case HEVC_NAL_RASL_N:
++ case HEVC_NAL_RASL_R:
++ ret = hls_slice_header(s);
++ if (ret < 0)
++ return ret;
++
++ // The definition of _N unit types is "non-reference for other frames
++ // with the same temporal_id" so they may/will be ref frames for pics
++ // with a higher temporal_id.
++ s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 ||
++ !(s->nal_unit_type == HEVC_NAL_TRAIL_N ||
++ s->nal_unit_type == HEVC_NAL_TSA_N ||
++ s->nal_unit_type == HEVC_NAL_STSA_N ||
++ s->nal_unit_type == HEVC_NAL_RADL_N ||
++ s->nal_unit_type == HEVC_NAL_RASL_N);
++ s->offload_recon = s->used_for_ref;
++// s->offload_recon = 0;
++
++#if DEBUG_DECODE_N
++ {
++ static int z = 0;
++ if (IS_IDR(s)) {
++ z = 1;
++ }
++ if (z != 0 && z++ > DEBUG_DECODE_N) {
++ s->is_decoded = 0;
++ break;
++ }
++ }
++#endif
++ if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) {
++ s->is_decoded = 0;
++ break;
++ }
++
++ if (s->sh.first_slice_in_pic_flag) {
++ if (s->max_ra == INT_MAX) {
++ if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) {
++ s->max_ra = s->poc;
++ } else {
++ if (IS_IDR(s))
++ s->max_ra = INT_MIN;
++ }
++ }
++
++ if ((s->nal_unit_type == HEVC_NAL_RASL_R || s->nal_unit_type == HEVC_NAL_RASL_N) &&
++ s->poc <= s->max_ra) {
++ s->is_decoded = 0;
++ break;
++ } else {
++ if (s->nal_unit_type == HEVC_NAL_RASL_R && s->poc > s->max_ra)
++ s->max_ra = INT_MIN;
++ }
++
++ ret = hevc_frame_start(s);
++ if (ret < 0)
++ return ret;
++ } else if (!s->ref) {
++ av_log(s->avctx, AV_LOG_ERROR, "First slice in a frame missing.\n");
++ goto fail;
++ }
++
++ if (s->nal_unit_type != s->first_nal_type) {
++ av_log(s->avctx, AV_LOG_ERROR,
++ "Non-matching NAL types of the VCL NALUs: %d %d\n",
++ s->first_nal_type, s->nal_unit_type);
++ return AVERROR_INVALIDDATA;
++ }
++
++ if (!s->sh.dependent_slice_segment_flag &&
++ s->sh.slice_type != HEVC_SLICE_I) {
++ ret = ff_hevc_rpi_slice_rpl(s);
++ if (ret < 0) {
++ av_log(s->avctx, AV_LOG_WARNING,
++ "Error constructing the reference lists for the current slice.\n");
++ goto fail;
++ }
++ }
++
++ ctb_addr_ts = hls_slice_data(s, nal);
++ if (ctb_addr_ts >= (s->ps.sps->ctb_width * s->ps.sps->ctb_height)) {
++ s->is_decoded = 1;
++ }
++
++ if (ctb_addr_ts < 0) {
++ ret = ctb_addr_ts;
++ goto fail;
++ }
++ break;
++ case HEVC_NAL_EOS_NUT:
++ case HEVC_NAL_EOB_NUT:
++ s->seq_decode = (s->seq_decode + 1) & 0xff;
++ s->max_ra = INT_MAX;
++ break;
++ case HEVC_NAL_AUD:
++ case HEVC_NAL_FD_NUT:
++ break;
++ default:
++ av_log(s->avctx, AV_LOG_INFO,
++ "Skipping NAL unit %d\n", s->nal_unit_type);
++ }
++
++ return 0;
++fail:
++ if (s->avctx->err_recognition & AV_EF_EXPLODE)
++ return ret;
++ return 0;
++}
++
++static int decode_nal_units(HEVCRpiContext *s, const uint8_t *buf, int length)
++{
++ int i, ret = 0;
++ int eos_at_start = 1;
++
++ s->ref = NULL;
++ s->last_eos = s->eos;
++ s->eos = 0;
++
++ /* split the input packet into NAL units, so we know the upper bound on the
++ * number of slices in the frame */
++ ret = ff_h2645_packet_split(&s->pkt, buf, length, s->avctx, s->is_nalff,
++ s->nal_length_size, s->avctx->codec_id, 1);
++ if (ret < 0) {
++ av_log(s->avctx, AV_LOG_ERROR,
++ "Error splitting the input into NAL units.\n");
++ return ret;
++ }
++
++ for (i = 0; i < s->pkt.nb_nals; i++) {
++ if (s->pkt.nals[i].type == HEVC_NAL_EOB_NUT ||
++ s->pkt.nals[i].type == HEVC_NAL_EOS_NUT) {
++ if (eos_at_start) {
++ s->last_eos = 1;
++ } else {
++ s->eos = 1;
++ }
++ } else {
++ eos_at_start = 0;
++ }
++ }
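++ // An EOS/EOB seen before any VCL NAL terminates the *previous*
++ // sequence (last_eos); one seen later in the packet ends this one (eos).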
++
++ /* decode the NAL units */
++ for (i = 0; i < s->pkt.nb_nals; i++) {
++ ret = decode_nal_unit(s, &s->pkt.nals[i]);
++ if (ret < 0) {
++ av_log(s->avctx, AV_LOG_WARNING,
++ "Error parsing NAL unit #%d.\n", i);
++ goto fail;
++ }
++ }
++
++fail: // Also success path
++ if (s->ref != NULL) {
++ if (s->used_for_ref && s->threads_type == FF_THREAD_FRAME) {
++ ff_hevc_rpi_progress_signal_all_done(s);
++ }
++ else {
++ // Flush frame to real memory as we expect to be able to pass
++ // it straight on to mmal
++ flush_frame(s, s->frame);
++ }
++ }
++ return ret;
++}
++
++static void print_md5(void *log_ctx, int level, uint8_t md5[16])
++{
++ int i;
++ for (i = 0; i < 16; i++)
++ av_log(log_ctx, level, "%02"PRIx8, md5[i]);
++}
++
++static int verify_md5(HEVCRpiContext *s, AVFrame *frame)
++{
++ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
++ int pixel_shift;
++ int i, j;
++
++ if (!desc)
++ return AVERROR(EINVAL);
++
++ pixel_shift = desc->comp[0].depth > 8;
++
++ av_log(s->avctx, AV_LOG_DEBUG, "Verifying checksum for frame with POC %d: ",
++ s->poc);
++
++ /* the checksums are LE, so we have to byteswap for >8bpp formats
++ * on BE arches */
++#if HAVE_BIGENDIAN
++ if (pixel_shift && !s->checksum_buf) {
++ av_fast_malloc(&s->checksum_buf, &s->checksum_buf_size,
++ FFMAX3(frame->linesize[0], frame->linesize[1],
++ frame->linesize[2]));
++ if (!s->checksum_buf)
++ return AVERROR(ENOMEM);
++ }
++#endif
++
++ for (i = 0; frame->data[i]; i++) {
++ int width = s->avctx->coded_width;
++ int height = s->avctx->coded_height;
++ int w = (i == 1 || i == 2) ? (width >> desc->log2_chroma_w) : width;
++ int h = (i == 1 || i == 2) ? (height >> desc->log2_chroma_h) : height;
++ uint8_t md5[16];
++
++ av_md5_init(s->sei.picture_hash.md5_ctx);
++ for (j = 0; j < h; j++) {
++ const uint8_t *src = frame->data[i] + j * frame_stride1(frame, 1);
++#if HAVE_BIGENDIAN
++ if (pixel_shift) {
++ s->bdsp.bswap16_buf((uint16_t *) s->checksum_buf,
++ (const uint16_t *) src, w);
++ src = s->checksum_buf;
++ }
++#endif
++ av_md5_update(s->sei.picture_hash.md5_ctx, src, w << pixel_shift);
++ }
++ av_md5_final(s->sei.picture_hash.md5_ctx, md5);
++
++ if (!memcmp(md5, s->sei.picture_hash.md5[i], 16)) {
++ av_log (s->avctx, AV_LOG_DEBUG, "plane %d - correct ", i);
++ print_md5(s->avctx, AV_LOG_DEBUG, md5);
++ av_log (s->avctx, AV_LOG_DEBUG, "; ");
++ } else {
++ av_log (s->avctx, AV_LOG_ERROR, "mismatching checksum of plane %d - ", i);
++ print_md5(s->avctx, AV_LOG_ERROR, md5);
++ av_log (s->avctx, AV_LOG_ERROR, " != ");
++ print_md5(s->avctx, AV_LOG_ERROR, s->sei.picture_hash.md5[i]);
++ av_log (s->avctx, AV_LOG_ERROR, "\n");
++ return AVERROR_INVALIDDATA;
++ }
++ }
++
++ av_log(s->avctx, AV_LOG_DEBUG, "\n");
++
++ return 0;
++}
++
++static int all_sps_supported(const HEVCRpiContext * const s)
++{
++ for (unsigned int i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
++ if (s->ps.sps_list[i] != NULL)
++ {
++ const HEVCRpiSPS * const sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data;
++ if (!is_sps_supported(sps))
++ return 0;
++ }
++ }
++ return 1;
++}
++
++static int hevc_rpi_decode_extradata(HEVCRpiContext *s, uint8_t *buf, int length, int first)
++{
++ int ret, i;
++
++ ret = ff_hevc_rpi_decode_extradata(buf, length, &s->ps, &s->sei, &s->is_nalff,
++ &s->nal_length_size, s->avctx->err_recognition,
++ s->apply_defdispwin, s->avctx);
++ if (ret < 0)
++ return ret;
++
++ /* export stream parameters from the first SPS */
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
++ if (first && s->ps.sps_list[i]) {
++ const HEVCRpiSPS *sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data;
++ export_stream_params(s->avctx, &s->ps, sps);
++ break;
++ }
++ }
++
++ return 0;
++}
++
++static int hevc_rpi_decode_frame(AVCodecContext *avctx, void *data, int *got_output,
++ AVPacket *avpkt)
++{
++ int ret;
++ int new_extradata_size;
++ uint8_t *new_extradata;
++ HEVCRpiContext *s = avctx->priv_data;
++
++ if (!avpkt->size) {
++ ret = ff_hevc_rpi_output_frame(s, data, 1);
++ if (ret < 0)
++ return ret;
++
++ *got_output = ret;
++ return 0;
++ }
++
++ new_extradata = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA,
++ &new_extradata_size);
++ if (new_extradata && new_extradata_size > 0) {
++ ret = hevc_rpi_decode_extradata(s, new_extradata, new_extradata_size, 0);
++ if (ret < 0)
++ return ret;
++ }
++
++ s->ref = NULL;
++ ret = decode_nal_units(s, avpkt->data, avpkt->size);
++ if (ret < 0)
++ return ret;
++
++ /* verify the SEI checksum */
++ if (avctx->err_recognition & AV_EF_CRCCHECK && s->is_decoded &&
++ s->sei.picture_hash.is_md5) {
++ ret = verify_md5(s, s->ref->frame);
++ if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE) {
++ ff_hevc_rpi_unref_frame(s, s->ref, ~0);
++ return ret;
++ }
++ }
++ s->sei.picture_hash.is_md5 = 0;
++
++ if (s->is_decoded) {
++ av_log(avctx, AV_LOG_DEBUG, "Decoded frame with POC %d.\n", s->poc);
++ s->is_decoded = 0;
++ }
++
++ if (s->output_frame->buf[0]) {
++ av_frame_move_ref(data, s->output_frame);
++ *got_output = 1;
++ }
++
++ return avpkt->size;
++}
++
++static int hevc_ref_frame(HEVCRpiContext *s, HEVCFrame *dst, HEVCFrame *src)
++{
++ int ret;
++
++ ret = ff_thread_ref_frame(&dst->tf, &src->tf);
++ if (ret < 0)
++ return ret;
++
++ dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
++ if (!dst->tab_mvf_buf)
++ goto fail;
++ dst->tab_mvf = src->tab_mvf;
++
++ dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
++ if (!dst->rpl_tab_buf)
++ goto fail;
++ dst->rpl_tab = src->rpl_tab;
++
++ dst->rpl_buf = av_buffer_ref(src->rpl_buf);
++ if (!dst->rpl_buf)
++ goto fail;
++
++ dst->poc = src->poc;
++ dst->ctb_count = src->ctb_count;
++ dst->flags = src->flags;
++ dst->sequence = src->sequence;
++ return 0;
++
++fail:
++ ff_hevc_rpi_unref_frame(s, dst, ~0);
++ return AVERROR(ENOMEM);
++}
++
++
++static av_cold int hevc_decode_free(AVCodecContext *avctx)
++{
++ HEVCRpiContext * const s = avctx->priv_data;
++ int i;
++
++ pic_arrays_free(s);
++
++ av_freep(&s->sei.picture_hash.md5_ctx);
++
++ av_freep(&s->cabac_state);
++
++#if RPI_EXTRA_BIT_THREADS
++ bit_threads_kill(s);
++#endif
++
++ hevc_exit_worker(s);
++ vpu_qpu_term();
++ for (i = 0; i != 2; ++i) {
++ ff_hevc_rpi_progress_kill_state(s->progress_states + i);
++ }
++ job_lc_kill(s->HEVClc);
++ av_rpi_zc_uninit(avctx);
++
++ av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0]
++ av_freep(&s->sao_pixel_buffer_v[0]);
++ av_frame_free(&s->output_frame);
++
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
++ av_frame_free(&s->DPB[i].frame);
++ }
++
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++)
++ av_buffer_unref(&s->ps.vps_list[i]);
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++)
++ av_buffer_unref(&s->ps.sps_list[i]);
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++)
++ av_buffer_unref(&s->ps.pps_list[i]);
++ s->ps.sps = NULL;
++ s->ps.pps = NULL;
++ s->ps.vps = NULL;
++
++ av_freep(&s->sh.entry_point_offset);
++ av_freep(&s->sh.offset);
++ av_freep(&s->sh.size);
++
++ for (i = 1; i < s->threads_number; i++) {
++ if (s->sList[i] != NULL) {
++ av_freep(&s->sList[i]);
++ }
++ }
++
++ // Free HEVClcList entries separately from sList as RPI WPP uses them that way
++ for (i = 0; i < MAX_NB_THREADS && s->HEVClcList[i] != NULL; ++i) {
++ av_freep(s->HEVClcList + i);
++ }
++ s->HEVClc = NULL; // Allocated as part of HEVClcList
++
++ ff_h2645_packet_uninit(&s->pkt);
++
++ return 0;
++}
++
++
++static av_cold int hevc_init_context(AVCodecContext *avctx)
++{
++ HEVCRpiContext *s = avctx->priv_data;
++ int i;
++
++ s->avctx = avctx;
++
++ s->HEVClc = av_mallocz(sizeof(HEVCRpiLocalContext));
++ if (!s->HEVClc)
++ goto fail;
++ s->HEVClcList[0] = s->HEVClc;
++ s->sList[0] = s;
++
++ // Whilst FFmpeg's init fn is only called once, the close fn is called as
++ // many times as we have threads (init_thread_copy is called for the
++ // threads). So to match init & term, put the init here where it will be
++ // called by both init & copy.
++ av_rpi_zc_init(avctx);
++
++ if (vpu_qpu_init() != 0)
++ goto fail;
++
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++ {
++ static const uint32_t dframe[1] = {0x80808080};
++ s->qpu_dummy_frame_emu = (const uint8_t *)dframe;
++ }
++#endif
++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
++ s->qpu_dummy_frame_qpu = qpu_fn(mc_start); // Use our code as a dummy frame
++#endif
++
++ bt_lc_init(s, s->HEVClc, 0);
++ job_lc_init(s->HEVClc);
++
++ for (i = 0; i != 2; ++i) {
++ ff_hevc_rpi_progress_init_state(s->progress_states + i);
++ }
++
++ s->cabac_state = av_malloc(HEVC_CONTEXTS);
++ if (!s->cabac_state)
++ goto fail;
++
++ s->output_frame = av_frame_alloc();
++ if (!s->output_frame)
++ goto fail;
++
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++ s->DPB[i].frame = av_frame_alloc();
++ if (!s->DPB[i].frame)
++ goto fail;
++ s->DPB[i].tf.f = s->DPB[i].frame;
++ s->DPB[i].dpb_no = i;
++ }
++
++ s->max_ra = INT_MAX;
++
++ s->sei.picture_hash.md5_ctx = av_md5_alloc();
++ if (!s->sei.picture_hash.md5_ctx)
++ goto fail;
++
++ ff_bswapdsp_init(&s->bdsp);
++
++ s->context_initialized = 1;
++ s->eos = 0;
++
++ ff_hevc_rpi_reset_sei(&s->sei);
++
++ return 0;
++
++fail:
++ av_log(s, AV_LOG_ERROR, "%s: Failed\n", __func__);
++ hevc_decode_free(avctx);
++ return AVERROR(ENOMEM);
++}
++
++static int hevc_update_thread_context(AVCodecContext *dst,
++ const AVCodecContext *src)
++{
++ HEVCRpiContext *s = dst->priv_data;
++ HEVCRpiContext *s0 = src->priv_data;
++ int i, ret;
++
++ if (!s->context_initialized) {
++ ret = hevc_init_context(dst);
++ if (ret < 0)
++ return ret;
++ }
++
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
++ if (s0->DPB[i].frame->buf[0]) {
++ ret = hevc_ref_frame(s, &s->DPB[i], &s0->DPB[i]);
++ if (ret < 0)
++ return ret;
++ }
++ }
++
++ if (s->ps.sps != s0->ps.sps)
++ s->ps.sps = NULL;
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) {
++ av_buffer_unref(&s->ps.vps_list[i]);
++ if (s0->ps.vps_list[i]) {
++ s->ps.vps_list[i] = av_buffer_ref(s0->ps.vps_list[i]);
++ if (!s->ps.vps_list[i])
++ return AVERROR(ENOMEM);
++ }
++ }
++
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
++ av_buffer_unref(&s->ps.sps_list[i]);
++ if (s0->ps.sps_list[i]) {
++ s->ps.sps_list[i] = av_buffer_ref(s0->ps.sps_list[i]);
++ if (!s->ps.sps_list[i])
++ return AVERROR(ENOMEM);
++ }
++ }
++
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) {
++ av_buffer_unref(&s->ps.pps_list[i]);
++ if (s0->ps.pps_list[i]) {
++ s->ps.pps_list[i] = av_buffer_ref(s0->ps.pps_list[i]);
++ if (!s->ps.pps_list[i])
++ return AVERROR(ENOMEM);
++ }
++ }
++
++ if (s->ps.sps != s0->ps.sps)
++ if ((ret = set_sps(s, s0->ps.sps, src->pix_fmt)) < 0)
++ return ret;
++
++ s->seq_decode = s0->seq_decode;
++ s->seq_output = s0->seq_output;
++ s->pocTid0 = s0->pocTid0;
++ s->max_ra = s0->max_ra;
++ s->eos = s0->eos;
++ s->no_rasl_output_flag = s0->no_rasl_output_flag;
++
++ s->is_nalff = s0->is_nalff;
++ s->nal_length_size = s0->nal_length_size;
++
++ s->threads_number = s0->threads_number;
++ s->threads_type = s0->threads_type;
++
++ if (s0->eos) {
++ s->seq_decode = (s->seq_decode + 1) & 0xff;
++ s->max_ra = INT_MAX;
++ }
++
++ s->sei.frame_packing = s0->sei.frame_packing;
++ s->sei.display_orientation = s0->sei.display_orientation;
++ s->sei.mastering_display = s0->sei.mastering_display;
++ s->sei.content_light = s0->sei.content_light;
++ s->sei.alternative_transfer = s0->sei.alternative_transfer;
++
++ // * We do this here as it allows us to easily locate our parent's
++ // global job pool, but there really should be a less nasty way
++ if (s->jbc == NULL)
++ {
++ av_assert0((s->jbc = rpi_job_ctl_new(s0->jbc->jbg)) != NULL);
++ hevc_init_worker(s);
++ }
++
++ return 0;
++}
++
++static av_cold int hevc_decode_init(AVCodecContext *avctx)
++{
++ HEVCRpiContext *s = avctx->priv_data;
++ int ret;
++
++ avctx->internal->allocate_progress = 1;
++
++ {
++ HEVCRpiJobGlobal * const jbg = jbg_new(FFMAX(avctx->thread_count * 3, 5));
++ if (jbg == NULL)
++ {
++ av_log(s, AV_LOG_ERROR, "%s: Job global init failed\n", __func__);
++ return -1;
++ }
++
++ if ((s->jbc = rpi_job_ctl_new(jbg)) == NULL)
++ {
++ av_log(s, AV_LOG_ERROR, "%s: Job ctl init failed\n", __func__);
++ return -1;
++ }
++ }
++
++ ret = hevc_init_context(avctx);
++ if (ret < 0)
++ return ret;
++
++ hevc_init_worker(s);
++
++ s->enable_parallel_tiles = 0;
++ s->sei.picture_timing.picture_struct = 0;
++ s->eos = 1;
++
++ atomic_init(&s->wpp_err, 0);
++
++ if(avctx->active_thread_type & FF_THREAD_SLICE)
++ s->threads_number = avctx->thread_count;
++ else
++ s->threads_number = 1;
++
++ if (avctx->extradata_size > 0 && avctx->extradata) {
++ ret = hevc_rpi_decode_extradata(s, avctx->extradata, avctx->extradata_size, 1);
++
++ if (ret == 0 && !all_sps_supported(s))
++ ret = AVERROR_DECODER_NOT_FOUND;
++
++ if (ret < 0)
++ {
++ hevc_decode_free(avctx);
++ return ret;
++ }
++ }
++
++ if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1)
++ s->threads_type = FF_THREAD_FRAME;
++ else
++ s->threads_type = FF_THREAD_SLICE;
++
++ return 0;
++}
++
++static av_cold int hevc_init_thread_copy(AVCodecContext *avctx)
++{
++ HEVCRpiContext *s = avctx->priv_data;
++ int ret;
++
++ memset(s, 0, sizeof(*s));
++
++ ret = hevc_init_context(avctx);
++ if (ret < 0)
++ return ret;
++
++ return 0;
++}
++
++static void hevc_decode_flush(AVCodecContext *avctx)
++{
++ HEVCRpiContext *s = avctx->priv_data;
++ ff_hevc_rpi_flush_dpb(s);
++ s->max_ra = INT_MAX;
++ s->eos = 1;
++}
++
++#define OFFSET(x) offsetof(HEVCRpiContext, x)
++#define PAR (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
++
++
++static const AVOption options[] = {
++ { "apply_defdispwin", "Apply default display window from VUI", OFFSET(apply_defdispwin),
++ AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
++ { "strict-displaywin", "stricly apply default display window size", OFFSET(apply_defdispwin),
++ AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
++ { NULL },
++};
++
++static const AVClass hevc_rpi_decoder_class = {
++ .class_name = "HEVC RPI decoder",
++ .item_name = av_default_item_name,
++ .option = options,
++ .version = LIBAVUTIL_VERSION_INT,
++};
++
++static const enum AVPixelFormat hevc_rpi_pix_fmts[] = {
++ AV_PIX_FMT_SAND128,
++ AV_PIX_FMT_SAND64_10,
++ AV_PIX_FMT_NONE
++};
++
++AVCodec ff_hevc_rpi_decoder = {
++ .name = "hevc_rpi",
++ .long_name = NULL_IF_CONFIG_SMALL("HEVC (rpi)"),
++ .type = AVMEDIA_TYPE_VIDEO,
++ .id = AV_CODEC_ID_HEVC,
++ .priv_data_size = sizeof(HEVCRpiContext),
++ .priv_class = &hevc_rpi_decoder_class,
++ .init = hevc_decode_init,
++ .close = hevc_decode_free,
++ .decode = hevc_rpi_decode_frame,
++ .flush = hevc_decode_flush,
++ .update_thread_context = hevc_update_thread_context,
++ .init_thread_copy = hevc_init_thread_copy,
++ .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
++#if 0
++ // Debugging is often easier without threads getting in the way
++ 0,
++#warning H265 threading turned off
++#else
++ // We only have decent optimisation for frame - so only admit to that
++ AV_CODEC_CAP_FRAME_THREADS,
++#endif
++ .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE | FF_CODEC_CAP_EXPORTS_CROPPING,
++ .pix_fmts = hevc_rpi_pix_fmts,
++ .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles),
++};
++
+diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h
+new file mode 100644
+index 0000000000..f61b29e669
+--- /dev/null
++++ b/libavcodec/rpi_hevcdec.h
+@@ -0,0 +1,1054 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVCDEC_H
++#define AVCODEC_RPI_HEVCDEC_H
++
++#include "config.h"
++
++#include <stdatomic.h>
++
++#include "libavutil/buffer.h"
++
++#include "avcodec.h"
++#include "bswapdsp.h"
++#include "cabac.h"
++#include "get_bits.h"
++#include "rpi_hevcpred.h"
++#include "h2645_parse.h"
++#include "hevc.h"
++#include "rpi_hevc_ps.h"
++#include "rpi_hevc_sei.h"
++#include "rpi_hevcdsp.h"
++#include "internal.h"
++#include "thread.h"
++#include "videodsp.h"
++
++#define MAX_NB_THREADS 16
++#define SHIFT_CTB_WPP 2
++
++//TODO: check if this is really the maximum
++#define MAX_TRANSFORM_DEPTH 5
++
++#define MAX_TB_SIZE 32
++#define MAX_QP 51
++#define DEFAULT_INTRA_TC_OFFSET 2
++
++#define HEVC_CONTEXTS 199
++
++#define MRG_MAX_NUM_CANDS 5
++
++#define HEVC_MAX_CTB_SIZE (1 << HEVC_MAX_LOG2_CTB_SIZE) // 64
++
++// Size of DPB array
++#define HEVC_DPB_ELS 32
++
++#define L0 0
++#define L1 1
++
++#define EPEL_EXTRA_BEFORE 1
++#define EPEL_EXTRA_AFTER 2
++#define EPEL_EXTRA 3
++#define QPEL_EXTRA_BEFORE 3
++#define QPEL_EXTRA_AFTER 4
++#define QPEL_EXTRA 7
++
++#define EDGE_EMU_BUFFER_STRIDE 80
++
++#include <semaphore.h>
++#include "rpi_qpu.h"
++
++// Max jobs per frame thread. Actual usage will be limited by the size
++// of the global job pool
++// ?? Limits
++#define RPI_MAX_JOBS 8
++
++// This is the number of _extra_ bit threads - we will have
++// RPI_EXTRA_BIT_THREADS+1 threads actually doing the processing
++//
++// 0 is legitimate and will disable our WPP processing
++//#define RPI_EXTRA_BIT_THREADS 0
++#define RPI_EXTRA_BIT_THREADS 2
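++// e.g. 2 extra bit threads gives RPI_BIT_THREADS == 3 decode threads in
++// total (see the RPI_BIT_THREADS derivation further down this header)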
++
++// Number of separate threads/passes in worker
++// 2 and 3 are the currently valid numbers
++// At the moment 3 seems fractionally faster
++//#define RPI_PASSES 2
++#define RPI_PASSES 3
++
++// Print out various usage stats
++#define RPI_TSTATS 0
++
++// Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
++// (currently slower than deblocking on the ARM)
++// #define RPI_DEBLOCK_VPU
++
++#define RPI_VPU_DEBLOCK_CACHED 0
++
++// Use ARM emulation of QPU pred
++// These are for debug only as the emulation makes only limited
++// effort to be fast
++#define RPI_QPU_EMU_Y 0
++#define RPI_QPU_EMU_C 0
++
++// Max width & height we are prepared to consider
++// Sand frame shape calc becomes confused with large frames
++// Some buffer alloc also depends on this
++#define HEVC_RPI_MAX_WIDTH 2048
++#define HEVC_RPI_MAX_HEIGHT 1088
++
++
++/**
++ * Value of the luma sample at position (x, y) in the 2D array tab.
++ */
++#define SAMPLE(tab, x, y) ((tab)[(y) * s->sps->width + (x)])
++#define SAMPLE_CTB(tab, x, y) ((tab)[(y) * min_cb_width + (x)])
++
++#define IS_IDR(s) ((s)->nal_unit_type == HEVC_NAL_IDR_W_RADL || (s)->nal_unit_type == HEVC_NAL_IDR_N_LP)
++#define IS_BLA(s) ((s)->nal_unit_type == HEVC_NAL_BLA_W_RADL || (s)->nal_unit_type == HEVC_NAL_BLA_W_LP || \
++ (s)->nal_unit_type == HEVC_NAL_BLA_N_LP)
++#define IS_IRAP(s) ((s)->nal_unit_type >= 16 && (s)->nal_unit_type <= 23)
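++// (16..23 covers BLA_W_LP through RSV_IRAP_VCL23, i.e. all the IRAP types)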
++
++enum RPSType {
++ ST_CURR_BEF = 0,
++ ST_CURR_AFT,
++ ST_FOLL,
++ LT_CURR,
++ LT_FOLL,
++ NB_RPS_TYPE,
++};
++
++enum SyntaxElement {
++ SAO_MERGE_FLAG = 0,
++ SAO_TYPE_IDX,
++ SAO_EO_CLASS,
++ SAO_BAND_POSITION,
++ SAO_OFFSET_ABS,
++ SAO_OFFSET_SIGN,
++ END_OF_SLICE_FLAG,
++ SPLIT_CODING_UNIT_FLAG,
++ CU_TRANSQUANT_BYPASS_FLAG,
++ SKIP_FLAG,
++ CU_QP_DELTA,
++ PRED_MODE_FLAG,
++ PART_MODE,
++ PCM_FLAG,
++ PREV_INTRA_LUMA_PRED_FLAG,
++ MPM_IDX,
++ REM_INTRA_LUMA_PRED_MODE,
++ INTRA_CHROMA_PRED_MODE,
++ MERGE_FLAG,
++ MERGE_IDX,
++ INTER_PRED_IDC,
++ REF_IDX_L0,
++ REF_IDX_L1,
++ ABS_MVD_GREATER0_FLAG,
++ ABS_MVD_GREATER1_FLAG,
++ ABS_MVD_MINUS2,
++ MVD_SIGN_FLAG,
++ MVP_LX_FLAG,
++ NO_RESIDUAL_DATA_FLAG,
++ SPLIT_TRANSFORM_FLAG,
++ CBF_LUMA,
++ CBF_CB_CR,
++ TRANSFORM_SKIP_FLAG,
++ EXPLICIT_RDPCM_FLAG,
++ EXPLICIT_RDPCM_DIR_FLAG,
++ LAST_SIGNIFICANT_COEFF_X_PREFIX,
++ LAST_SIGNIFICANT_COEFF_Y_PREFIX,
++ LAST_SIGNIFICANT_COEFF_X_SUFFIX,
++ LAST_SIGNIFICANT_COEFF_Y_SUFFIX,
++ SIGNIFICANT_COEFF_GROUP_FLAG,
++ SIGNIFICANT_COEFF_FLAG,
++ COEFF_ABS_LEVEL_GREATER1_FLAG,
++ COEFF_ABS_LEVEL_GREATER2_FLAG,
++ COEFF_ABS_LEVEL_REMAINING,
++ COEFF_SIGN_FLAG,
++ LOG2_RES_SCALE_ABS,
++ RES_SCALE_SIGN_FLAG,
++ CU_CHROMA_QP_OFFSET_FLAG,
++ CU_CHROMA_QP_OFFSET_IDX,
++};
++
++enum PartMode {
++ PART_2Nx2N = 0,
++ PART_2NxN = 1,
++ PART_Nx2N = 2,
++ PART_NxN = 3,
++ PART_2NxnU = 4,
++ PART_2NxnD = 5,
++ PART_nLx2N = 6,
++ PART_nRx2N = 7,
++};
++
++enum PredMode {
++ MODE_INTER = 0,
++ MODE_INTRA,
++ MODE_SKIP,
++};
++
++enum InterPredIdc {
++ PRED_L0 = 0,
++ PRED_L1,
++ PRED_BI,
++};
++
++enum PredFlag {
++ PF_INTRA = 0,
++ PF_L0,
++ PF_L1,
++ PF_BI,
++};
++
++enum IntraPredMode {
++ INTRA_PLANAR = 0,
++ INTRA_DC,
++ INTRA_ANGULAR_2,
++ INTRA_ANGULAR_3,
++ INTRA_ANGULAR_4,
++ INTRA_ANGULAR_5,
++ INTRA_ANGULAR_6,
++ INTRA_ANGULAR_7,
++ INTRA_ANGULAR_8,
++ INTRA_ANGULAR_9,
++ INTRA_ANGULAR_10,
++ INTRA_ANGULAR_11,
++ INTRA_ANGULAR_12,
++ INTRA_ANGULAR_13,
++ INTRA_ANGULAR_14,
++ INTRA_ANGULAR_15,
++ INTRA_ANGULAR_16,
++ INTRA_ANGULAR_17,
++ INTRA_ANGULAR_18,
++ INTRA_ANGULAR_19,
++ INTRA_ANGULAR_20,
++ INTRA_ANGULAR_21,
++ INTRA_ANGULAR_22,
++ INTRA_ANGULAR_23,
++ INTRA_ANGULAR_24,
++ INTRA_ANGULAR_25,
++ INTRA_ANGULAR_26,
++ INTRA_ANGULAR_27,
++ INTRA_ANGULAR_28,
++ INTRA_ANGULAR_29,
++ INTRA_ANGULAR_30,
++ INTRA_ANGULAR_31,
++ INTRA_ANGULAR_32,
++ INTRA_ANGULAR_33,
++ INTRA_ANGULAR_34,
++};
++
++enum SAOType {
++ SAO_NOT_APPLIED = 0,
++ SAO_BAND,
++ SAO_EDGE,
++ SAO_APPLIED
++};
++
++enum SAOEOClass {
++ SAO_EO_HORIZ = 0,
++ SAO_EO_VERT,
++ SAO_EO_135D,
++ SAO_EO_45D,
++};
++
++enum ScanType {
++ SCAN_DIAG = 0,
++ SCAN_HORIZ,
++ SCAN_VERT,
++};
++
++typedef struct RefPicList {
++ struct HEVCFrame *ref[HEVC_MAX_REFS];
++ int list[HEVC_MAX_REFS];
++ int isLongTerm[HEVC_MAX_REFS];
++ int nb_refs;
++} RefPicList;
++
++typedef struct RefPicListTab {
++ RefPicList refPicList[2];
++} RefPicListTab;
++
++typedef struct CodingUnit {
++ int x;
++ int y;
++
++ enum PredMode pred_mode; ///< PredMode
++ enum PartMode part_mode; ///< PartMode
++
++ // Inferred parameters
++ uint8_t intra_split_flag; ///< IntraSplitFlag
++ uint8_t max_trafo_depth; ///< MaxTrafoDepth
++ uint8_t cu_transquant_bypass_flag;
++} CodingUnit;
++
++typedef struct NeighbourAvailable {
++ int cand_bottom_left;
++ int cand_left;
++ int cand_up;
++ int cand_up_left;
++ int cand_up_right;
++ int cand_up_right_sap;
++} NeighbourAvailable;
++
++typedef struct PredictionUnit {
++ int mpm_idx;
++ int rem_intra_luma_pred_mode;
++ uint8_t intra_pred_mode[4];
++ Mv mvd;
++ uint8_t merge_flag;
++ uint8_t intra_pred_mode_c[4];
++ uint8_t chroma_mode_c[4];
++} PredictionUnit;
++
++typedef struct TransformUnit {
++ int cu_qp_delta;
++
++ int res_scale_val;
++
++ // Inferred parameters;
++ int intra_pred_mode;
++ int intra_pred_mode_c;
++ int chroma_mode_c;
++ uint8_t is_cu_qp_delta_coded;
++ uint8_t is_cu_chroma_qp_offset_coded;
++ int8_t cu_qp_offset_cb;
++ int8_t cu_qp_offset_cr;
++ uint8_t cross_pf;
++} TransformUnit;
++
++typedef struct DBParams {
++ int8_t beta_offset; // -12 to +12
++ int8_t tc_offset; // -12 to +12
++} DBParams;
++
++#define HEVC_FRAME_FLAG_OUTPUT (1 << 0)
++#define HEVC_FRAME_FLAG_SHORT_REF (1 << 1)
++#define HEVC_FRAME_FLAG_LONG_REF (1 << 2)
++#define HEVC_FRAME_FLAG_BUMPING (1 << 3)
++
++struct HEVCRpiJob;
++
++typedef struct HEVCFrame {
++ AVFrame *frame;
++ ThreadFrame tf;
++ MvField *tab_mvf;
++ RefPicList *refPicList;
++ RefPicListTab **rpl_tab;
++ int ctb_count;
++ int poc;
++ struct HEVCFrame *collocated_ref;
++
++ AVBufferRef *tab_mvf_buf;
++ AVBufferRef *rpl_tab_buf;
++ AVBufferRef *rpl_buf;
++
++ /**
++ * A sequence counter, so that old frames are output first
++ * after a POC reset
++ */
++ uint16_t sequence;
++
++ /**
++ * A combination of HEVC_FRAME_FLAG_*
++ */
++ uint8_t flags;
++
++ // Entry no in DPB - can be used as a small unique
++ // frame identifier (within the current thread)
++ uint8_t dpb_no;
++} HEVCFrame;
++
++typedef struct HEVCRpiLocalContextIntra {
++ TransformUnit tu;
++ NeighbourAvailable na;
++} HEVCRpiLocalContextIntra;
++
++typedef struct HEVCRpiLocalContext {
++ TransformUnit tu; // Moved to start to match HEVCRpiLocalContextIntra (yuk!)
++ NeighbourAvailable na;
++
++ // Vars that allow us to locate everything from just an lc
++ struct HEVCRpiContext * context; // ??? make const ???
++ unsigned int lc_n; // lc list el no
++
++ // Job wait links
++ struct HEVCRpiLocalContext * jw_next;
++ struct HEVCRpiLocalContext * jw_prev;
++ struct HEVCRpiLocalContext * ljw_next;
++ struct HEVCRpiLocalContext * ljw_prev;
++ struct HEVCRpiJob * volatile jw_job;
++ sem_t jw_sem;
++
++ // ?? Wrap in structure ??
++ sem_t bt_sem_in;
++ sem_t * bt_psem_out;
++ volatile int bt_terminate;
++ unsigned int ts;
++ unsigned int bt_last_line; // Last line in this bit_thread chunk
++ unsigned int bt_line_no;
++ unsigned int bt_line_width;
++ unsigned int bt_line_inc;
++
++ struct HEVCRpiJob * jb0;
++ char unit_done; // Set once we have dealt with this slice
++// char max_done;
++ char bt_is_tile;
++ char last_progress_good;
++
++ char wpp_init; // WPP/Tile bitstream init has happened
++
++ uint8_t cabac_state[HEVC_CONTEXTS];
++
++ uint8_t stat_coeff[4];
++
++// uint8_t first_qp_group;
++
++ GetBitContext gb;
++ CABACContext cc;
++
++ int8_t qp_y;
++ int8_t curr_qp_y;
++
++ int qPy_pred;
++
++ uint8_t ctb_left_flag;
++ uint8_t ctb_up_flag;
++ uint8_t ctb_up_right_flag;
++ uint8_t ctb_up_left_flag;
++ int end_of_tiles_x;
++ int end_of_tiles_y;
++ /* +7 is for subpixel interpolation, *2 for high bit depths */
++ DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
++ /* The extended size of this second edge emu buffer is abused by SAO */
++ DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
++ DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]);
++
++ int ct_depth;
++ CodingUnit cu;
++ PredictionUnit pu;
++
++#define BOUNDARY_LEFT_SLICE (1 << 0)
++#define BOUNDARY_LEFT_TILE (1 << 1)
++#define BOUNDARY_UPPER_SLICE (1 << 2)
++#define BOUNDARY_UPPER_TILE (1 << 3)
++ /* properties of the boundary of the current CTB for the purposes
++ * of the deblocking filter */
++ int boundary_flags;
++} HEVCRpiLocalContext;
++
++
++// Each block can have an intra prediction and an add_residual command
++// noof-cmds(2) * max-ctu height(64) / min-transform(4) * planes(3) * MAX_WIDTH
++
++// Sand only has 2 planes (Y/C)
++#define RPI_MAX_PRED_CMDS (2*(HEVC_MAX_CTB_SIZE/4)*2*(HEVC_RPI_MAX_WIDTH/4))
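++// With HEVC_MAX_CTB_SIZE 64 and HEVC_RPI_MAX_WIDTH 2048 this works out as
++// 2 * (64/4) * 2 * (2048/4) = 32768 commands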
++
++#ifdef RPI_DEBLOCK_VPU
++// Worst case is 16x16 CTUs
++#define RPI_MAX_DEBLOCK_CMDS (HEVC_RPI_MAX_WIDTH*4/16)
++#endif
++
++// Command for intra prediction and transform_add of predictions to coefficients
++enum rpi_pred_cmd_e
++{
++ RPI_PRED_ADD_RESIDUAL,
++ RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_ADD_RESIDUAL + c_idx
++ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_ADD_RESIDUAL + c_idx
++ RPI_PRED_ADD_RESIDUAL_C, // Merged U+V
++ RPI_PRED_ADD_DC,
++ RPI_PRED_ADD_DC_U, // Both U & V are effectively C
++ RPI_PRED_ADD_DC_V,
++ RPI_PRED_INTRA,
++ RPI_PRED_I_PCM,
++ RPI_PRED_CMD_MAX
++};
++
++typedef struct HEVCPredCmd {
++ uint8_t type;
++ uint8_t size; // log2 "size" used by all variants
++ uint8_t na; // i_pred - but left here as they pack well
++ uint8_t c_idx; // i_pred
++ union {
++ struct { // TRANSFORM_ADD
++ uint8_t * dst;
++ const int16_t * buf;
++ uint16_t stride; // Should be good enough for all pic fmts we use
++ int16_t dc;
++ } ta;
++ struct {
++ uint8_t * dst;
++ uint32_t stride;
++ int dc;
++ } dc;
++ struct { // INTRA
++ uint16_t x;
++ uint16_t y;
++ enum IntraPredMode mode;
++ } i_pred;
++ struct { // I_PCM
++ uint16_t x;
++ uint16_t y;
++ const void * src;
++ uint32_t src_len;
++ } i_pcm;
++ };
++} HEVCPredCmd;
++
++union qpu_mc_pred_cmd_s;
++struct qpu_mc_pred_y_p_s;
++struct qpu_mc_src_s;
++
++typedef struct HEVCRpiInterPredQ
++{
++ union qpu_mc_pred_cmd_u *qpu_mc_base;
++ union qpu_mc_pred_cmd_u *qpu_mc_curr;
++ struct qpu_mc_src_s *last_l0;
++ struct qpu_mc_src_s *last_l1;
++ unsigned int load;
++ uint32_t code_setup;
++ uint32_t code_sync;
++ uint32_t code_exit;
++} HEVCRpiInterPredQ;
++
++typedef struct HEVCRpiInterPredEnv
++{
++ HEVCRpiInterPredQ * q;
++ uint8_t n; // Number of Qs
++ uint8_t n_grp; // Number of Q in a group
++ uint8_t curr; // Current Q number (0..n-1)
++ uint8_t used; // 0 if nothing in any Q, 1 otherwise
++ uint8_t used_grp; // 0 if nothing in any Q in the current group
++ unsigned int max_fill;
++ unsigned int min_gap;
++ GPU_MEM_PTR_T gptr;
++} HEVCRpiInterPredEnv;
++
++typedef struct HEVCRpiIntraPredEnv {
++ unsigned int n; // Number of commands
++ HEVCPredCmd * cmds;
++} HEVCRpiIntraPredEnv;
++
++typedef struct HEVCRpiCoeffEnv {
++ unsigned int n;
++ int16_t * buf;
++} HEVCRpiCoeffEnv;
++
++typedef struct HEVCRpiCoeffsEnv {
++ HEVCRpiCoeffEnv s[4];
++ GPU_MEM_PTR_T gptr;
++ void * mptr;
++} HEVCRpiCoeffsEnv;
++
++typedef struct HEVCRpiFrameProgressWait {
++ int req;
++ struct HEVCRpiFrameProgressWait * next;
++ sem_t sem;
++} HEVCRpiFrameProgressWait;
++
++typedef struct HEVCRpiFrameProgressState {
++ struct HEVCRpiFrameProgressWait * first;
++ struct HEVCRpiFrameProgressWait * last;
++ pthread_mutex_t lock;
++} HEVCRpiFrameProgressState;
++
++typedef struct RpiBlk
++{
++ unsigned int x;
++ unsigned int y;
++ unsigned int w;
++ unsigned int h;
++} RpiBlk;
++
++typedef struct HEVCRpiJob {
++ struct HEVCRpiJob * next; // Free chain
++ struct HEVCRpiJobCtl * jbc_local;
++ const HEVCRpiSPS * sps; // sps used to set up this job
++
++ int waited;
++ int ctu_ts_first;
++ int ctu_ts_last;
++ RpiBlk bounds; // Bounding box of job
++
++ struct qpu_mc_pred_y_p_s * last_y8_p;
++ struct qpu_mc_src_s * last_y8_l1;
++
++ HEVCRpiInterPredEnv chroma_ip;
++ HEVCRpiInterPredEnv luma_ip;
++ int16_t progress_req[HEVC_DPB_ELS]; // index by dpb_no
++ HEVCRpiIntraPredEnv intra;
++ HEVCRpiCoeffsEnv coeffs;
++ HEVCRpiFrameProgressWait progress_wait;
++} HEVCRpiJob;
++
++struct HEVCRpiContext;
++
++typedef void HEVCRpiWorkerFn(struct HEVCRpiContext * const s, HEVCRpiJob * const jb);
++
++typedef struct HEVCRpiPassQueue
++{
++// int pending;
++ volatile int terminate;
++ sem_t sem_in;
++ sem_t * psem_out;
++ unsigned int job_n;
++ struct HEVCRpiContext * context; // Context pointer as we get to pass a single "void * this" to the thread
++ HEVCRpiWorkerFn * worker;
++ pthread_t thread;
++ uint8_t pass_n; // Pass number - debug
++ uint8_t started;
++} HEVCRpiPassQueue;
++
++
++struct HEVCRpiJobGlobal;
++
++typedef struct HEVCRpiJobCtl
++{
++ sem_t sem_out;
++
++ HEVCRpiJob * volatile jb1; // The job associated with this frame if unallocated - NULL if allocated
++ struct HEVCRpiJobGlobal * jbg;
++
++ HEVCRpiLocalContext * lcw_head;
++ HEVCRpiLocalContext * lcw_tail;
++
++ pthread_mutex_t in_lock;
++ int offload_in;
++
++ HEVCRpiJob *offloadq[RPI_MAX_JOBS];
++} HEVCRpiJobCtl;
++
++
++typedef struct HEVCRpiJobGlobal
++{
++ intptr_t ref_count;
++ pthread_mutex_t lock;
++ HEVCRpiJob * free1; // Singly linked list of free jobs
++ HEVCRpiLocalContext * wait_head; // Double linked list of lcs waiting for a job
++ HEVCRpiLocalContext * wait_good; // Last good tail
++ HEVCRpiLocalContext * wait_tail;
++
++} HEVCRpiJobGlobal;
++
++#define RPI_BIT_THREADS (RPI_EXTRA_BIT_THREADS + 1)
++
++#if RPI_TSTATS
++typedef struct HEVCRpiStats {
++ int y_pred1_y8_merge;
++ int y_pred1_xy;
++ int y_pred1_x0;
++ int y_pred1_y0;
++ int y_pred1_x0y0;
++ int y_pred1_wle8;
++ int y_pred1_wgt8;
++ int y_pred1_hle16;
++ int y_pred1_hgt16;
++ int y_pred2_xy;
++ int y_pred2_x0;
++ int y_pred2_y0;
++ int y_pred2_x0y0;
++ int y_pred2_hle16;
++ int y_pred2_hgt16;
++} HEVCRpiStats;
++#endif
++
++
++typedef struct HEVCRpiContext {
++ const AVClass *c; // needed by private avoptions
++ AVCodecContext *avctx;
++
++ struct HEVCRpiContext *sList[MAX_NB_THREADS];
++
++ HEVCRpiLocalContext *HEVClcList[MAX_NB_THREADS];
++ HEVCRpiLocalContext *HEVClc;
++
++ uint8_t threads_type;
++ uint8_t threads_number;
++
++ int width;
++ int height;
++
++ char used_for_ref; // rpi
++ char offload_recon;
++
++ HEVCRpiJobCtl * jbc;
++
++ // Function pointers
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++ const uint8_t * qpu_dummy_frame_emu;
++#endif
++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
++ uint32_t qpu_dummy_frame_qpu; // Not a frame - just a bit of memory
++#endif
++ HEVCRpiQpu qpu;
++
++ HEVCRpiFrameProgressState progress_states[2];
++
++#ifdef RPI_DEBLOCK_VPU
++// With the new scheme of rpi_execute_dblk_cmds
++// it looks like ff_hevc_rpi_hls_filter is no longer called in raster order.
++// This causes trouble if RPI_DEBLOCK_VPU_Q_COUNT > 1 because we prepare setup
++// data for more than one row at a time before triggering the deblocker for one row.
++// This means that the deblock of the final row can use the wrong setup buffer.
++//
++// There is also a concern that thread progress and waiting for job completion
++// are not done correctly with RPI_DEBLOCK_VPU at the end of the frame, or for small CTU sizes.
++#define RPI_DEBLOCK_VPU_Q_COUNT 1
++
++ int enable_rpi_deblock;
++
++ int uv_setup_width;
++ int uv_setup_height;
++ int setup_width; // Number of 16x16 blocks across the image
++ int setup_height; // Number of 16x16 blocks down the image
++
++ struct dblk_vpu_q_s
++ {
++ GPU_MEM_PTR_T deblock_vpu_gmem;
++
++ uint8_t (*y_setup_arm)[2][2][2][4];
++ uint8_t (*y_setup_vc)[2][2][2][4];
++
++ uint8_t (*uv_setup_arm)[2][2][2][4];
++ uint8_t (*uv_setup_vc)[2][2][2][4];
++
++ int (*vpu_cmds_arm)[6]; // r0-r5 for each command
++ int vpu_cmds_vc;
++
++ vpu_qpu_wait_h cmd_id;
++ } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT];
++
++ struct dblk_vpu_q_s * dvq;
++ unsigned int dvq_n;
++#endif
++
++ uint8_t *cabac_state;
++
++ /** 1 if the independent slice segment header was successfully parsed */
++ uint8_t slice_initialized;
++
++ AVFrame *frame;
++ AVFrame *output_frame;
++ uint8_t *sao_pixel_buffer_h[3];
++ uint8_t *sao_pixel_buffer_v[3];
++
++ HEVCRpiParamSets ps;
++
++ AVBufferPool *tab_mvf_pool;
++ AVBufferPool *rpl_tab_pool;
++
++ ///< candidate references for the current frame
++ RefPicList rps[5];
++
++ SliceHeader sh;
++ SAOParams *sao;
++ DBParams *deblock;
++ enum HEVCNALUnitType nal_unit_type;
++ int temporal_id; ///< temporal_id_plus1 - 1
++ HEVCFrame *ref;
++ HEVCFrame DPB[HEVC_DPB_ELS];
++ int poc;
++ int pocTid0;
++ int slice_idx; ///< number of the slice being currently decoded
++ int eos; ///< current packet contains an EOS/EOB NAL
++ int last_eos; ///< last packet contains an EOS/EOB NAL
++ int max_ra;
++ int bs_width;
++ int bs_height;
++
++ int is_decoded;
++ int no_rasl_output_flag;
++
++ HEVCPredContext hpc;
++ HEVCDSPContext hevcdsp;
++ VideoDSPContext vdsp;
++ BswapDSPContext bdsp;
++ int8_t *qp_y_tab;
++ uint8_t *horizontal_bs;
++ uint8_t *vertical_bs;
++
++ int32_t *tab_slice_address;
++
++ // CU
++ uint8_t *skip_flag;
++ uint8_t *tab_ct_depth;
++ // PU
++ uint8_t *tab_ipm;
++
++ uint8_t *cbf_luma; // cbf_luma of colocated TU
++ uint8_t *is_pcm;
++
++ // CTB-level flags affecting loop filter operation
++ uint8_t *filter_slice_edges;
++
++ /** used on BE to byteswap the lines for checksumming */
++ uint8_t *checksum_buf;
++ int checksum_buf_size;
++
++ /**
++ * Sequence counters for decoded and output frames, so that old
++ * frames are output first after a POC reset
++ */
++ uint16_t seq_decode;
++ uint16_t seq_output;
++
++ int enable_parallel_tiles;
++ atomic_int wpp_err;
++
++ const uint8_t *data;
++
++ H2645Packet pkt;
++ // type of the first VCL NAL of the current frame
++ enum HEVCNALUnitType first_nal_type;
++
++ uint8_t context_initialized;
++ int is_nalff; ///< this flag is != 0 if bitstream is encapsulated
++ ///< as a format defined in 14496-15
++ int apply_defdispwin;
++
++ int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
++ int nuh_layer_id;
++
++ HEVCSEIContext sei;
++
++ // Put structures that allocate non-trivial storage at the end
++ // These are mostly used indirectly so position in the structure doesn't matter
++ HEVCRpiLocalContextIntra HEVClcIntra;
++ HEVCRpiPassQueue passq[RPI_PASSES];
++#if RPI_EXTRA_BIT_THREADS > 0
++ int bt_started;
++ // This simply contains thread descriptors - task setup is held elsewhere
++ pthread_t bit_threads[RPI_EXTRA_BIT_THREADS];
++#endif
++#if RPI_TSTATS
++ HEVCRpiStats tstats;
++#endif
++} HEVCRpiContext;
++
++/**
++ * Mark all frames in DPB as unused for reference.
++ */
++void ff_hevc_rpi_clear_refs(HEVCRpiContext *s);
++
++/**
++ * Drop all frames currently in DPB.
++ */
++void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s);
++
++const RefPicList *ff_hevc_rpi_get_ref_list(const HEVCRpiContext * const s, const HEVCFrame * const ref,
++ int x0, int y0);
++
++/**
++ * Construct the reference picture sets for the current frame.
++ */
++int ff_hevc_rpi_frame_rps(HEVCRpiContext *s);
++
++/**
++ * Construct the reference picture list(s) for the current slice.
++ */
++int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s);
++
++void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc, int ctb_addr_ts);
++int ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, int ctb_addr_ts);
++int ff_hevc_rpi_sao_merge_flag_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_end_of_slice_flag_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_cu_transquant_bypass_flag_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_skip_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const int x0, const int y0, const int x_cb, const int y_cb);
++int ff_hevc_rpi_pred_mode_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_split_coding_unit_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int ct_depth,
++ const int x0, const int y0);
++int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size);
++int ff_hevc_rpi_pcm_flag_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_prev_intra_luma_pred_flag_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_merge_flag_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH);
++int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx);
++int ff_hevc_rpi_mvp_lx_flag_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_no_residual_syntax_flag_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_split_transform_flag_decode(HEVCRpiLocalContext * const lc, const int log2_trafo_size);
++int ff_hevc_rpi_cbf_cb_cr_decode(HEVCRpiLocalContext * const lc, const int trafo_depth);
++int ff_hevc_rpi_cbf_luma_decode(HEVCRpiLocalContext * const lc, const int trafo_depth);
++int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx);
++int ff_hevc_rpi_res_scale_sign_flag(HEVCRpiLocalContext *const lc, const int idx);
++
++/**
++ * Get the number of candidate references for the current frame.
++ */
++int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s);
++
++int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc);
++
++/**
++ * Find next frame in output order and put a reference to it in frame.
++ * @return 1 if a frame was output, 0 otherwise
++ */
++int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *frame, int flush);
++
++void ff_hevc_rpi_bump_frame(HEVCRpiContext *s);
++
++void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCFrame *frame, int flags);
++
++void ff_hevc_rpi_set_neighbour_available(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0,
++ const int nPbW, const int nPbH);
++void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW,
++ int nPbH, int log2_cb_size, int part_idx,
++ int merge_idx, MvField * const mv);
++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext *lc, int x0, int y0, int nPbW,
++ int nPbH, int log2_cb_size, int part_idx,
++ int merge_idx, MvField * const mv,
++ int mvp_lx_flag, int LX);
++void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase, int log2_cb_size);
++void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0,
++ int log2_trafo_size);
++int ff_hevc_rpi_cu_qp_delta_sign_flag(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_cu_qp_delta_abs(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_cu_chroma_qp_offset_flag(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
++void ff_hevc_rpi_hls_filter(HEVCRpiContext * const s, const int x, const int y, const int ctb_size);
++void ff_hevc_rpi_hls_filters(HEVCRpiContext *s, int x_ctb, int y_ctb, int ctb_size);
++void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const int x0, const int y0,
++ const int log2_trafo_size, const enum ScanType scan_idx,
++ const int c_idx);
++
++void ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc);
++
++
++extern const uint8_t ff_hevc_rpi_qpel_extra_before[4];
++extern const uint8_t ff_hevc_rpi_qpel_extra_after[4];
++extern const uint8_t ff_hevc_rpi_qpel_extra[4];
++
++int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n);
++
++// arm/hevc_misc_neon.S
++// Neon coeff zap fn
++#if HAVE_NEON
++extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2);
++#endif
++
++void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++ const HEVCFrame * const ref, const int val, const int field);
++
++void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field);
++
++// All of these expect that s->threads_type == FF_THREAD_FRAME
++
++static inline void ff_hevc_rpi_progress_wait_mv(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++ const HEVCFrame * const ref, const int y)
++{
++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1);
++}
++
++static inline void ff_hevc_rpi_progress_signal_mv(HEVCRpiContext * const s, const int y)
++{
++ if (s->used_for_ref)
++ ff_hevc_rpi_progress_signal_field(s, y, 1);
++}
++
++static inline void ff_hevc_rpi_progress_wait_recon(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++ const HEVCFrame * const ref, const int y)
++{
++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0);
++}
++
++static inline void ff_hevc_rpi_progress_signal_recon(HEVCRpiContext * const s, const int y)
++{
++ if (s->used_for_ref)
++ {
++ ff_hevc_rpi_progress_signal_field(s, y, 0);
++ }
++}
++
++static inline void ff_hevc_rpi_progress_signal_all_done(HEVCRpiContext * const s)
++{
++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0);
++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1);
++}
++
++
++// Set all done - signal nothing (used in missing refs)
++// Works for both rpi & non-rpi
++static inline void ff_hevc_rpi_progress_set_all_done(HEVCFrame * const ref)
++{
++ if (ref->tf.progress != NULL)
++ {
++ int * const p = (int *)&ref->tf.progress->data;
++ p[0] = INT_MAX;
++ p[1] = INT_MAX;
++ }
++}
++
++#define HEVC_RPI_420_ONLY 1
++#define HEVC_RPI_SAND128_ONLY 1
++
++static inline unsigned int ctx_hshift(const HEVCRpiContext * const s, const int cidx)
++{
++#if HEVC_RPI_420_ONLY
++ return cidx == 0 ? 0 : 1;
++#else
++ return s->ps.sps->hshift[cidx];
++#endif
++}
++
++static inline unsigned int ctx_vshift(const HEVCRpiContext * const s, const int cidx)
++{
++#if HEVC_RPI_420_ONLY
++ return cidx == 0 ? 0 : 1;
++#else
++ return s->ps.sps->vshift[cidx];
++#endif
++}
++
++static inline int ctx_cfmt(const HEVCRpiContext * const s)
++{
++#if HEVC_RPI_420_ONLY
++ return 1;
++#else
++ return s->ps.sps->chroma_format_idc;
++#endif
++}
++
++static inline int frame_stride1(const AVFrame * const frame, const int c_idx)
++{
++#if HEVC_RPI_SAND128_ONLY
++ return 128;
++#else
++ return frame->linesize[c_idx];
++#endif
++}
++
++#if HEVC_RPI_SAND128_ONLY
++// Propagate this decision to later zc includes
++#define RPI_ZC_SAND128_ONLY 1
++#endif
++
++#endif /* AVCODEC_RPI_HEVCDEC_H */
+diff --git a/libavcodec/rpi_hevcdsp.c b/libavcodec/rpi_hevcdsp.c
+new file mode 100644
+index 0000000000..3e4cfe8d46
+--- /dev/null
++++ b/libavcodec/rpi_hevcdsp.c
+@@ -0,0 +1,415 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
++ *
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "rpi_hevcdsp.h"
++
++static const int8_t transform[32][32] = {
++ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
++ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
++ { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
++ -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 },
++ { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90,
++ -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 },
++ { 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
++ 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 },
++ { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89,
++ 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 },
++ { 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
++ -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 },
++ { 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87,
++ -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 },
++ { 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31,
++ 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 },
++ { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83,
++ 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 },
++ { 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38,
++ -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 },
++ { 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80,
++ -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 },
++ { 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46,
++ 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 },
++ { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75,
++ 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 },
++ { 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54,
++ -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 },
++ { 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70,
++ -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 },
++ { 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61,
++ 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 },
++ { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64,
++ 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 },
++ { 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67,
++ -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 },
++ { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57,
++ -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 },
++ { 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73,
++ 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 },
++ { 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50,
++ 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 },
++ { 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78,
++ -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 },
++ { 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43,
++ -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43 },
++ { 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82,
++ 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38 },
++ { 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36,
++ 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 },
++ { 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85,
++ -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 },
++ { 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25,
++ -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 },
++ { 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88,
++ 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 },
++ { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18,
++ 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 },
++ { 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90,
++ -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 },
++ { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9,
++ -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 },
++ { 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90,
++ 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 },
++};
++
++DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_epel_filters[7][4]) = {
++ { -2, 58, 10, -2},
++ { -4, 54, 16, -2},
++ { -6, 46, 28, -4},
++ { -4, 36, 36, -4},
++ { -4, 28, 46, -6},
++ { -2, 16, 54, -4},
++ { -2, 10, 58, -2},
++};
++
++DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_qpel_filters[3][16]) = {
++ { -1, 4,-10, 58, 17, -5, 1, 0, -1, 4,-10, 58, 17, -5, 1, 0},
++ { -1, 4,-11, 40, 40,-11, 4, -1, -1, 4,-11, 40, 40,-11, 4, -1},
++ { 0, 1, -5, 17, 58,-10, 4, -1, 0, 1, -5, 17, 58,-10, 4, -1}
++};
++
++#define BIT_DEPTH 8
++#include "rpi_hevcdsp_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 9
++#include "rpi_hevcdsp_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 10
++#include "rpi_hevcdsp_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 12
++#include "rpi_hevcdsp_template.c"
++#undef BIT_DEPTH
++
++static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc,
++ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
++ const MvField *curr, const MvField *neigh, uint8_t *bs)
++{
++ for (; pus > 0; pus--) {
++ int strength, out;
++ int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
++ int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
++ int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]];
++ int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]];
++
++#if 1 // This more directly matches the original implementation
++ if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) {
++ // same L0 and L1
++ if (curr_refL0 == neigh_refL0 &&
++ curr_refL0 == curr_refL1 &&
++ neigh_refL0 == neigh_refL1) {
++ if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
++ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
++ (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
++ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
++ strength = 1;
++ else
++ strength = 0;
++ } else if (neigh_refL0 == curr_refL0 &&
++ neigh_refL1 == curr_refL1) {
++ if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
++ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
++ strength = 1;
++ else
++ strength = 0;
++ } else if (neigh_refL1 == curr_refL0 &&
++ neigh_refL0 == curr_refL1) {
++ if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
++ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
++ strength = 1;
++ else
++ strength = 0;
++ } else {
++ strength = 1;
++ }
++ } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
++ Mv curr_mv0, neigh_mv0;
++
++ if (curr->pred_flag & 1) {
++ curr_mv0 = curr->mv[0];
++ } else {
++ curr_mv0 = curr->mv[1];
++ curr_refL0 = curr_refL1;
++ }
++
++ if (neigh->pred_flag & 1) {
++ neigh_mv0 = neigh->mv[0];
++ } else {
++ neigh_mv0 = neigh->mv[1];
++ neigh_refL0 = neigh_refL1;
++ }
++
++ if (curr_refL0 == neigh_refL0) {
++ if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4)
++ strength = 1;
++ else
++ strength = 0;
++ } else
++ strength = 1;
++ } else
++ strength = 1;
++#else // This has exactly the same effect, but is more suitable for vectorisation
++ Mv curr_mv[2];
++ Mv neigh_mv[2];
++ memcpy(curr_mv, curr->mv, sizeof curr_mv);
++ memcpy(neigh_mv, neigh->mv, sizeof neigh_mv);
++
++ if (!(curr->pred_flag & 2)) {
++ curr_mv[1] = curr_mv[0];
++ curr_refL1 = curr_refL0;
++ }
++ if (!(neigh->pred_flag & 2)) {
++ neigh_mv[1] = neigh_mv[0];
++ neigh_refL1 = neigh_refL0;
++ }
++ if (!(curr->pred_flag & 1)) {
++ curr_mv[0] = curr_mv[1];
++ curr_refL0 = curr_refL1;
++ }
++ if (!(neigh->pred_flag & 1)) {
++ neigh_mv[0] = neigh_mv[1];
++ neigh_refL0 = neigh_refL1;
++ }
++
++ strength = 1;
++
++ strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
++ (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) |
++ (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4);
++
++ strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
++ (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) |
++ (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4);
++
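++ // pred_flag is an L0/L1 bit mask (PF_L0=1, PF_L1=2, PF_BI=3), so
++ // ((pf + 1) ^ (npf + 1)) >> 2 is non-zero exactly when one side is
++ // bi-pred and the other is not, forcing strength to 1 in that case
++ // just as the branchy version above does.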
++ strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
++#endif
++
++ curr += in_inc / sizeof (MvField);
++ neigh += in_inc / sizeof (MvField);
++
++ for (out = dup; out > 0; out--)
++ {
++ *bs = strength;
++ bs += out_inc;
++ }
++ }
++}
++
++void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
++{
++#undef FUNC
++#define FUNC(a, depth) a ## _ ## depth
++
++#undef PEL_FUNC
++#define PEL_FUNC(dst1, idx1, idx2, a, depth) \
++ for(i = 0 ; i < 10 ; i++) \
++{ \
++ hevcdsp->dst1[i][idx1][idx2] = a ## _ ## depth; \
++}
++
++#undef EPEL_FUNCS
++#define EPEL_FUNCS(depth) \
++ PEL_FUNC(put_hevc_epel, 0, 0, put_hevc_pel_pixels, depth); \
++ PEL_FUNC(put_hevc_epel, 0, 1, put_hevc_epel_h, depth); \
++ PEL_FUNC(put_hevc_epel, 1, 0, put_hevc_epel_v, depth); \
++ PEL_FUNC(put_hevc_epel, 1, 1, put_hevc_epel_hv, depth)
++
++#undef EPEL_UNI_FUNCS
++#define EPEL_UNI_FUNCS(depth) \
++ PEL_FUNC(put_hevc_epel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \
++ PEL_FUNC(put_hevc_epel_uni, 0, 1, put_hevc_epel_uni_h, depth); \
++ PEL_FUNC(put_hevc_epel_uni, 1, 0, put_hevc_epel_uni_v, depth); \
++ PEL_FUNC(put_hevc_epel_uni, 1, 1, put_hevc_epel_uni_hv, depth); \
++ PEL_FUNC(put_hevc_epel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \
++ PEL_FUNC(put_hevc_epel_uni_w, 0, 1, put_hevc_epel_uni_w_h, depth); \
++ PEL_FUNC(put_hevc_epel_uni_w, 1, 0, put_hevc_epel_uni_w_v, depth); \
++ PEL_FUNC(put_hevc_epel_uni_w, 1, 1, put_hevc_epel_uni_w_hv, depth)
++
++#undef EPEL_BI_FUNCS
++#define EPEL_BI_FUNCS(depth) \
++ PEL_FUNC(put_hevc_epel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \
++ PEL_FUNC(put_hevc_epel_bi, 0, 1, put_hevc_epel_bi_h, depth); \
++ PEL_FUNC(put_hevc_epel_bi, 1, 0, put_hevc_epel_bi_v, depth); \
++ PEL_FUNC(put_hevc_epel_bi, 1, 1, put_hevc_epel_bi_hv, depth); \
++ PEL_FUNC(put_hevc_epel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \
++ PEL_FUNC(put_hevc_epel_bi_w, 0, 1, put_hevc_epel_bi_w_h, depth); \
++ PEL_FUNC(put_hevc_epel_bi_w, 1, 0, put_hevc_epel_bi_w_v, depth); \
++ PEL_FUNC(put_hevc_epel_bi_w, 1, 1, put_hevc_epel_bi_w_hv, depth)
++
++#undef QPEL_FUNCS
++#define QPEL_FUNCS(depth) \
++ PEL_FUNC(put_hevc_qpel, 0, 0, put_hevc_pel_pixels, depth); \
++ PEL_FUNC(put_hevc_qpel, 0, 1, put_hevc_qpel_h, depth); \
++ PEL_FUNC(put_hevc_qpel, 1, 0, put_hevc_qpel_v, depth); \
++ PEL_FUNC(put_hevc_qpel, 1, 1, put_hevc_qpel_hv, depth)
++
++#undef QPEL_UNI_FUNCS
++#define QPEL_UNI_FUNCS(depth) \
++ PEL_FUNC(put_hevc_qpel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \
++ PEL_FUNC(put_hevc_qpel_uni, 0, 1, put_hevc_qpel_uni_h, depth); \
++ PEL_FUNC(put_hevc_qpel_uni, 1, 0, put_hevc_qpel_uni_v, depth); \
++ PEL_FUNC(put_hevc_qpel_uni, 1, 1, put_hevc_qpel_uni_hv, depth); \
++ PEL_FUNC(put_hevc_qpel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \
++ PEL_FUNC(put_hevc_qpel_uni_w, 0, 1, put_hevc_qpel_uni_w_h, depth); \
++ PEL_FUNC(put_hevc_qpel_uni_w, 1, 0, put_hevc_qpel_uni_w_v, depth); \
++ PEL_FUNC(put_hevc_qpel_uni_w, 1, 1, put_hevc_qpel_uni_w_hv, depth)
++
++#undef QPEL_BI_FUNCS
++#define QPEL_BI_FUNCS(depth) \
++ PEL_FUNC(put_hevc_qpel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \
++ PEL_FUNC(put_hevc_qpel_bi, 0, 1, put_hevc_qpel_bi_h, depth); \
++ PEL_FUNC(put_hevc_qpel_bi, 1, 0, put_hevc_qpel_bi_v, depth); \
++ PEL_FUNC(put_hevc_qpel_bi, 1, 1, put_hevc_qpel_bi_hv, depth); \
++ PEL_FUNC(put_hevc_qpel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \
++ PEL_FUNC(put_hevc_qpel_bi_w, 0, 1, put_hevc_qpel_bi_w_h, depth); \
++ PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \
++ PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth)
++
++#define SLICED_ADD_RESIDUAL(depth)\
++ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \
++ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \
++ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \
++ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \
++ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \
++ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \
++ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \
++ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \
++ hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \
++ hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \
++ hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \
++ hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \
++ hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \
++ hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \
++ hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth); \
++ hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \
++ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth)
++#define SLICED_LOOP_FILTERS(depth)\
++ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \
++ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \
++ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth)
++#define SLICED_SAO(depth)\
++ for (i = 0; i != SAO_FILTER_N; ++i) { \
++ hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \
++ hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth); \
++ } \
++ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \
++ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth)
++
++#define HEVC_DSP(depth) \
++ hevcdsp->put_pcm = FUNC(put_pcm, depth); \
++ hevcdsp->add_residual[0] = FUNC(add_residual4x4, depth); \
++ hevcdsp->add_residual[1] = FUNC(add_residual8x8, depth); \
++ hevcdsp->add_residual[2] = FUNC(add_residual16x16, depth); \
++ hevcdsp->add_residual[3] = FUNC(add_residual32x32, depth); \
++ hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \
++ hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \
++ hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \
++ hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \
++ SLICED_ADD_RESIDUAL(depth); \
++ hevcdsp->dequant = FUNC(dequant, depth); \
++ hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \
++ hevcdsp->transform_4x4_luma = FUNC(transform_4x4_luma, depth); \
++ hevcdsp->idct[0] = FUNC(idct_4x4, depth); \
++ hevcdsp->idct[1] = FUNC(idct_8x8, depth); \
++ hevcdsp->idct[2] = FUNC(idct_16x16, depth); \
++ hevcdsp->idct[3] = FUNC(idct_32x32, depth); \
++ \
++ hevcdsp->idct_dc[0] = FUNC(idct_4x4_dc, depth); \
++ hevcdsp->idct_dc[1] = FUNC(idct_8x8_dc, depth); \
++ hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \
++ hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \
++ \
++ for (i = 0; i != SAO_FILTER_N; ++i) { \
++ hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \
++ hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \
++ } \
++ hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \
++ hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \
++ SLICED_SAO(depth); \
++ \
++ QPEL_FUNCS(depth); \
++ QPEL_UNI_FUNCS(depth); \
++ QPEL_BI_FUNCS(depth); \
++ EPEL_FUNCS(depth); \
++ EPEL_UNI_FUNCS(depth); \
++ EPEL_BI_FUNCS(depth); \
++ \
++ SLICED_LOOP_FILTERS(depth); \
++ hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \
++ hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \
++ hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \
++ hevcdsp->hevc_v_loop_filter_chroma = FUNC(hevc_v_loop_filter_chroma, depth); \
++ hevcdsp->hevc_h_loop_filter_luma_c = FUNC(hevc_h_loop_filter_luma, depth); \
++ hevcdsp->hevc_v_loop_filter_luma_c = FUNC(hevc_v_loop_filter_luma, depth); \
++ hevcdsp->hevc_h_loop_filter_chroma_c = FUNC(hevc_h_loop_filter_chroma, depth); \
++ hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth)
++ int i = 0;
++
++ switch (bit_depth) {
++ case 9:
++ HEVC_DSP(9);
++ break;
++ case 10:
++ HEVC_DSP(10);
++ break;
++ case 12:
++ HEVC_DSP(12);
++ break;
++ default:
++ HEVC_DSP(8);
++ break;
++ }
++
++ hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
++
++ if (ARCH_PPC)
++ ff_hevc_rpi_dsp_init_ppc(hevcdsp, bit_depth);
++ if (ARCH_X86)
++ ff_hevc_rpi_dsp_init_x86(hevcdsp, bit_depth);
++ if (ARCH_ARM)
++ ff_hevcdsp_rpi_init_arm(hevcdsp, bit_depth);
++ if (ARCH_MIPS)
++ ff_hevc_rpi_dsp_init_mips(hevcdsp, bit_depth);
++}
+diff --git a/libavcodec/rpi_hevcdsp.h b/libavcodec/rpi_hevcdsp.h
+new file mode 100644
+index 0000000000..c974baa820
+--- /dev/null
++++ b/libavcodec/rpi_hevcdsp.h
+@@ -0,0 +1,182 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
++ *
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVCDSP_H
++#define AVCODEC_RPI_HEVCDSP_H
++
++#include "hevc.h"
++#include "get_bits.h"
++
++#define MAX_PB_SIZE 64
++
++typedef struct SAOParams {
++// int offset_abs[3][4]; ///< sao_offset_abs
++// int offset_sign[3][4]; ///< sao_offset_sign
++
++ uint8_t band_position[3]; ///< sao_band_position
++ uint8_t eo_class[3]; ///< sao_eo_class
++ uint8_t type_idx[3]; ///< sao_type_idx
++
++ int16_t offset_val[3][5]; ///< sao_offset_val
++ const int dc_v = dc >> 16;
++ const int dc_u = (dc << 16) >> 16;
++
++ stride /= sizeof(pixel);
++
++ for (y = 0; y < size; y++) {
++ for (x = 0; x < size * 2; x += 2) {
++ dst[x] = av_clip_pixel(dst[x] + dc_u);
++ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
++ }
++ dst += stride;
++ }
++}
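++
++// Worked example of the dc packing used above: for the interleaved
++// ("plaited") chroma layout a single 32-bit dc carries both planes,
++// U in the low half and V in the high half. dc = 0xFFFB0003 unpacks
++// as dc_v = dc >> 16 = -5 and dc_u = (dc << 16) >> 16 = 3, applied to
++// the even (U) and odd (V) sample lanes respectively.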
++
++
++static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res,
++ ptrdiff_t stride)
++{
++ FUNC(add_residual)(_dst, res, stride, 4);
++}
++
++static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res,
++ ptrdiff_t stride)
++{
++ FUNC(add_residual)(_dst, res, stride, 8);
++}
++
++static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res,
++ ptrdiff_t stride)
++{
++ FUNC(add_residual)(_dst, res, stride, 16);
++}
++
++static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
++ ptrdiff_t stride)
++{
++ FUNC(add_residual)(_dst, res, stride, 32);
++}
++
++static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++ FUNC(add_residual_dc)(_dst, stride, dc, 4);
++}
++
++static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++ FUNC(add_residual_dc)(_dst, stride, dc, 8);
++}
++
++static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++ FUNC(add_residual_dc)(_dst, stride, dc, 16);
++}
++
++static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++ FUNC(add_residual_dc)(_dst, stride, dc, 32);
++}
++
++// -- U -- (plaited)
++
++static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride, int dc_u)
++{
++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 4);
++}
++
++static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride, int dc_u)
++{
++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 8);
++}
++
++static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride, int dc_u)
++{
++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 16);
++}
++
++static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride, int dc_u)
++{
++ // Should never occur for 420, which is all that sand supports
++ av_assert0(0);
++}
++
++// -- V -- (plaited)
++
++static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride, int dc_v)
++{
++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 4);
++}
++
++static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride, int dc_v)
++{
++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 8);
++}
++
++static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride, int dc_v)
++{
++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 16);
++}
++
++static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride, int dc_v)
++{
++ // Should never occur for 420, which is all that sand supports
++ av_assert0(0);
++}
++
++// -- C -- (plaited - both U & V)
++
++static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride)
++{
++ FUNC(add_residual_c)(_dst, res, stride, 4);
++}
++
++static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride)
++{
++ FUNC(add_residual_c)(_dst, res, stride, 8);
++}
++
++static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride)
++{
++ FUNC(add_residual_c)(_dst, res, stride, 16);
++}
++
++static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride)
++{
++ // Should never occur for 420, which is all that sand supports
++ av_assert0(0);
++}
++
++static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++ FUNC(add_residual_dc_c)(_dst, stride, dc, 4);
++}
++
++static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++ FUNC(add_residual_dc_c)(_dst, stride, dc, 8);
++}
++
++static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++ FUNC(add_residual_dc_c)(_dst, stride, dc, 16);
++}
++
++static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++ // Should never occur for 420, which is all that sand supports
++ av_assert0(0);
++}
++
++
++static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
++{
++ int16_t *coeffs = (int16_t *) _coeffs;
++ int x, y;
++ int size = 1 << log2_size;
++
++ if (mode) {
++ coeffs += size;
++ for (y = 0; y < size - 1; y++) {
++ for (x = 0; x < size; x++)
++ coeffs[x] += coeffs[x - size];
++ coeffs += size;
++ }
++ } else {
++ for (y = 0; y < size; y++) {
++ for (x = 1; x < size; x++)
++ coeffs[x] += coeffs[x - 1];
++ coeffs += size;
++ }
++ }
++}
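++
++// RDPCM undoes the DPCM applied to transform-skipped residuals: each
++// sample is reconstructed by adding its predecessor in the scan
++// direction. mode != 0 accumulates down columns, mode == 0 along
++// rows, so a row of residuals {1, 2, 3, 4} becomes {1, 3, 6, 10}.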
++
++static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size)
++{
++ int shift = 15 - BIT_DEPTH - log2_size;
++ int x, y;
++ int size = 1 << log2_size;
++
++ if (shift > 0) {
++ int offset = 1 << (shift - 1);
++ for (y = 0; y < size; y++) {
++ for (x = 0; x < size; x++) {
++ *coeffs = (*coeffs + offset) >> shift;
++ coeffs++;
++ }
++ }
++ } else {
++ for (y = 0; y < size; y++) {
++ for (x = 0; x < size; x++) {
++ *coeffs = *coeffs << -shift;
++ coeffs++;
++ }
++ }
++ }
++}
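++
++// Example: at 8-bit depth with log2_size 2 (4x4), shift = 15 - 8 - 2
++// = 5, so each level is rescaled as (level + 16) >> 5; for 32x32 at
++// 12-bit, shift = 15 - 12 - 5 = -2 and levels are scaled up by << 2
++// instead.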
++
++#define SET(dst, x) (dst) = (x)
++#define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift)
++
++#define TR_4x4_LUMA(dst, src, step, assign) \
++ do { \
++ int c0 = src[0 * step] + src[2 * step]; \
++ int c1 = src[2 * step] + src[3 * step]; \
++ int c2 = src[0 * step] - src[3 * step]; \
++ int c3 = 74 * src[1 * step]; \
++ \
++ assign(dst[2 * step], 74 * (src[0 * step] - \
++ src[2 * step] + \
++ src[3 * step])); \
++ assign(dst[0 * step], 29 * c0 + 55 * c1 + c3); \
++ assign(dst[1 * step], 55 * c2 - 29 * c1 + c3); \
++ assign(dst[3 * step], 55 * c0 + 29 * c2 - c3); \
++ } while (0)
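++
++// This is the 4-point inverse DST used for 4x4 intra luma. Expanding
++// the first assign: dst[0] = 29*(s0 + s2) + 55*(s2 + s3) + 74*s1
++// = 29*s0 + 74*s1 + 84*s2 + 55*s3,
++// which matches the HEVC DST basis row {29, 74, 84, 55}.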
++
++static void FUNC(transform_4x4_luma)(int16_t *coeffs)
++{
++ int i;
++ int shift = 7;
++ int add = 1 << (shift - 1);
++ int16_t *src = coeffs;
++
++ for (i = 0; i < 4; i++) {
++ TR_4x4_LUMA(src, src, 4, SCALE);
++ src++;
++ }
++
++ shift = 20 - BIT_DEPTH;
++ add = 1 << (shift - 1);
++ for (i = 0; i < 4; i++) {
++ TR_4x4_LUMA(coeffs, coeffs, 1, SCALE);
++ coeffs += 4;
++ }
++}
++
++#undef TR_4x4_LUMA
++
++#define TR_4(dst, src, dstep, sstep, assign, end) \
++ do { \
++ const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep]; \
++ const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep]; \
++ const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep]; \
++ const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep]; \
++ \
++ assign(dst[0 * dstep], e0 + o0); \
++ assign(dst[1 * dstep], e1 + o1); \
++ assign(dst[2 * dstep], e1 - o1); \
++ assign(dst[3 * dstep], e0 - o0); \
++ } while (0)
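++
++// Partial butterfly for the 4-point inverse DCT: the even part e0/e1
++// uses the {64, 64} rows and the odd part o0/o1 the {83, 36} rows, so
++// e.g. dst[0] = 64*s0 + 83*s1 + 64*s2 + 36*s3. The larger TR_N
++// macros below build each size recursively from the half-size even
++// part plus an explicitly summed odd part.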
++
++#define TR_8(dst, src, dstep, sstep, assign, end) \
++ do { \
++ int i, j; \
++ int e_8[4]; \
++ int o_8[4] = { 0 }; \
++ for (i = 0; i < 4; i++) \
++ for (j = 1; j < end; j += 2) \
++ o_8[i] += transform[4 * j][i] * src[j * sstep]; \
++ TR_4(e_8, src, 1, 2 * sstep, SET, 4); \
++ \
++ for (i = 0; i < 4; i++) { \
++ assign(dst[i * dstep], e_8[i] + o_8[i]); \
++ assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]); \
++ } \
++ } while (0)
++
++#define TR_16(dst, src, dstep, sstep, assign, end) \
++ do { \
++ int i, j; \
++ int e_16[8]; \
++ int o_16[8] = { 0 }; \
++ for (i = 0; i < 8; i++) \
++ for (j = 1; j < end; j += 2) \
++ o_16[i] += transform[2 * j][i] * src[j * sstep]; \
++ TR_8(e_16, src, 1, 2 * sstep, SET, 8); \
++ \
++ for (i = 0; i < 8; i++) { \
++ assign(dst[i * dstep], e_16[i] + o_16[i]); \
++ assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]); \
++ } \
++ } while (0)
++
++#define TR_32(dst, src, dstep, sstep, assign, end) \
++ do { \
++ int i, j; \
++ int e_32[16]; \
++ int o_32[16] = { 0 }; \
++ for (i = 0; i < 16; i++) \
++ for (j = 1; j < end; j += 2) \
++ o_32[i] += transform[j][i] * src[j * sstep]; \
++ TR_16(e_32, src, 1, 2 * sstep, SET, end / 2); \
++ \
++ for (i = 0; i < 16; i++) { \
++ assign(dst[i * dstep], e_32[i] + o_32[i]); \
++ assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]); \
++ } \
++ } while (0)
++
++#define IDCT_VAR4(H) \
++ int limit2 = FFMIN(col_limit + 4, H)
++#define IDCT_VAR8(H) \
++ int limit = FFMIN(col_limit, H); \
++ int limit2 = FFMIN(col_limit + 4, H)
++#define IDCT_VAR16(H) IDCT_VAR8(H)
++#define IDCT_VAR32(H) IDCT_VAR8(H)
++
++#define IDCT(H) \
++static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs, \
++ int col_limit) \
++{ \
++ int i; \
++ int shift = 7; \
++ int add = 1 << (shift - 1); \
++ int16_t *src = coeffs; \
++ IDCT_VAR ## H(H); \
++ \
++ for (i = 0; i < H; i++) { \
++ TR_ ## H(src, src, H, H, SCALE, limit2); \
++ if (limit2 < H && i%4 == 0 && !!i) \
++ limit2 -= 4; \
++ src++; \
++ } \
++ \
++ shift = 20 - BIT_DEPTH; \
++ add = 1 << (shift - 1); \
++ for (i = 0; i < H; i++) { \
++ TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit); \
++ coeffs += H; \
++ } \
++}
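++
++// col_limit bounds the region of potentially non-zero coefficients so
++// the TR_N odd-part loops (via their "end" argument) can skip
++// multiplies by known-zero values. The first (vertical) pass also
++// shrinks limit2 every 4 columns once past the start, as significant
++// coefficients cluster towards the top-left of the block.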
++
++#define IDCT_DC(H) \
++static void FUNC(idct_ ## H ## x ## H ## _dc)(int16_t *coeffs) \
++{ \
++ int i, j; \
++ int shift = 14 - BIT_DEPTH; \
++ int add = 1 << (shift - 1); \
++ int coeff = (((coeffs[0] + 1) >> 1) + add) >> shift; \
++ \
++ for (j = 0; j < H; j++) { \
++ for (i = 0; i < H; i++) { \
++ coeffs[i + j * H] = coeff; \
++ } \
++ } \
++}
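++
++// DC-only inverse transform: every output sample takes the same
++// value. E.g. 8-bit with coeffs[0] = 64: shift = 14 - 8 = 6, so
++// coeff = (((64 + 1) >> 1) + 32) >> 6 = 1 fills the whole block.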
++
++IDCT( 4)
++IDCT( 8)
++IDCT(16)
++IDCT(32)
++
++IDCT_DC( 4)
++IDCT_DC( 8)
++IDCT_DC(16)
++IDCT_DC(32)
++
++#undef TR_4
++#undef TR_8
++#undef TR_16
++#undef TR_32
++
++#undef SET
++#undef SCALE
++
++static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class,
++ int width, int height)
++{
++ pixel *dst = (pixel *)_dst;
++ pixel *src = (pixel *)_src;
++ int offset_table[32] = { 0 };
++ int k, y, x;
++ int shift = BIT_DEPTH - 5;
++
++ stride_dst /= sizeof(pixel);
++ stride_src /= sizeof(pixel);
++
++ for (k = 0; k < 4; k++)
++ offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
++ dst += stride_dst;
++ src += stride_src;
++ }
++}
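++
++// Band offset: each pixel is classified into one of 32 equal bands by
++// its top 5 bits (shift = BIT_DEPTH - 5); only the 4 consecutive
++// bands starting at sao_left_class carry offsets. E.g. an 8-bit
++// pixel of 100 falls in band 100 >> 3 = 12, so with sao_left_class 10
++// it picks up sao_offset_val[3].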
++
++#define CMP(a, b) (((a) > (b)) - ((a) < (b)))
++
++static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
++ int eo, int width, int height) {
++
++ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
++ static const int8_t pos[4][2][2] = {
++ { { -1, 0 }, { 1, 0 } }, // horizontal
++ { { 0, -1 }, { 0, 1 } }, // vertical
++ { { -1, -1 }, { 1, 1 } }, // 45 degree
++ { { 1, -1 }, { -1, 1 } }, // 135 degree
++ };
++ pixel *dst = (pixel *)_dst;
++ pixel *src = (pixel *)_src;
++ int a_stride, b_stride;
++ int x, y;
++ ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel);
++ stride_dst /= sizeof(pixel);
++
++ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
++ b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++) {
++ int diff0 = CMP(src[x], src[x + a_stride]);
++ int diff1 = CMP(src[x], src[x + b_stride]);
++ int offset_val = edge_idx[2 + diff0 + diff1];
++ dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]);
++ }
++ src += stride_src;
++ dst += stride_dst;
++ }
++}
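++
++// Edge offset: each pixel is compared with its two neighbours along
++// the eo direction; diff0 + diff1 lies in [-2, 2] and edge_idx maps
++// that to a category (local minimum, edges, local maximum, or 0 =
++// flat, no offset). E.g. a pixel below both neighbours sums to -2
++// and takes edge_idx[0] = 1, the local-minimum offset.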
++
++
++#if BIT_DEPTH == 10
++// We need a 32 bit variation for the _c restores so hijack bit depth 10
++#undef pixel
++#undef BIT_DEPTH
++#define pixel uint32_t
++#define BIT_DEPTH 32
++// All 16 bit variations are the same
++#define sao_edge_restore_0_10 sao_edge_restore_0_9
++#define sao_edge_restore_1_10 sao_edge_restore_1_9
++#define sao_edge_restore_0_11 sao_edge_restore_0_9
++#define sao_edge_restore_1_11 sao_edge_restore_1_9
++#define sao_edge_restore_0_12 sao_edge_restore_0_9
++#define sao_edge_restore_1_12 sao_edge_restore_1_9
++#define sao_edge_restore_0_13 sao_edge_restore_0_9
++#define sao_edge_restore_1_13 sao_edge_restore_1_9
++#define sao_edge_restore_0_14 sao_edge_restore_0_9
++#define sao_edge_restore_1_14 sao_edge_restore_1_9
++#define sao_edge_restore_0_15 sao_edge_restore_0_9
++#define sao_edge_restore_1_15 sao_edge_restore_1_9
++#define sao_edge_restore_0_16 sao_edge_restore_0_9
++#define sao_edge_restore_1_16 sao_edge_restore_1_9
+#endif
-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-new file mode 100644
-index 0000000000..82bf380eb4
---- /dev/null
-+++ b/libavcodec/rpi_shader.h
-@@ -0,0 +1,63 @@
-+#ifndef rpi_shader_H
-+#define rpi_shader_H
-+
-+extern unsigned int rpi_shader[];
-+
-+#define mc_setup_c_q0 (rpi_shader + 0)
-+#define mc_start (rpi_shader + 0)
-+#define mc_setup_c_qn (rpi_shader + 2)
-+#define mc_filter_c_p (rpi_shader + 142)
-+#define mc_filter_c_p_l1 (rpi_shader + 272)
-+#define mc_filter_c_b (rpi_shader + 402)
-+#define mc_sync_q0 (rpi_shader + 590)
-+#define mc_sync_q1 (rpi_shader + 608)
-+#define mc_sync_q2 (rpi_shader + 620)
-+#define mc_sync_q3 (rpi_shader + 632)
-+#define mc_sync_q4 (rpi_shader + 644)
-+#define mc_sync_q5 (rpi_shader + 662)
-+#define mc_sync_q6 (rpi_shader + 674)
-+#define mc_sync_q7 (rpi_shader + 686)
-+#define mc_sync_q8 (rpi_shader + 698)
-+#define mc_sync_q9 (rpi_shader + 716)
-+#define mc_sync_q10 (rpi_shader + 728)
-+#define mc_sync_q11 (rpi_shader + 740)
-+#define mc_exit_c_qn (rpi_shader + 752)
-+#define mc_exit_y_qn (rpi_shader + 752)
-+#define mc_exit_c_q0 (rpi_shader + 770)
-+#define mc_exit_y_q0 (rpi_shader + 770)
-+#define mc_setup_y_q0 (rpi_shader + 790)
-+#define mc_setup_y_qn (rpi_shader + 792)
-+#define mc_filter_y_pxx (rpi_shader + 1032)
-+#define mc_filter_y_bxx (rpi_shader + 1162)
-+#define mc_filter_y_p00 (rpi_shader + 1292)
-+#define mc_filter_y_b00 (rpi_shader + 1382)
-+#define mc_setup_c10_q0 (rpi_shader + 1462)
-+#define mc_setup_c10_qn (rpi_shader + 1464)
-+#define mc_filter_c10_p (rpi_shader + 1600)
-+#define mc_filter_c10_p_l1 (rpi_shader + 1728)
-+#define mc_filter_c10_b (rpi_shader + 1856)
-+#define mc_sync10_q0 (rpi_shader + 2042)
-+#define mc_sync10_q1 (rpi_shader + 2060)
-+#define mc_sync10_q2 (rpi_shader + 2072)
-+#define mc_sync10_q3 (rpi_shader + 2084)
-+#define mc_sync10_q4 (rpi_shader + 2096)
-+#define mc_sync10_q5 (rpi_shader + 2114)
-+#define mc_sync10_q6 (rpi_shader + 2126)
-+#define mc_sync10_q7 (rpi_shader + 2138)
-+#define mc_sync10_q8 (rpi_shader + 2150)
-+#define mc_sync10_q9 (rpi_shader + 2168)
-+#define mc_sync10_q10 (rpi_shader + 2180)
-+#define mc_sync10_q11 (rpi_shader + 2192)
-+#define mc_exit_c10_q0 (rpi_shader + 2204)
-+#define mc_exit_y10_q0 (rpi_shader + 2204)
-+#define mc_exit_c10_qn (rpi_shader + 2224)
-+#define mc_exit_y10_qn (rpi_shader + 2224)
-+#define mc_setup_y10_q0 (rpi_shader + 2242)
-+#define mc_setup_y10_qn (rpi_shader + 2244)
-+#define mc_filter_y10_pxx (rpi_shader + 2494)
-+#define mc_filter_y10_p00 (rpi_shader + 2624)
-+#define mc_filter_y10_bxx (rpi_shader + 2716)
-+#define mc_filter_y10_b00 (rpi_shader + 2846)
-+#define mc_end (rpi_shader + 2926)
++#if BIT_DEPTH <= 9 || BIT_DEPTH == 32
++static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
++ int *borders, int _width, int _height,
++ int c_idx, uint8_t *vert_edge,
++ uint8_t *horiz_edge, uint8_t *diag_edge)
++{
++ int x, y;
++ pixel *dst = (pixel *)_dst;
++ pixel *src = (pixel *)_src;
++ int sao_eo_class = sao->eo_class[c_idx];
++ int init_x = 0, width = _width, height = _height;
++
++ stride_dst /= sizeof(pixel);
++ stride_src /= sizeof(pixel);
++
++ if (sao_eo_class != SAO_EO_VERT) {
++ if (borders[0]) {
++ for (y = 0; y < height; y++) {
++ dst[y * stride_dst] = src[y * stride_src];
++ }
++ init_x = 1;
++ }
++ if (borders[2]) {
++ int offset = width - 1;
++ for (x = 0; x < height; x++) {
++ dst[x * stride_dst + offset] = src[x * stride_src + offset];
++ }
++ width--;
++ }
++ }
++ if (sao_eo_class != SAO_EO_HORIZ) {
++ if (borders[1]) {
++ for (x = init_x; x < width; x++)
++ dst[x] = src[x];
++ }
++ if (borders[3]) {
++ ptrdiff_t y_stride_dst = stride_dst * (height - 1);
++ ptrdiff_t y_stride_src = stride_src * (height - 1);
++ for (x = init_x; x < width; x++)
++ dst[x + y_stride_dst] = src[x + y_stride_src];
++ height--;
++ }
++ }
++}
++
++static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
++ int *borders, int _width, int _height,
++ int c_idx, uint8_t *vert_edge,
++ uint8_t *horiz_edge, uint8_t *diag_edge)
++{
++ int x, y;
++ pixel *dst = (pixel *)_dst;
++ pixel *src = (pixel *)_src;
++ int sao_eo_class = sao->eo_class[c_idx];
++ int init_x = 0, init_y = 0, width = _width, height = _height;
++
++ stride_dst /= sizeof(pixel);
++ stride_src /= sizeof(pixel);
++
++ if (sao_eo_class != SAO_EO_VERT) {
++ if (borders[0]) {
++ for (y = 0; y < height; y++) {
++ dst[y * stride_dst] = src[y * stride_src];
++ }
++ init_x = 1;
++ }
++ if (borders[2]) {
++ int offset = width - 1;
++ for (x = 0; x < height; x++) {
++ dst[x * stride_dst + offset] = src[x * stride_src + offset];
++ }
++ width--;
++ }
++ }
++ if (sao_eo_class != SAO_EO_HORIZ) {
++ if (borders[1]) {
++ for (x = init_x; x < width; x++)
++ dst[x] = src[x];
++ init_y = 1;
++ }
++ if (borders[3]) {
++ ptrdiff_t y_stride_dst = stride_dst * (height - 1);
++ ptrdiff_t y_stride_src = stride_src * (height - 1);
++ for (x = init_x; x < width; x++)
++ dst[x + y_stride_dst] = src[x + y_stride_src];
++ height--;
++ }
++ }
++
++ {
++ int save_upper_left = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
++ int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D && !borders[1] && !borders[2];
++ int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3];
++ int save_lower_left = !diag_edge[3] && sao_eo_class == SAO_EO_45D && !borders[0] && !borders[3];
++
++ // Restore pixels that can't be modified
++ if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) {
++ for(y = init_y+save_upper_left; y< height-save_lower_left; y++)
++ dst[y*stride_dst] = src[y*stride_src];
++ }
++ if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) {
++ for(y = init_y+save_upper_right; y< height-save_lower_right; y++)
++ dst[y*stride_dst+width-1] = src[y*stride_src+width-1];
++ }
++
++ if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) {
++ for(x = init_x+save_upper_left; x < width-save_upper_right; x++)
++ dst[x] = src[x];
++ }
++ if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) {
++ for(x = init_x+save_lower_left; x < width-save_lower_right; x++)
++ dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x];
++ }
++ if(diag_edge[0] && sao_eo_class == SAO_EO_135D)
++ dst[0] = src[0];
++ if(diag_edge[1] && sao_eo_class == SAO_EO_45D)
++ dst[width-1] = src[width-1];
++ if(diag_edge[2] && sao_eo_class == SAO_EO_135D)
++ dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1];
++ if(diag_edge[3] && sao_eo_class == SAO_EO_45D)
++ dst[stride_dst*(height-1)] = src[stride_src*(height-1)];
++
++ }
++}
++#endif
++#if BIT_DEPTH == 32
++#undef BIT_DEPTH
++#undef pixel
++#define BIT_DEPTH 10
++#define pixel uint16_t
++#endif
++
++// --- Plaited chroma versions
++
++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ const int16_t *sao_offset_val_u, int sao_left_class_u,
++ const int16_t *sao_offset_val_v, int sao_left_class_v,
++ int width, int height)
++{
++ pixel *dst = (pixel *)_dst;
++ pixel *src = (pixel *)_src;
++ int offset_table_u[32] = { 0 };
++ int offset_table_v[32] = { 0 };
++ int k, y, x;
++ int shift = BIT_DEPTH - 5;
++
++ stride_dst /= sizeof(pixel);
++ stride_src /= sizeof(pixel);
++ width *= 2;
++
++ for (k = 0; k < 4; k++)
++ {
++ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1];
++ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1];
++ }
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x += 2)
++ {
++// printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift);
++// printf("offsets=%x,%x\n", src[x + 0], src[x + 1]);
++ // *** & 31 shouldn't be wanted but just now we generate broken input that
++ // crashes us in 10-bit world
++ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]);
++ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]);
++ }
++ dst += stride_dst;
++ src += stride_src;
++ }
++}
++
++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v,
++ int eo, int width, int height) {
+
-+#endif
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-new file mode 100644
-index 0000000000..ba6cc13a95
---- /dev/null
-+++ b/libavcodec/rpi_shader.qasm
-@@ -0,0 +1,1741 @@
++ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
++ static const int8_t pos[4][2][2] = {
++ { { -1, 0 }, { 1, 0 } }, // horizontal
++ { { 0, -1 }, { 0, 1 } }, // vertical
++ { { -1, -1 }, { 1, 1 } }, // 45 degree
++ { { 1, -1 }, { -1, 1 } }, // 135 degree
++ };
++ pixel *dst = (pixel *)_dst;
++ pixel *src = (pixel *)_src;
++ int a_stride, b_stride;
++ int x, y;
++ ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel);
+
-+# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
-+# the warning that we are using rotation & ra/rb registers. r0..3 can be
-+# rotated through all 16 elems; ra regs can only be rotated through their
-+# local 4. As it happens this is what is wanted here as we do not want the
-+# constants from the other half of the calc.
++ stride_dst /= sizeof(pixel);
++ width *= 2;
+
-+# PREREAD is the number of requests that we have sitting in the TMU request
-+# queue.
-+#
-+# There are 8 slots availible in the TMU request Q for tm0s requests, but
-+# only 4 output FIFO entries and overflow is bad (corruption or crash)
-+# (If threaded then only 2 out FIFO entries, but we aren't.)
-+# In s/w we are effectively limited to the min vertical read which is >= 4
-+# so output FIFO is the limit.
-+#
-+# However in the current world there seems to be no benefit (and a small
-+# overhead) in setting this bigger than 2.
++ av_assert0(width <= 64);
+
-+.set PREREAD, 4
++ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src;
++ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src;
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x += 2) {
++ int diff0u = CMP(src[x], src[x + a_stride]);
++ int diff1u = CMP(src[x], src[x + b_stride]);
++ int offset_valu = edge_idx[2 + diff0u + diff1u];
++ int diff0v = CMP(src[x+1], src[x+1 + a_stride]);
++ int diff1v = CMP(src[x+1], src[x+1 + b_stride]);
++ int offset_valv = edge_idx[2 + diff0v + diff1v];
++ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]);
++ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]);
++ }
++ src += stride_src;
++ dst += stride_dst;
++ }
++}
+
-+# Block heights - 8 & 16 are the only numbers we currently support
++// Do once
++#if BIT_DEPTH == 8
++// Any old 2 byte 'normal' restore will work for these
++#define sao_edge_restore_c_0_8 sao_edge_restore_0_16
++#define sao_edge_restore_c_1_8 sao_edge_restore_1_16
++// We need 32 bit for 9 bit+
++#define sao_edge_restore_c_0_9 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_9 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_10 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_10 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_11 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_11 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_12 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_12 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_13 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_13 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_14 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_14 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_15 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_15 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_16 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_16 sao_edge_restore_1_32
++#endif
+
-+.set C_BLK_HEIGHT_8, 16
-+.set C_BLK_HEIGHT_16, 8
-+.set Y_BLK_HEIGHT_8, 16
-+.set Y_BLK_HEIGHT_16, 8
++#undef CMP
+
-+# QPU counts - depend on block size
-+# If we have a 2-byte format & block_size > 8 then can only afford
-+# 8 QPUs
-+# These numbers must match the numbers in rpi_shader_cmd.h
++////////////////////////////////////////////////////////////////////////////////
++//
++////////////////////////////////////////////////////////////////////////////////
++static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+
-+.set N_QPU_8, 12
-+.set N_QPU_16, 12
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = src[x] << (14 - BIT_DEPTH);
++ src += srcstride;
++ dst += MAX_PB_SIZE;
++ }
++}
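++
++// All the put_hevc_* helpers below produce 14-bit intermediates so a
++// common precision is kept across bit depths: an 8-bit source pixel
++// is scaled by << 6, a 10-bit one by << 4. The uni/bi variants then
++// shift back down (with rounding) when writing pixels.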
+
-+# register allocation
-+#
++static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
+
-+# ra0-3
-+# Used as temp and may be loop filter coeffs (split into .8s)
-+# or temp in loop. Check usage on an individual basis.
++ for (y = 0; y < height; y++) {
++ memcpy(dst, src, width * sizeof(pixel));
++ src += srcstride;
++ dst += dststride;
++ }
++}
+
-+# ra4-7
-+# C: L0 H filter out FIFO
-+# otherwise -- free --
++static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++
++ int shift = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# ra8-11
-+# temp in some places - check usage
-+# Y: (with rb8-11) horiz out FIFO
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift);
++ src += srcstride;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
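++
++// Bi-pred combine: the local 14-bit sample and the precomputed src2
++// plane are summed and rounded back with shift = 15 - BIT_DEPTH.
++// E.g. 8-bit: pixel 100 -> 6400, src2[x] = 6400, offset = 64, giving
++// (6400 + 6400 + 64) >> 7 = 100, as expected for equal predictions.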
+
-+# ra12-15
-+# -- free --
++static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# uniform: width:height
-+.set ra_width_height, ra16
-+.set ra_width, ra16.16b
-+.set ra_height, ra16.16a
++ ox = ox * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox);
++ src += srcstride;
++ dst += dststride;
++ }
++}
+
-+# y:y2 same layout as y_y2_next so we can update both together
-+.set ra_y_y2, ra17
-+.set ra_y2, ra17.16a
-+.set ra_y, ra17.16b
++static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, int denom, int wx0, int wx1,
++ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
+
-+# uniform: L1 weight (U on left, V on right)
-+# Only used in Y B
-+.set ra_wt_off_mul_l1, ra18
-+.set ra_wt_off_l1, ra18.16b
-+.set ra_wt_mul_l1, ra18.16a
++ int shift = 14 + 1 - BIT_DEPTH;
++ int log2Wd = denom + shift - 1;
+
-+# y_next:y2_next same layout as y_y2 so we can update both together
-+.set ra_y_y2_next, ra19
-+.set ra_y_next, ra19.16b
-+.set ra_y2_next, ra19.16a
++ ox0 = ox0 * (1 << (BIT_DEPTH - 8));
++ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++) {
++ dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1));
++ }
++ src += srcstride;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
+
-+# Setup: consts - subdivide a single register
-+.set ra_kff100100, ra20
-+.set ra_k256, ra20.16a
-+.set ra_k0, ra20.8a
-+.set ra_k1, ra20.8b
-+.set ra_k16, ra20.8c
-+.set ra_k255, ra20.8d
++////////////////////////////////////////////////////////////////////////////////
++//
++////////////////////////////////////////////////////////////////////////////////
++#define QPEL_FILTER(src, stride) \
++ (filter[0] * src[x - 3 * stride] + \
++ filter[1] * src[x - 2 * stride] + \
++ filter[2] * src[x - stride] + \
++ filter[3] * src[x ] + \
++ filter[4] * src[x + stride] + \
++ filter[5] * src[x + 2 * stride] + \
++ filter[6] * src[x + 3 * stride] + \
++ filter[7] * src[x + 4 * stride])
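++
++// 8-tap luma interpolation: the tap set is chosen by the fractional
++// position (mx or my in 1..3 for 1/4-, 1/2- and 3/4-pel) via
++// ff_hevc_rpi_qpel_filters[mx - 1]. Taps span src[x-3]..src[x+4] and
++// sum to 64, so results carry 6 extra bits of precision.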
++
++static void FUNC(put_hevc_qpel_h)(int16_t *dst,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ dst += MAX_PB_SIZE;
++ }
++}
+
-+# Loop: xshifts
-+.set ra_xshift, ra21.16a
-+.set ra_xshift_next, ra21.16b
++static void FUNC(put_hevc_qpel_v)(int16_t *dst,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ dst += MAX_PB_SIZE;
++ }
++}
+
-+# Loop var: L0 weight (U on left, V on right)
-+# _off_ is not used in loop as we want to modify it before use
-+.set ra_wt_off_mul_l0, ra22
-+.set ra_wt_mul_l0, ra22.16a
-+.set ra_wt_off_l0, ra22.16b
++static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
++ uint8_t *_src,
++ ptrdiff_t _srcstride,
++ int height, intptr_t mx,
++ intptr_t my, int width)
++{
++ int x, y;
++ const int8_t *filter;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++
++ src -= QPEL_EXTRA_BEFORE * srcstride;
++ filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ for (y = 0; y < height + QPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
+
-+# Max pel value (for 8 bit we can get away with sat ops but not 9+)
-+# * Could merge with rb_pmask. For 10 bit Logically pmask needs 0xff in the
-+# 2nd byte but as the source should never be > 3 there 0x3ff should do
-+.set ra_blk_height_pmax, ra23
-+.set ra_pmax, ra23.16a
-+.set ra_blk_height, ra23.8c
-+# -- free -- ra23.8d
++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_qpel_filters[my - 1];
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
++ tmp += MAX_PB_SIZE;
++ dst += MAX_PB_SIZE;
++ }
++}
+
-+# Loop: src frame base (L0)
-+.set ra_base, ra24
++static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ int shift = 14 - BIT_DEPTH;
++
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# Loop: src frame base (L1)
-+.set ra_base2, ra25
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
++ src += srcstride;
++ dst += dststride;
++ }
++}
+
-+# Loop: next src frame base (L0)
-+.set ra_base_next, ra26
++static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
+
-+# -- free -- ra27
-+# -- free -- ra28
-+# -- free -- ra29
++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
+
-+# Use an even numbered register as a link register to avoid corrupting flags
-+.set ra_link, ra30
++ int shift = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# -- free -- ra31
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
++ src += srcstride;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
+
-+.set rb_xshift2, rb0
-+.set rb_xshift2_next, rb1
++static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
++ int shift = 14 - BIT_DEPTH;
++
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# C: (elem & 1) == 0 ? elem * 2 : (elem + 4) * 2
-+.set rb_elem_x, rb2
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
++ src += srcstride;
++ dst += dststride;
++ }
++}
+
-+# El Flags
-+# After adding to self we have el even/odd on nc/c and lo/hi on nn/n
-+.set rb_ef, rb3
+
-+# rb4-7
-+# C-B: L1 H filter out FIFO
-+# Y: (with ra2.8x) Y vertical filter coeffs
++static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
+
-+# rb8-11
-+# C: Vertical filter coeffs
-+# Y: (with ra8-11) horiz out FIFO
++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
+
-+# Loop var: offset to add before shift (round + weighting offsets)
-+# Exact value varies by loop
-+.set rb_wt_off, rb12
++ int shift = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# Setup: denom + 6 + 9
-+.set rb_wt_den_p15, rb13
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
++ src += srcstride;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
++
++static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ const int8_t *filter;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++ int shift = 14 - BIT_DEPTH;
++
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# -- free -- rb14
-+# -- free -- rb15
++ src -= QPEL_EXTRA_BEFORE * srcstride;
++ filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ for (y = 0; y < height + QPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
+
-+# Line pitch (128 for sand128)
-+.set rb_pitch, rb16
++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_qpel_filters[my - 1];
+
-+# Loop count - 2 (set up TMU for next xfer)
-+.set rb_i_tmu, rb17
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ }
++}
+
-+# Loop count for min(height, 16)
-+# Y will reset & loop again if height > 16
-+.set rb_lcount, rb18
++static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ const int8_t *filter;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++ int shift = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# frame_base2_next
-+.set rb_base2_next, rb19
++ src -= QPEL_EXTRA_BEFORE * srcstride;
++ filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ for (y = 0; y < height + QPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
+
-+# Setup: Height of Y+C in sand, (x&mask)*xpitch will give
-+# offset to the slice
-+.set rb_xpitch, rb20
++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_qpel_filters[my - 1];
+
-+# -- free -- rb21
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
+
-+# Setup: 0xff (8-bit) / 0xffff (9+ bit)
-+.set rb_pmask, rb22
++static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, int denom, int wx, int ox,
++ intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# Loop: destination address
-+.set rb_dest, rb23
++ ox = ox * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
++ src += srcstride;
++ dst += dststride;
++ }
++}
+
-+# vdw_setup_1(dst_pitch)
-+.set rb_dma1_base, rb24
++static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, int denom, int wx0, int wx1,
++ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
+
-+# Setup: pic width - 1
-+# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc.
-+.set rb_max_x, rb25
++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
+
-+# Loop: height<<23 + width<<16 + vdw_setup_0
-+.set rb_dma0, rb26
++ int shift = 14 + 1 - BIT_DEPTH;
++ int log2Wd = denom + shift - 1;
+
-+# vdw_setup_0 (depends on QPU number)
-+.set rb_dma0_base, rb27
++ ox0 = ox0 * (1 << (BIT_DEPTH - 8));
++ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
++ src += srcstride;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
+
-+# Setup: vw_setup value to reset VPM write pointer
-+.set rb_vpm_init, rb28
++static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, int denom, int wx, int ox,
++ intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
++ int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# Loop: vdw_setup_1(dst_pitch-width) = stride
-+.set rb_dma1, rb29
++ ox = ox * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
++ src += srcstride;
++ dst += dststride;
++ }
++}
+
-+# Setup: pic_height - 1
-+.set rb_max_y, rb30
++static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, int denom, int wx0, int wx1,
++ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
+
-+# -- free -- rb31
++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
+
++ int shift = 14 + 1 - BIT_DEPTH;
++ int log2Wd = denom + shift - 1;
+
++ ox0 = ox0 * (1 << (BIT_DEPTH - 8));
++ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
++ src += srcstride;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
+
++static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, int denom, int wx, int ox,
++ intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ const int8_t *filter;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++ int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
-+.set i_shift16, -16
-+.set i_shift21, -11
-+.set i_shift23, -9
-+.set i_shift30, -2
++ src -= QPEL_EXTRA_BEFORE * srcstride;
++ filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ for (y = 0; y < height + QPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
+
-+# Much of the setup code is common between Y & C
-+# Macros that express this - obviously these can't be overlapped
-+# so are probably unsuitable for loop code
++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_qpel_filters[my - 1];
+
-+.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma
-+ mov r2, qpu_num
-+.if v_bit_depth <= 8
-+ # 8 bit version
-+ asr r1, r2, 2
-+ shl r1, r1, 6
-+ and r0, r2, 3
-+ or r0, r0, r1
++ ox = ox * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ }
++}
+
-+ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-+ add r_vpm, r0, r1 # VPM 8bit storage
++static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, int denom, int wx0, int wx1,
++ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ const int8_t *filter;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++ int shift = 14 + 1 - BIT_DEPTH;
++ int log2Wd = denom + shift - 1;
++
++ src -= QPEL_EXTRA_BEFORE * srcstride;
++ filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ for (y = 0; y < height + QPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
+
-+ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-+ shl r0, r0, 5
++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_qpel_filters[my - 1];
+
-+.else
-+ # 16 bit version
-+ # Limited to 8 QPUs if blk height > 8
-+ asr r1, r2, 1
-+.if v_blk_height <= 8
-+ shl r1, r1, 4
-+.else
-+ shl r1, r1, 5
-+.endif
-+ and r0, r2, 1
-+ or r0, r0, r1
++ ox0 = ox0 * (1 << (BIT_DEPTH - 8));
++ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
+
-+ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR
-+ add r_vpm, r0, r1
++////////////////////////////////////////////////////////////////////////////////
++//
++////////////////////////////////////////////////////////////////////////////////
++#define EPEL_FILTER(src, stride) \
++ (filter[0] * src[x - stride] + \
++ filter[1] * src[x] + \
++ filter[2] * src[x + stride] + \
++ filter[3] * src[x + 2 * stride])
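++
++// 4-tap chroma interpolation: chroma MVs have 1/8-pel resolution, so
++// mx/my lie in 1..7 and select one of seven tap sets from
++// ff_hevc_rpi_epel_filters. Taps span src[x-1]..src[x+2] and, as
++// with the luma filter, sum to 64.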
+
-+ # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into
-+ # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg)
-+ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later
-+ shl r0, r0, 6
-+.endif
-+ add r_dma, r0, r1 # DMA out
-+.endm
++static void FUNC(put_hevc_epel_h)(int16_t *dst,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ dst += MAX_PB_SIZE;
++ }
++}
+
++static void FUNC(put_hevc_epel_v)(int16_t *dst,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
+
-+.macro m_setup_q0
-+ srel -, 12
-+.endm
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ dst += MAX_PB_SIZE;
++ }
++}
+
-+# Code start label
-+::mc_start
++static void FUNC(put_hevc_epel_hv)(int16_t *dst,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
+
-+################################################################################
-+# mc_setup_uv(next_kernel, x, y, ref_c_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
++ src -= EPEL_EXTRA_BEFORE * srcstride;
+
-+.macro m_setup_c, v_bit_depth
++ for (y = 0; y < height + EPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
+
-+# Cannot use mul24 on x as x might be -ve, so must use shift
-+.if v_bit_depth <= 8
-+.set v_x_shift, 1
-+.set v_pmask, 0xff
-+.set v_blk_height, C_BLK_HEIGHT_8
-+.else
-+.set v_x_shift, 2
-+.set v_pmask, 0xffff
-+.set v_blk_height, C_BLK_HEIGHT_16
-+.endif
++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_epel_filters[my - 1];
+
-+ mov tmurs, 1 # No swap TMUs
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
++ tmp += MAX_PB_SIZE;
++ dst += MAX_PB_SIZE;
++ }
++}
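The hv variants are separable: one horizontal pass into tmp_array, then a vertical pass over it. The `>> (BIT_DEPTH - 8)` in pass one keeps the intermediate at 14-bit scale so it fits int16_t; e.g. for a flat 10-bit source, where the gain-64 filter acts as a pure multiply:

    int sample = 0x3ff;                     // max 10-bit sample
    int pass1 = (sample * 64) >> (10 - 8);  // 16368: fits in int16_t (tmp_array)
    int pass2 = (pass1 * 64) >> 6;          // 16368: still at 14-bit scale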
+
-+# Load first request location
-+ mov ra0, unif # next_x_y
++static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++ int shift = 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
-+ shl rb_ef, r0, i_shift30
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
++ src += srcstride;
++ dst += dststride;
++ }
++}
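In the uni paths `shift = 14 - BIT_DEPTH` undoes the whole 14-bit intermediate scale in one step, with `offset` providing round-to-nearest. Worked through at 8 bits:

    int v = 100 * 64;                 // 14-bit intermediate for grey level 100
    int shift = 14 - 8;               // 6
    int offset = 1 << (shift - 1);    // 32
    int out = (v + offset) >> shift;  // (6400 + 32) >> 6 == 100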
+
-+ mov ra_base, unif # Store frame c base
++static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++ int shift = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# Read image dimensions
-+ sub r0, unif, 1 # pic c width
-+ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes
-+ sub rb_max_y, unif, 1 # pic c height
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++) {
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
++ }
++ dst += dststride;
++ src += srcstride;
++ src2 += MAX_PB_SIZE;
++ }
++}
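The bi paths add the second prediction (src2, already at 14-bit scale) and shift by one more bit, so the result is the rounded average of the two references. At 8 bits:

    int v = 100 * 64, s2 = 104 * 64;       // the two 14-bit predictions
    int shift = 14 + 1 - 8;                // 7
    int offset = 1 << (shift - 1);         // 64
    int out = (v + s2 + offset) >> shift;  // (6400 + 6656 + 64) >> 7 == 102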
+
-+# load constants
-+ mov ra_kff100100, 0xff100100
-+ mov rb_pmask, v_pmask
-+ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
++ int shift = 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# get source pitch
-+ mov rb_xpitch, unif # stride2
-+ mov rb_pitch, unif # stride1
-+ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly
-+ add rb_dma1_base, r1, rb_pitch # vdw_setup_1
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
++ src += srcstride;
++ dst += dststride;
++ }
++}
+
-+ and r0, 1, elem_num
-+ nop ; mul24 r0, r0, 5
-+.if v_bit_depth <= 8
-+ add rb_elem_x, r0, elem_num
-+.else
-+ add r0, r0, elem_num
-+ add rb_elem_x, r0, r0
-+.endif
++static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ int shift = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# Compute base address for first and second access
-+# ra_base ends up with t0s base
-+# ra_base2 ends up with t1s base
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
++ dst += dststride;
++ src += srcstride;
++ src2 += MAX_PB_SIZE;
++ }
++}
+
-+ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay]
-+ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice
-+ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y
-+ min r0, r0, rb_max_x
++static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++ int shift = 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# Get shift
-+# Shift will always calculate as 0 for 9+ bit
-+# Ideally we can optimize the shift out of the code in these cases but for now
-+# it is tidier to leave it in
-+.if v_bit_depth <= 8
-+ shl ra_xshift_next, r0, 3
-+.else
-+ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0
-+.endif
++ src -= EPEL_EXTRA_BEFORE * srcstride;
+
-+# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to
++ for (y = 0; y < height + EPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
+
-+.if v_bit_depth <= 8
-+ and r0, r0, -4
-+.endif
-+ sub r1, ra_k0, rb_pitch
-+ and r1, r0, r1
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1
-+ add ra_base, ra_base, r0
++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_epel_filters[my - 1];
+
-+ add rb_wt_den_p15, 23 - v_bit_depth, unif # denominator
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ }
++}
+
-+# Compute part of VPM to use for DMA output
-+# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop?
-+ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
++static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++ int shift = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# And again for L1, but only worrying about frame2 stuff
++ src -= EPEL_EXTRA_BEFORE * srcstride;
+
-+# Load first request location
-+ mov ra0, unif # next_x_y
++ for (y = 0; y < height + EPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
+
-+ mov ra_base2, unif # [ra0 delay] Store frame c base
++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_epel_filters[my - 1];
+
-+# Compute base address for first and second access
-+# ra_base ends up with t0s base
-+# ra_base2 ends up with t1s base
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
+
-+ shl r0, ra0.16b, v_x_shift
-+ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset
-+ max r0, r0, 0
-+ min r0, r0, rb_max_x
++static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++ int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# Get shift (already zero if 9+ bit so ignore)
-+.if v_bit_depth <= 8
-+ shl rb_xshift2_next, r0, 3
-+.endif
++ ox = ox * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++) {
++ dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
++ }
++ dst += dststride;
++ src += srcstride;
++ }
++}
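With explicit weighted prediction, `denom` scales the weight and `ox` is a DC offset applied after the shift. A worked instance at 8 bits with wx = 3, denom = 1 (an effective weight of 1.5):

    int v = 100 * 64;                // 14-bit intermediate
    int denom = 1, wx = 3, ox = 10;
    int shift = denom + 14 - 8;      // 7
    int offset = 1 << (shift - 1);   // 64
    int out = ((v * wx + offset) >> shift) + ox;  // 100 * 1.5 + 10 == 160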
+
-+# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs
++static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, int denom, int wx0, int wx1,
++ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++ int shift = 14 + 1 - BIT_DEPTH;
++ int log2Wd = denom + shift - 1;
++
++ ox0 = ox0 * (1 << (BIT_DEPTH - 8));
++ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
++ src += srcstride;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
+
-+.if v_bit_depth <= 8
-+ and r0, r0, -4
-+.endif
-+ sub r1, ra_k0, rb_pitch
-+ and r1, r0, r1
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 ; mov r2, ra_y2
-+ add ra_base2, ra_base2, r0
++static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
++ int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# Do preloads
-+# r0 = ra_y, r2 = ra_y2
-+ mov r3, PREREAD ; mov r0, ra_y
++ ox = ox * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++) {
++ dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
++ }
++ dst += dststride;
++ src += srcstride;
++ }
++}
+
-+:1
-+ sub.setf r3, r3, 1
-+ max r1, r0, 0
-+ min r1, r1, rb_max_y
-+ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
-+ add t0s, ra_base, r1 ; mov ra_y, r0
++static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, int denom, int wx0, int wx1,
++ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ int shift = 14 + 1 - BIT_DEPTH;
++ int log2Wd = denom + shift - 1;
++
++ ox0 = ox0 * (1 << (BIT_DEPTH - 8));
++ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
++ src += srcstride;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
+
-+ max r1, r2, 0
-+ brr.anynz -, r:1b
-+ min r1, r1, rb_max_y
-+ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
-+ add t1s, ra_base2, r1 ; mov ra_y2, r2
-+# >>> .anynz 1b
++static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++ int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+ mov ra_link, unif # link
-+# touch registers to keep simulator happy
-+ # ra/b4..7: B0 -> B stash registers
-+ mov ra4, 0 ; mov rb4, 0
-+ bra -, ra_link
-+ mov ra5, 0 ; mov rb5, 0
-+ mov ra6, 0 ; mov rb6, 0
-+ mov ra7, 0 ; mov rb7, 0
-+# >>> ra_link
-+.endm
++ src -= EPEL_EXTRA_BEFORE * srcstride;
+
-+::mc_setup_c_q0
-+ m_setup_q0
-+::mc_setup_c_qn
-+ m_setup_c 8
++ for (y = 0; y < height + EPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
+
-+################################################################################
++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_epel_filters[my - 1];
+
-+# mc_filter_uv(next_kernel, x, y, frame_c_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst)
++ ox = ox * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ }
++}
+
-+# At this point we have already issued two pairs of texture requests for the current block
-+# ra_x, ra_x16_base point to the current coordinates for this block
++static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, int denom, int wx0, int wx1,
++ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++ int shift = 14 + 1 - BIT_DEPTH;
++ int log2Wd = denom + shift - 1;
++
++ src -= EPEL_EXTRA_BEFORE * srcstride;
++
++ for (y = 0; y < height + EPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
+
-+.macro m_filter_c_p, v_tmu, v_bit_depth
++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_epel_filters[my - 1];
+
-+.if v_bit_depth <= 8
-+.set v_x_shift, 1
-+.set v_x_mul, 2
-+.set v_v_shift, 8
-+# Shifts to get width & height in the right place in rb_dma0
-+.set v_dma_h_shift, 7
-+.set v_dma_wh_shift, i_shift16
-+.else
-+.set v_x_shift, 2
-+.set v_x_mul, 4
-+.set v_v_shift, i_shift16
-+# Shifts to get width & height in the right place in rb_dma0
-+.set v_dma_h_shift, 8
-+.set v_dma_wh_shift, 15
-+.endif
++ ox0 = ox0 * (1 << (BIT_DEPTH - 8));
++ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
++ ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
+
-+.if v_tmu == 0
-+.set vrx_xshift, rb_xshift2 # b side more convenient
-+.set vrx_xshift_next, ra_xshift_next
-+.set vra_y_next, ra_y_next
-+.set vrx_base_next, ra_base_next
-+.set vra_y, ra_y
-+.set vra_base, ra_base
-+.set vr_txs, t0s
-+.else
-+.set vrx_xshift, ra_xshift # a side more convenient
-+.set vrx_xshift_next, rb_xshift2_next
-+.set vra_y_next, ra_y2_next
-+.set vrx_base_next, rb_base2_next
-+.set vra_y, ra_y2
-+.set vra_base, ra_base2
-+.set vr_txs, t1s
-+.endif
++// line zero
++#define P3 pix[-4 * xstride]
++#define P2 pix[-3 * xstride]
++#define P1 pix[-2 * xstride]
++#define P0 pix[-1 * xstride]
++#define Q0 pix[0 * xstride]
++#define Q1 pix[1 * xstride]
++#define Q2 pix[2 * xstride]
++#define Q3 pix[3 * xstride]
+
-+# per-channel shifts were calculated on the *previous* invocation
-+# get base addresses and per-channel shifts for *next* invocation
-+ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
++// line three. used only for deblocking decision
++#define TP3 pix[-4 * xstride + 3 * ystride]
++#define TP2 pix[-3 * xstride + 3 * ystride]
++#define TP1 pix[-2 * xstride + 3 * ystride]
++#define TP0 pix[-1 * xstride + 3 * ystride]
++#define TQ0 pix[0 * xstride + 3 * ystride]
++#define TQ1 pix[1 * xstride + 3 * ystride]
++#define TQ2 pix[2 * xstride + 3 * ystride]
++#define TQ3 pix[3 * xstride + 3 * ystride]
++
++static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
++ ptrdiff_t _xstride, ptrdiff_t _ystride,
++ int beta, int *_tc,
++ uint8_t *_no_p, uint8_t *_no_q)
++{
++ int d, j;
++ pixel *pix = (pixel *)_pix;
++ ptrdiff_t xstride = _xstride / sizeof(pixel);
++ ptrdiff_t ystride = _ystride / sizeof(pixel);
+
-+ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base
++ beta <<= BIT_DEPTH - 8;
+
-+ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0
-+ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
-+ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs
-+ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
-+ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++ for (j = 0; j < 2; j++) {
++ const int dp0 = abs(P2 - 2 * P1 + P0);
++ const int dq0 = abs(Q2 - 2 * Q1 + Q0);
++ const int dp3 = abs(TP2 - 2 * TP1 + TP0);
++ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0);
++ const int d0 = dp0 + dq0;
++ const int d3 = dp3 + dq3;
++ const int tc = _tc[j] << (BIT_DEPTH - 8);
++ const int no_p = _no_p[j];
++ const int no_q = _no_q[j];
+
-+.if v_bit_depth <= 8
-+ shl vrx_xshift_next, r0, 3
-+ and r0, r0, -4
-+.endif
-+ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs); x*2 already calculated
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs
-+ add vrx_base_next, r3, r0 ; mov r1, ra_height
++ if (d0 + d3 >= beta) {
++ pix += 4 * ystride;
++ continue;
++ } else {
++ const int beta_3 = beta >> 3;
++ const int beta_2 = beta >> 2;
++ const int tc25 = ((tc * 5 + 1) >> 1);
+
-+# set up VPM write
-+ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight
-+ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
-+ add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight
++ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 &&
++ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
++ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) {
++ // strong filtering
++ const int tc2 = tc << 1;
++ for (d = 0; d < 4; d++) {
++ const int p3 = P3;
++ const int p2 = P2;
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ const int q2 = Q2;
++ const int q3 = Q3;
++ if (!no_p) {
++ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
++ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
++ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
++ }
++ if (!no_q) {
++ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
++ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
++ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
++ }
++ pix += ystride;
++ }
++ } else { // normal filtering
++ int nd_p = 1;
++ int nd_q = 1;
++ const int tc_2 = tc >> 1;
++ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
++ nd_p = 2;
++ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
++ nd_q = 2;
+
-+# ; unpack filter coefficients
++ for (d = 0; d < 4; d++) {
++ const int p2 = P2;
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ const int q2 = Q2;
++ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
++ if (abs(delta0) < 10 * tc) {
++ delta0 = av_clip(delta0, -tc, tc);
++ if (!no_p)
++ P0 = av_clip_pixel(p0 + delta0);
++ if (!no_q)
++ Q0 = av_clip_pixel(q0 - delta0);
++ if (!no_p && nd_p > 1) {
++ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
++ P1 = av_clip_pixel(p1 + deltap1);
++ }
++ if (!no_q && nd_q > 1) {
++ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
++ Q1 = av_clip_pixel(q1 + deltaq1);
++ }
++ }
++ pix += ystride;
++ }
++ }
++ }
++ }
++}
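The gate at the top of each 4-line segment is HEVC's texture test: second differences on lines 0 and 3 measure how smooth each side of the edge is, and filtering proceeds only while their sum stays under beta. Restated as a self-contained sketch (the function and parameter names are illustrative):

    #include <stdlib.h>

    // pN[k]/qN[k]: the sample k pixels away from the edge on line N (0 or 3)
    static int edge_is_filtered(const int p0[3], const int q0[3],
                                const int p3[3], const int q3[3], int beta)
    {
        const int dp0 = abs(p0[2] - 2 * p0[1] + p0[0]);
        const int dq0 = abs(q0[2] - 2 * q0[1] + q0[0]);
        const int dp3 = abs(p3[2] - 2 * p3[1] + p3[0]);
        const int dq3 = abs(q3[2] - 2 * q3[1] + q3[0]);
        // Near-linear sides give small second differences; a large sum means
        // the step at the edge is probably real detail, so leave it alone.
        return (dp0 + dq0) + (dp3 + dq3) < beta;
    }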
+
-+ shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
-+ add r0, r0, r2 ; mov rb9, ra3.8b # Combine width and height of destination area (r0=h<<8, r2=w*2)
-+ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register
-+ add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight
++static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride,
++ ptrdiff_t _ystride, int *_tc,
++ uint8_t *_no_p, uint8_t *_no_q)
++{
++ int d, j, no_p, no_q;
++ pixel *pix = (pixel *)_pix;
++ ptrdiff_t xstride = _xstride / sizeof(pixel);
++ ptrdiff_t ystride = _ystride / sizeof(pixel);
+
-+ mov rb_dest, unif ; mov ra9, rb_max_y # dst_addr ; alias rb_max_y
++ for (j = 0; j < 2; j++) {
++ const int tc = _tc[j] << (BIT_DEPTH - 8);
++ if (tc <= 0) {
++ pix += 4 * ystride;
++ continue;
++ }
++ no_p = _no_p[j];
++ no_q = _no_q[j];
+
-+ shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
++ for (d = 0; d < 4; d++) {
++ int delta0;
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
++ if (!no_p)
++ P0 = av_clip_pixel(p0 + delta0);
++ if (!no_q)
++ Q0 = av_clip_pixel(q0 - delta0);
++ pix += ystride;
++ }
++ }
++}
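The chroma filter never touches more than P0/Q0, moving each by a tc-clipped delta. A worked instance with 8-bit samples:

    #include "libavutil/common.h"   // av_clip()

    int p1 = 60, p0 = 50, q0 = 70, q1 = 72, tc = 4;
    int delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
    // delta0 == 4 (9 before clipping), so P0 -> 54 and Q0 -> 66:
    // the 20-level step across the edge shrinks by 2 * delta0 == 8.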
+
-+ asr rb_wt_off, r1, 2 ; mov ra_link, unif # ; Link
-+ sub ra3, rb_wt_den_p15, ra_k1
++static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
++ int32_t *tc, uint8_t *no_p,
++ uint8_t *no_q)
++{
++ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q);
++}
+
-+# r5 = 0 (loop counter)
-+# ra9 = alias for rb_max_y
-+# ra_wt_mul_l0 = weight L0
-+# ra3 = weight denom + 22 - bit_depth [= rb_wt_den_p15 - 1, max 19]
-+# rb_wt_off = (offset * 2 + 1) << (ra3 - 1)
++static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
++ int32_t *tc, uint8_t *no_p,
++ uint8_t *no_q)
++{
++ FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q);
++}
+
-+# We want (r0r1)
-+# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2V5 : ...
-+# We fetch (after shift)
-+# C0 : C3 : C1 : C4 : C2 : C5 : ...
++static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
++ int beta, int32_t *tc, uint8_t *no_p,
++ uint8_t *no_q)
++{
++ FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
++ beta, tc, no_p, no_q);
++}
+
-+:1
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
++static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
++ int beta, int32_t *tc, uint8_t *no_p,
++ uint8_t *no_q)
++{
++ FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,
++ beta, tc, no_p, no_q);
++}
+
-+.if v_tmu == 0
-+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment
-+ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
-+ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
-+ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
-+.else
-+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 # loop counter increment
-+ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
-+ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
-+ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
-+.endif
++#undef P3
++#undef P2
++#undef P1
++#undef P0
++#undef Q0
++#undef Q1
++#undef Q2
++#undef Q3
+
-+ add vra_y, r3, ra_k1 ; mov r0, r1 << 15
-+ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
-+ min r3, r3, ra9 ; mov.ifnc r0, r2
++#undef TP3
++#undef TP2
++#undef TP1
++#undef TP0
++#undef TQ0
++#undef TQ1
++#undef TQ2
++#undef TQ3
+
-+ mov ra4, ra5 ; mul24 r2, r3, rb_pitch
-+ add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
++// line zero
++#define P3 pix_l[0 * xstride]
++#define P2 pix_l[1 * xstride]
++#define P1 pix_l[2 * xstride]
++#define P0 pix_l[3 * xstride]
++#define Q0 pix_r[0 * xstride]
++#define Q1 pix_r[1 * xstride]
++#define Q2 pix_r[2 * xstride]
++#define Q3 pix_r[3 * xstride]
+
-+# apply horizontal filter
-+# The filter coeffs for the two halves of this are the same (unlike in the
-+# Y case) so it doesn't matter which ra0 we get them from
-+# Also as the two halves are locked together we don't need to separate the 1st
-+# r0 mul or the last r1 mul as they are valid for all QPUs
++// line three. used only for deblocking decision
++#define TP3 pix_l[0 * xstride + 3 * ystride]
++#define TP2 pix_l[1 * xstride + 3 * ystride]
++#define TP1 pix_l[2 * xstride + 3 * ystride]
++#define TP0 pix_l[3 * xstride + 3 * ystride]
++#define TQ0 pix_r[0 * xstride + 3 * ystride]
++#define TQ1 pix_r[1 * xstride + 3 * ystride]
++#define TQ2 pix_r[2 * xstride + 3 * ystride]
++#define TQ3 pix_r[3 * xstride + 3 * ystride]
+
-+ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
-+ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
-+ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+ sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
++// This is identical to hevc_loop_filter_luma except that the P/Q
++// components are on separate pointers
++static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
++ unsigned int _stride, unsigned int beta, const int32_t _tc[2],
++ const uint8_t _no_p[2], const uint8_t _no_q[2],
++ uint8_t * _pix_l)
++{
++ int d, j;
++ pixel *pix_l = (pixel *)_pix_l;
++ pixel *pix_r = (pixel *)_pix_r;
++ const ptrdiff_t xstride = 1;
++ const ptrdiff_t ystride = _stride / sizeof(pixel);
+
-+# V filter = -ra4 * rb8 + ra5 * rb9 + ra6 * rb10 - ra7 * rb11 (post FIFO shift)
-+# Have to dup block as we need to move the brr - code is more common than it
-+# looks at first glance
-+.if v_bit_depth <= 8
-+ brr.anyn -, r:1b
-+ add r2, r2, r3 ; mov ra5, ra6
-+ mov ra6, ra7 ; mul24 r1, ra7, rb10
-+ sub ra7, r2, r0 ; mul24 r0, ra4, rb8
-+.else
-+ add r2, r2, r3 ; mov ra5, ra6
-+ brr.anyn -, r:1b
-+ mov ra6, ra7 ; mul24 r1, ra7, rb10
-+ sub r2, r2, r0 ; mul24 r0, ra4, rb8
-+ asr ra7, r2, v_bit_depth - 8
-+.endif
-+# >>> .anyn 1b
++ beta <<= BIT_DEPTH - 8;
+
-+ sub r1, r1, r0 ; mul24 r0, ra5, rb9 # [ra7 delay]
-+ add r1, r1, r0 ; mul24 r0, ra7, rb11
-+ sub r1, r1, r0
-+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
-+ asr r1, r1, 14
-+ nop ; mul24 r1, r1, ra_wt_mul_l0
-+ shl r1, r1, 8 ; mov r3, ra_blk_height
-+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
-+ brr.anyn -, r:1b
-+ asr r1, r1, ra3
-+ min r1, r1, ra_pmax ; mov -, vw_wait
-+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+# >>> .anyn 1b
++ for (j = 0; j < 2; j++) {
++ const int dp0 = abs(P2 - 2 * P1 + P0);
++ const int dq0 = abs(Q2 - 2 * Q1 + Q0);
++ const int dp3 = abs(TP2 - 2 * TP1 + TP0);
++ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0);
++ const int d0 = dp0 + dq0;
++ const int d3 = dp3 + dq3;
++ const int tc = _tc[j] << (BIT_DEPTH - 8);
++ const int no_p = _no_p[j];
++ const int no_q = _no_q[j];
+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height (currently always 16)
++ if (d0 + d3 >= beta) {
++ pix_l += 4 * ystride;
++ pix_r += 4 * ystride;
++ continue;
++ } else {
++ const int beta_3 = beta >> 3;
++ const int beta_2 = beta >> 2;
++ const int tc25 = ((tc * 5 + 1) >> 1);
+
-+# If looping again then we consumed 16 height last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc rb_dma0, rb_lcount based on new segment height
++ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 &&
++ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
++ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) {
++ // strong filtering
++ const int tc2 = tc << 1;
++ for (d = 0; d < 4; d++) {
++ const int p3 = P3;
++ const int p2 = P2;
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ const int q2 = Q2;
++ const int q3 = Q3;
++ if (!no_p) {
++ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
++ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
++ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
++ }
++ if (!no_q) {
++ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
++ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
++ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
++ }
++ pix_l += ystride;
++ pix_r += ystride;
++ }
++ } else { // normal filtering
++ int nd_p = 1;
++ int nd_q = 1;
++ const int tc_2 = tc >> 1;
++ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
++ nd_p = 2;
++ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
++ nd_q = 2;
++
++ for (d = 0; d < 4; d++) {
++ const int p2 = P2;
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ const int q2 = Q2;
++ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
++ if (abs(delta0) < 10 * tc) {
++ delta0 = av_clip(delta0, -tc, tc);
++ if (!no_p)
++ P0 = av_clip_pixel(p0 + delta0);
++ if (!no_q)
++ Q0 = av_clip_pixel(q0 - delta0);
++ if (!no_p && nd_p > 1) {
++ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
++ P1 = av_clip_pixel(p1 + deltap1);
++ }
++ if (!no_q && nd_q > 1) {
++ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
++ Q1 = av_clip_pixel(q1 + deltaq1);
++ }
++ }
++ pix_l += ystride;
++ pix_r += ystride;
++ }
++ }
++ }
++ }
++}
+
-+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++#undef TP3
++#undef TP2
++#undef TP1
++#undef TP0
++#undef TQ0
++#undef TQ1
++#undef TQ2
++#undef TQ3
+
-+# DMA out
-+ bra.anyz -, ra_link
-+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
-+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
-+ shl r1, r1, i_shift23
-+# >>> .anyz ra_link
++#undef P3
++#undef P2
++#undef P1
++#undef P0
++#undef Q0
++#undef Q1
++#undef Q2
++#undef Q3
+
-+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+ add rb_lcount, rb_lcount, r0
-+ brr -, r:1b
-+ add rb_dma0, rb_dma0, r1
-+ add rb_dest, rb_dest, r2
-+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
-+# >>> 1b
-+.endm
++#define P1 pix_l[0 * xstride]
++#define P0 pix_l[1 * xstride]
++#define Q0 pix_r[0 * xstride]
++#define Q1 pix_r[1 * xstride]
+
-+# At 10 bits
-+# Worst case +ve after 1st filter = 74 * 0x3ff >> 2 = 18925 0x49ed (15 bits)
-+# Worst case -ve after 1st filter = -10 * 0x3ff >> 2 = -10230
-+# after 2nd (really we can't get this) = 74 * 18925 + 10 * 10230 >> 6 = 23480 = 0x5bb8 (15 bits)
-+# (P)
-+# * weight (255) = 5987400 = 0x5b5c48 (23 bits)
-+# + 0x3ff << (13 - bit_depth + 7) = 0x6b5848 (23 bits)
-+# ... should be OK
-+#
-+# (B)
-+# *2 (L0+L1) = 5963920 = 0x5b0090 (23 bits)
-+# + (offset * 2 + 1) << (15 - bit_depth + 7) = 5963920 + (0x3ff << 12) = 5963920 + 4190208 = 10154128 = 0x9af090 (24 bits)
-+# So signed overflow if we sign extend here :-(
-+#
-+# In practice this doesn't happen (we need a maximal offset and a very unlucky
-+# filter).
-+#
-+# This could be fixed by offsetting the filters s.t. they are unsigned until
-+# weight mul and then removing the offset with the weighting offset (I think
-+# this should work) or splitting the rounding & offsetting
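The worst-case arithmetic in the comment above can be checked directly; this scalar sketch mirrors its numbers (all constants are taken from the comment, with 254 = 2 * 127 standing in for the doubled L0+L1 weight):

    #include <assert.h>

    static void check_10bit_ranges(void)
    {
        const int s1 = (74 * 0x3ff) >> 2;                   // 18925 == 0x49ed
        const int s2 = (74 * s1 + 10 * 10230) >> 6;         // 23480 == 0x5bb8
        const int p  = s2 * 255 + (0x3ff << (13 - 10 + 7)); // 0x6b5848
        const int b  = s2 * 254 + (0x3ff << (15 - 10 + 7)); // 0x9af090
        assert(s1 == 18925 && s2 == 23480);
        assert(p <  (1 << 23));  // (P) stays inside 23 bits
        assert(b >= (1 << 23));  // (B) spills into bit 24 - the overflow noted
    }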
++static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride,
++ ptrdiff_t _ystride, const int32_t *_tc,
++ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r)
++{
++ int d, j, no_p, no_q;
++ pixel *pix_l = (pixel *)_pix_l;
++ pixel *pix_r = (pixel *)_pix_r;
++ ptrdiff_t xstride = _xstride / sizeof(pixel);
++ ptrdiff_t ystride = _ystride / sizeof(pixel);
+
-+::mc_filter_c_p
-+ m_filter_c_p 0, 8
++ for (j = 0; j < 2; j++) {
++ const int tc = _tc[j] << (BIT_DEPTH - 8);
++ if (tc <= 0) {
++ pix_l += 4 * ystride;
++ pix_r += 4 * ystride;
++ continue;
++ }
++ no_p = _no_p[j];
++ no_q = _no_q[j];
+
-+::mc_filter_c_p_l1
-+ m_filter_c_p 1, 8
++ for (d = 0; d < 4; d++) {
++ int delta0;
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
++ if (!no_p)
++ P0 = av_clip_pixel(p0 + delta0);
++ if (!no_q)
++ Q0 = av_clip_pixel(q0 - delta0);
++ pix_l += ystride;
++ pix_r += ystride;
++ }
++ }
++}
+
-+################################################################################
++static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4,
++ unsigned int no_f)
++{
++ uint8_t no_p[2] = {no_f & 1, no_f & 2};
++ uint8_t no_q[2] = {no_f & 4, no_f & 8};
++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24};
++ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q);
++ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q);
++}
+
-+# mc_filter_c_b
++static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
++ uint8_t * src_l,
++ unsigned int no_f)
++{
++ uint8_t no_p[2] = {no_f & 1, no_f & 2};
++ uint8_t no_q[2] = {no_f & 4, no_f & 8};
++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24};
++ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r);
++ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel));
++}
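Both wrappers unpack four per-edge tc values from one uint32_t and the four skip flags from bits 0..3 of no_f. The caller-side packing they imply looks like this (a sketch; pack_tc4 is illustrative, not part of the patch):

    #include <stdint.h>

    static uint32_t pack_tc4(const int32_t tc[4])
    {
        return  (uint32_t)(tc[0] & 0xff)        |
               ((uint32_t)(tc[1] & 0xff) <<  8) |
               ((uint32_t)(tc[2] & 0xff) << 16) |
               ((uint32_t)(tc[3] & 0xff) << 24);
    }
    // no_f: bit 0 = no_p[0], bit 1 = no_p[1], bit 2 = no_q[0], bit 3 = no_q[1]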
+
-+# At this point we have already issued two pairs of texture requests for the current block
-+# ra_x, ra_x16_base point to the current coordinates for this block
++#undef P1
++#undef P0
++#undef Q0
++#undef Q1
+
-+.macro m_filter_c_b, v_bit_depth
+diff --git a/libavcodec/rpi_hevcpred.c b/libavcodec/rpi_hevcpred.c
+new file mode 100644
+index 0000000000..f6db76482d
+--- /dev/null
++++ b/libavcodec/rpi_hevcpred.c
+@@ -0,0 +1,122 @@
++/*
++ * HEVC video Decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
+
-+.if v_bit_depth <= 8
-+.set v_x_shift, 1
-+.set v_v_shift, 8
-+# Shifts to get width & height in the right place in rb_dma0
-+.set v_dma_h_shift, 7
-+.set v_dma_wh_shift, i_shift16
-+.else
-+.set v_x_shift, 2
-+.set v_v_shift, i_shift16
-+# Shifts to get width & height in the right place in rb_dma0
-+.set v_dma_h_shift, 8
-+.set v_dma_wh_shift, 15
-+.endif
-+.set v_x_mul, (1 << v_x_shift)
++#include "rpi_hevcdec.h"
+
-+# per-channel shifts were calculated on the *previous* invocation
++#include "rpi_hevcpred.h"
+
-+# get base addresses and per-channel shifts for *next* invocation
-+ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
++#define PRED_C 0
++#define BIT_DEPTH 8
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
+
-+ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base
++#define BIT_DEPTH 9
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
+
-+ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0
-+ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
-+ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
-+ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
-+ min r0, r0, rb_max_x ; mov ra0, unif # L0 H filter coeffs
++#define BIT_DEPTH 10
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
+
-+.if v_bit_depth <= 8
-+ shl ra_xshift_next, r0, 3
-+.endif
++#define BIT_DEPTH 12
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++#undef PRED_C
+
-+ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs
-+ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs)
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height
-+ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B
++#define PRED_C 1
++#define BIT_DEPTH 8
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
+
-+# set up VPM write
++#define BIT_DEPTH 9
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
+
-+ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight
-+ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
-+ add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif # ; V weight
++#define BIT_DEPTH 10
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
+
-+ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2
-+ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base
-+ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register
-+ add rb_dma0, r0, rb_dma0_base ; mov ra1, unif # ; H filter coeffs
++#define BIT_DEPTH 12
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++#undef PRED_C
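Each inclusion of rpi_hevcpred_template.c stamps out one full set of predictors whose names carry the bit depth (plus a _c suffix when PRED_C is set). The mechanism, reduced to a self-contained sketch with illustrative names:

    #define PASTE2(a, d) a ## _ ## d
    #define PASTE1(a, d) PASTE2(a, d)           // extra level so BIT_DEPTH expands
    #define DEPTH_FUNC(a) PASTE1(a, BIT_DEPTH)

    #define BIT_DEPTH 8
    static int DEPTH_FUNC(twice)(int x) { return 2 * x; }  // defines twice_8
    #undef BIT_DEPTH
    #define BIT_DEPTH 10
    static int DEPTH_FUNC(twice)(int x) { return 2 * x; }  // defines twice_10
    #undef BIT_DEPTH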
+
-+# L1 - uniform layout could possibly be optimized
++void ff_hevc_rpi_pred_init(HEVCPredContext *hpc, int bit_depth)
++{
++#undef FUNC
++#define FUNC(a, depth) a ## _ ## depth
+
-+ shl r0, ra3.16b, v_x_shift # r0=x*2
-+ add r0, r0, rb_elem_x ; mov ra3, unif # ; V filter coeffs
-+ sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif # [ra3 delay] r1=pitch2 mask ; U offset/weight
-+ max r0, r0, r5 ; mov rb8, ra3.8a # ; start unpacking filter coeffs
-+ min r0, r0, rb_max_x ; mov rb9, ra3.8b
++#undef FUNCC
++#define FUNCC(a, depth) a ## _ ## depth ## _c
+
-+.if v_bit_depth <= 8
-+ shl rb_xshift2_next, r0, 3
-+.endif
++#define HEVC_PRED_Y(depth) \
++ hpc->intra_pred[0] = FUNC(intra_pred_2, depth); \
++ hpc->intra_pred[1] = FUNC(intra_pred_3, depth); \
++ hpc->intra_pred[2] = FUNC(intra_pred_4, depth); \
++ hpc->intra_pred[3] = FUNC(intra_pred_5, depth); \
++ hpc->pred_planar[0] = FUNC(pred_planar_0, depth); \
++ hpc->pred_planar[1] = FUNC(pred_planar_1, depth); \
++ hpc->pred_planar[2] = FUNC(pred_planar_2, depth); \
++ hpc->pred_planar[3] = FUNC(pred_planar_3, depth); \
++ hpc->pred_dc = FUNC(pred_dc, depth); \
++ hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \
++ hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \
++ hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \
++ hpc->pred_angular[3] = FUNC(pred_angular_3, depth);
+
-+ and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif # ; V offset/weight
-+ and r1, r0, r1 ; mov rb10, ra3.8c
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 ; mov rb_dest, unif # Add stripe offsets ; dst_addr
-+ add rb_base2_next, r3, r0
++#define HEVC_PRED_C(depth) \
++ hpc->intra_pred_c[0] = FUNCC(intra_pred_2, depth); \
++ hpc->intra_pred_c[1] = FUNCC(intra_pred_3, depth); \
++ hpc->intra_pred_c[2] = FUNCC(intra_pred_4, depth); \
++ hpc->intra_pred_c[3] = FUNCC(intra_pred_5, depth); \
++ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \
++ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \
++ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \
++ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \
++ hpc->pred_dc_c = FUNCC(pred_dc, depth); \
++ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \
++ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \
++ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \
++ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth);
+
-+ mov ra9, rb_max_y ; mov rb11, ra3.8d
-+ shl r1, ra_wt_off_l1, rb_wt_den_p15
-+ asr rb_wt_off, r1, 9 ; mov ra_link, unif # link
++#define HEVC_PRED(depth) \
++ HEVC_PRED_Y(depth); \
++ HEVC_PRED_C(depth);
+
-+# r5 loop counter
-+# ra0 H coeffs L0
-+# ra1 H coeffs L1
-+# ra2 V coeffs L0
-+# ra3 temp
-+# ra4-7 L0 H FIFO
-+# rb4-7 L1 H FIFO
-+# rb8-rb11 V coeffs L1
-+# ra9 rb_max_y alias
++ switch (bit_depth) {
++ case 9:
++ HEVC_PRED(9);
++ break;
++ case 10:
++ HEVC_PRED(10);
++ break;
++ case 12:
++ HEVC_PRED(12);
++ break;
++ default:
++ HEVC_PRED(8);
++ break;
++ }
+
-+:1
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment
-+ shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next
-+ shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
-+ add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
-+ add ra_y, 1, ra_y ; mov r3, ra_y
++ if (ARCH_MIPS)
++ ff_hevc_rpi_pred_init_mips(hpc, bit_depth);
++}
+diff --git a/libavcodec/rpi_hevcpred.h b/libavcodec/rpi_hevcpred.h
+new file mode 100644
+index 0000000000..03c6eb3295
+--- /dev/null
++++ b/libavcodec/rpi_hevcpred.h
+@@ -0,0 +1,57 @@
++/*
++ * HEVC video Decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
+
-+ max r3, r3, ra_k0 ; mov r0, r1 << 15
-+ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++#ifndef AVCODEC_RPI_HEVCPRED_H
++#define AVCODEC_RPI_HEVCPRED_H
+
-+ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
-+ add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
++#include <stddef.h>
++#include <stdint.h>
++#include "config.h"
+
-+# L0 H-filter
-+# H FIFO scrolls are spread all over this loop
-+ mov rb4, rb5 ; mov ra4, ra5 # ? Just moves
++struct HEVCRpiContext;
++struct HEVCRpiLocalContext;
+
-+ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
-+ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
-+ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra0.8d, r1
-+.if v_bit_depth <= 8
-+ sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1
-+.else
-+ sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1
-+ asr ra3, r2, (v_bit_depth - 8)
-+.endif
++typedef struct HEVCPredContext {
++ void (*intra_pred[4])(const struct HEVCRpiContext * const s, struct HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx);
+
-+ shr r2, r4, rb_xshift2 ; mov ra5, ra6
-+ shr r1, r2, v_v_shift ; mov r3, ra_y2
-+ add ra_y2, r3, ra_k1 ; mov rb6, rb7
++ void (*pred_planar[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride);
++ void (*pred_dc)(uint8_t *src, const uint8_t *top, const uint8_t *left,
++ ptrdiff_t stride, int log2_size, int c_idx);
++ void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int c_idx, int mode);
++ void (*intra_pred_c[4])(const struct HEVCRpiContext * const s, struct HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx);
+
-+ max r3, r3, ra_k0 ; mov r0, r1 << 15
-+ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride);
++ void (*pred_dc_c)(uint8_t *src, const uint8_t *top, const uint8_t *left,
++ ptrdiff_t stride, int log2_size, int c_idx);
++ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int c_idx, int mode);
++} HEVCPredContext;
+
-+ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
-+ add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
++void ff_hevc_rpi_pred_init(HEVCPredContext *hpc, int bit_depth);
++void ff_hevc_rpi_pred_init_mips(HEVCPredContext *hpc, int bit_depth);
+
-+# L1 H-filter
++#endif /* AVCODEC_RPI_HEVCPRED_H */
+diff --git a/libavcodec/rpi_hevcpred_template.c b/libavcodec/rpi_hevcpred_template.c
+new file mode 100644
+index 0000000000..4ee776f955
+--- /dev/null
++++ b/libavcodec/rpi_hevcpred_template.c
+@@ -0,0 +1,850 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
+
-+ and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0
-+ nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0
-+ nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
-+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+ sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1
-+# V filters - start in branch delay slots of H
-+# Final asr not needed for 8-bit but we can't (currently) save a whole instruction
-+ add r2, r2, r3 ; mul24 r1, rb5, ra2.8b
-+ brr.anyn -, r:1b
-+ mov ra6, ra7 ; mul24 r3, ra7, rb10
-+ sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a
-+ asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3
-+# >>> .anyn 1b
++#include "config.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/rpi_sand_fns.h"
++#include "bit_depth_template.c"
+
-+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c # [rb7 delay]
-+ add r1, r1, r0 ; mul24 r0, rb7, ra2.8d
-+ sub r2, r1, r0 ; mul24 r0, ra4, rb8
-+ sub r1, r3, r0 ; mul24 r0, ra5, rb9
-+ add r1, r1, r0 ; mul24 r0, ra7, rb11
-+ sub r1, r1, r0 ; mul24 r2, r2, ra_k256
++#include "rpi_hevcdec.h"
++#include "rpi_hevcpred.h"
+
-+ asr r2, r2, 14 ; mul24 r1, r1, ra_k256
-+ asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0
+
-+ add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 # rb_wt_off = (offsetL0 + offsetL1 + 1) << (rb_wt_den_p15 - 9)
-+ add r1, r1, r2 ; mov r3, ra_blk_height
++#define DUMP_PRED 0
+
-+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend
++#define POS(x, y) src[(x) + stride * (y)]
+
-+ brr.anyn -, r:1b
-+ asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3
-+ min r1, r1, ra_pmax ; mov -, vw_wait
-+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+# >>> .anyn 1b
++// INCLUDED_ONCE defined at EOF
++#ifndef INCLUDED_ONCE
++typedef uint8_t (* c8_dst_ptr_t)[2];
++typedef const uint8_t (* c8_src_ptr_t)[2];
++typedef uint16_t (* c16_dst_ptr_t)[2];
++typedef const uint16_t (* c16_src_ptr_t)[2];
+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height (currently always 16)
++// *** On ARM make these NEON registers
++typedef struct pixel4_16 {
++ uint16_t x[4];
++} pixel4_16;
++typedef struct pixel4_32 {
++ uint32_t x[4];
++} pixel4_32;
++static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x)
++{
++ pixel4_16 t = {{x, x, x, x}};
++ return t;
++}
++static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x)
++{
++ pixel4_32 t = {{x, x, x, x}};
++ return t;
++}
++#endif
+
-+# If looping again then we consumed 16 height last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc rb_dma0, rb_lcount based on new segment height
++#if PRED_C
++// For chroma we double pixel size so we copy pairs
++#undef pixel
++#undef pixel2
++#undef pixel4
++#undef dctcoef
++#undef INIT_CLIP
++#undef no_rnd_avg_pixel4
++#undef rnd_avg_pixel4
++#undef AV_RN2P
++#undef AV_RN4P
++#undef AV_RN4PA
++#undef AV_WN2P
++#undef AV_WN4P
++#undef AV_WN4PA
++#undef CLIP
++#undef FUNC
++#undef FUNCC
++#undef av_clip_pixel
++#undef PIXEL_SPLAT_X4
+
-+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++#if BIT_DEPTH == 8
++#define pixel uint16_t
++#define pixel4 pixel4_16
++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16
++#define cpel uint8_t
++#define c_src_ptr_t c8_src_ptr_t
++#define c_dst_ptr_t c8_dst_ptr_t
++#else
++#define pixel uint32_t
++#define pixel4 pixel4_32
++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32
++#define cpel uint16_t
++#define c_src_ptr_t c16_src_ptr_t
++#define c_dst_ptr_t c16_dst_ptr_t
++#endif
++#define AV_RN4P(p) (*(pixel4*)(p))
++#define AV_WN4P(p,x) (*(pixel4*)(p) = (x))
++#define FUNC(a) FUNC2(a, BIT_DEPTH, _c)
++#endif
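Redefining `pixel` to a type twice its natural width is what lets the one template body serve interleaved chroma: the U,V samples of a pixel always travel together as a single load/store. In isolation the idea is just this (a sketch):

    #include <stdint.h>
    #include <string.h>

    // 8-bit chroma: treat the interleaved U,V byte pair as one uint16_t so a
    // generic "copy a pixel" in the template moves both planes at once.
    static void copy_uv_pair(uint8_t *dst, const uint8_t *src)
    {
        uint16_t uv;
        memcpy(&uv, src, sizeof(uv));
        memcpy(dst, &uv, sizeof(uv));
    }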
+
-+# DMA out
-+ bra.anyz -, ra_link
-+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
-+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
-+ shl r1, r1, i_shift23
-+# >>> .anyz ra_link
+
-+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+ add rb_lcount, rb_lcount, r0
-+ brr -, r:1b
-+ add rb_dma0, rb_dma0, r1
-+ add rb_dest, rb_dest, r2
-+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
-+# >>> 1b
-+.endm
++// Get PW prior to horrid PRED_C trickery
++#if BIT_DEPTH == 8
++#define PW 1
++#else
++#define PW 2
++#endif
+
-+::mc_filter_c_b
-+ m_filter_c_b 8
+
-+################################################################################
-+# Exit code used by both Luma & Chroma so place between them to avoid I-cache
-+# conflicts
++#if DUMP_PRED && !defined(INCLUDED_ONCE)
++static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size)
++{
++ for (unsigned int y = 0; y != size; y++, data += stride * 2) {
++ for (unsigned int x = 0; x != size; x++) {
++ printf("%4d", data[x * 2]);
++ }
++ printf("\n");
++ }
++ printf("\n");
++}
++#endif
+
-+.macro m_exit_drain
-+.if PREREAD == 2
-+# Special case 2 as loop is wasteful
-+ nop ; nop ; ldtmu0
-+ nop ; nop ; ldtmu1
-+ nop ; nop ; ldtmu0
-+ mov -, vw_wait ; nop ; ldtmu1
-+.else
-+ mov.setf r3, PREREAD - 1
-+:1
-+ brr.anynz -, r:1b
-+ nop ; nop ; ldtmu0
-+ nop ; nop ; ldtmu1
-+ sub.setf r3, r3, 1
-+ # >>>
-+ mov -, vw_wait
-+.endif
-+.endm
++static av_always_inline void FUNC(intra_pred)(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0,
++ int log2_size, int c_idx_arg)
++{
++#define PU(x) \
++ ((x) >> s->ps.sps->log2_min_pu_size)
++#define MVF(x, y) \
++ (s->ref->tab_mvf[(x) + (y) * min_pu_width])
++#define MVF_PU(x, y) \
++ MVF(PU(x0 + ((x) * (1 << hshift))), PU(y0 + ((y) * (1 << vshift))))
++#define IS_INTRA(x, y) \
++ (MVF_PU(x, y).pred_flag == PF_INTRA)
++#define MIN_TB_ADDR_ZS(x, y) \
++ s->ps.pps->min_tb_addr_zs[(y) * (s->ps.sps->tb_mask+2) + (x)]
++#define EXTEND(ptr, val, len) \
++do { \
++ pixel4 pix = PIXEL_SPLAT_X4(val); \
++ for (i = 0; i < (len); i += 4) \
++ AV_WN4P(ptr + i, pix); \
++} while (0)
++
++#define EXTEND_RIGHT_CIP(ptr, start, length) \
++ for (i = start; i < (start) + (length); i += 4) \
++ if (!IS_INTRA(i, -1)) \
++ AV_WN4P(&ptr[i], a); \
++ else \
++ a = PIXEL_SPLAT_X4(ptr[i+3])
++#define EXTEND_LEFT_CIP(ptr, start, length) \
++ for (i = start; i > (start) - (length); i--) \
++ if (!IS_INTRA(i - 1, -1)) \
++ ptr[i - 1] = ptr[i]
++#define EXTEND_UP_CIP(ptr, start, length) \
++ for (i = (start); i > (start) - (length); i -= 4) \
++ if (!IS_INTRA(-1, i - 3)) \
++ AV_WN4P(&ptr[i - 3], a); \
++ else \
++ a = PIXEL_SPLAT_X4(ptr[i - 3])
++#define EXTEND_DOWN_CIP(ptr, start, length) \
++ for (i = start; i < (start) + (length); i += 4) \
++ if (!IS_INTRA(-1, i)) \
++ AV_WN4P(&ptr[i], a); \
++ else \
++ a = PIXEL_SPLAT_X4(ptr[i + 3])
++ // c_idx will always be 1 for _c versions and 0 for y
++ const unsigned int c_idx = PRED_C;
++ int i;
++ const unsigned int hshift = ctx_hshift(s, c_idx);
++ const unsigned int vshift = ctx_vshift(s, c_idx);
++ int size = (1 << log2_size);
++ int size_in_luma_h = size << hshift;
++ int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
++ int size_in_luma_v = size << vshift;
++ int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
++ const int x = x0 >> hshift;
++ const int y = y0 >> vshift;
++ int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
++ int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
+
-+# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair)
-+# All qpus start at the beginning and after that (group - 1) must have finished
-+# before (group) can start
-+#
-+# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain
-+# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important -
-+# lockup otherwise)
-+#
-+# There is some, currently ill defined, potential lockup if we have the VDM active
-+# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDM pipe too ??
-+#
-+# The code stalled when I had many waiters on a single sem so we have a
-+# "ripple" of srels to restart. Unsure why, may have been bug, but this works
-+# and we currently have both the memory & sems to support it.
-+.macro m_sync_q, n_qpu, n_quads
-+# Do not generate code for qpu >= quads * 4 - fns should never be called
-+.if n_qpu < n_quads * 4
-+ mov ra_link, unif # Can only branch to an a reg (not r0)
-+ mov -, vw_wait # [ra_link delay]
++ int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb);
+
-+.set n_sem_sync, n_qpu - (n_qpu % 4)
-+.set n_sem_in, n_qpu
-+.set n_sem_out, n_qpu + 1
++ const ptrdiff_t stride = frame_stride1(s->frame, c_idx) / sizeof(pixel);
++ pixel *const src = c_idx == 0 ?
++ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) :
++ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y);
+
-+.if n_qpu % 4 == 0
++ int min_pu_width = s->ps.sps->min_pu_width;
+
-+.set n_sem_quad_in, 12 + n_qpu / 4
-+.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads)
++ const enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
++ lc->tu.intra_pred_mode;
++ pixel4 a;
++ pixel left_array[2 * MAX_TB_SIZE + 1];
++#if !PRED_C
++ pixel filtered_left_array[2 * MAX_TB_SIZE + 1];
++#endif
++ pixel top_array[2 * MAX_TB_SIZE + 1];
++#if !PRED_C
++ pixel filtered_top_array[2 * MAX_TB_SIZE + 1];
++#endif
+
-+ sacq -, n_sem_sync
-+ sacq -, n_sem_sync
-+ sacq -, n_sem_sync
-+ bra -, ra_link
-+ sacq -, n_sem_quad_in
-+ srel -, n_sem_out
-+ srel -, n_sem_quad_out
++ pixel *left = left_array + 1;
++ pixel *top = top_array + 1;
++#if !PRED_C
++ pixel *filtered_left = filtered_left_array + 1;
++ pixel *filtered_top = filtered_top_array + 1;
++#endif
++ int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask);
++ int cand_left = lc->na.cand_left;
++ int cand_up_left = lc->na.cand_up_left;
++ int cand_up = lc->na.cand_up;
++ int cand_up_right = lc->na.cand_up_right && cur_tb_addr > MIN_TB_ADDR_ZS((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask, y_tb - 1);
+
-+.else
-+ bra -, ra_link
-+ srel -, n_sem_sync
-+ sacq -, n_sem_in
-+.if n_sem_out % 4 != 0
-+ srel -, n_sem_out
-+.else
-+ nop
-+.endif
-+.endif
-+.endif
-+.endm
++ int bottom_left_size = (FFMIN(y0 + 2 * size_in_luma_v, s->ps.sps->height) -
++ (y0 + size_in_luma_v)) >> vshift;
++ int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) -
++ (x0 + size_in_luma_h)) >> hshift;
+
-+.set v_quads8, N_QPU_8 / 4
++ pixel * src_l = src - 1;
++ pixel * src_u = src - stride;
++ pixel * src_ur = src_u + size;
+
-+::mc_sync_q0
-+ m_sync_q 0, v_quads8
-+::mc_sync_q1
-+ m_sync_q 1, v_quads8
-+::mc_sync_q2
-+ m_sync_q 2, v_quads8
-+::mc_sync_q3
-+ m_sync_q 3, v_quads8
-+::mc_sync_q4
-+ m_sync_q 4, v_quads8
-+::mc_sync_q5
-+ m_sync_q 5, v_quads8
-+::mc_sync_q6
-+ m_sync_q 6, v_quads8
-+::mc_sync_q7
-+ m_sync_q 7, v_quads8
-+::mc_sync_q8
-+ m_sync_q 8, v_quads8
-+::mc_sync_q9
-+ m_sync_q 9, v_quads8
-+::mc_sync_q10
-+ m_sync_q 10, v_quads8
-+::mc_sync_q11
-+ m_sync_q 11, v_quads8
++ {
++ // N.B. stride is in pixels (not bytes) or in the case of chroma pixel-pairs
++ const AVFrame * const frame = s->frame;
++ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2
++ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride;
++ if ((x & mask) == 0)
++ src_l -= stripe_adj;
++ if (((x + size) & mask) == 0)
++ src_ur += stripe_adj;
++ }
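++ // (Worked example: one to the left of a stripe's column 0 is, in memory,
++ // the end of the previous row of the same stripe; subtracting
++ // stripe_adj = (stride2 - 1) * stride re-aims src_l at the right-hand
++ // edge of the previous stripe on the same row, and src_ur gets the
++ // mirror-image fix at the right edge.)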
+
-+# mc_exit()
-+# Chroma & Luma the same now
++ if (s->ps.pps->constrained_intra_pred_flag == 1) {
++ int size_in_luma_pu_v = PU(size_in_luma_v);
++ int size_in_luma_pu_h = PU(size_in_luma_h);
++ int on_pu_edge_x = !av_mod_uintp2(x0, s->ps.sps->log2_min_pu_size);
++ int on_pu_edge_y = !av_mod_uintp2(y0, s->ps.sps->log2_min_pu_size);
++ if (!size_in_luma_pu_h)
++ size_in_luma_pu_h++;
++ if (cand_bottom_left == 1 && on_pu_edge_x) {
++ int x_left_pu = PU(x0 - 1);
++ int y_bottom_pu = PU(y0 + size_in_luma_v);
++ int max = FFMIN(size_in_luma_pu_v, s->ps.sps->min_pu_height - y_bottom_pu);
++ cand_bottom_left = 0;
++ for (i = 0; i < max; i += 2)
++ cand_bottom_left |= (MVF(x_left_pu, y_bottom_pu + i).pred_flag == PF_INTRA);
++ }
++ if (cand_left == 1 && on_pu_edge_x) {
++ int x_left_pu = PU(x0 - 1);
++ int y_left_pu = PU(y0);
++ int max = FFMIN(size_in_luma_pu_v, s->ps.sps->min_pu_height - y_left_pu);
++ cand_left = 0;
++ for (i = 0; i < max; i += 2)
++ cand_left |= (MVF(x_left_pu, y_left_pu + i).pred_flag == PF_INTRA);
++ }
++ if (cand_up_left == 1) {
++ int x_left_pu = PU(x0 - 1);
++ int y_top_pu = PU(y0 - 1);
++ cand_up_left = MVF(x_left_pu, y_top_pu).pred_flag == PF_INTRA;
++ }
++ if (cand_up == 1 && on_pu_edge_y) {
++ int x_top_pu = PU(x0);
++ int y_top_pu = PU(y0 - 1);
++ int max = FFMIN(size_in_luma_pu_h, s->ps.sps->min_pu_width - x_top_pu);
++ cand_up = 0;
++ for (i = 0; i < max; i += 2)
++ cand_up |= (MVF(x_top_pu + i, y_top_pu).pred_flag == PF_INTRA);
++ }
++ if (cand_up_right == 1 && on_pu_edge_y) {
++ int y_top_pu = PU(y0 - 1);
++ int x_right_pu = PU(x0 + size_in_luma_h);
++ int max = FFMIN(size_in_luma_pu_h, s->ps.sps->min_pu_width - x_right_pu);
++ cand_up_right = 0;
++ for (i = 0; i < max; i += 2)
++ cand_up_right |= (MVF(x_right_pu + i, y_top_pu).pred_flag == PF_INTRA);
++ }
++ memset(left, 128, 2 * MAX_TB_SIZE*sizeof(pixel));
++ memset(top , 128, 2 * MAX_TB_SIZE*sizeof(pixel));
++ top[-1] = 128;
++ }
++ if (cand_up_left) {
++ left[-1] = src_l[-stride];
++ top[-1] = left[-1];
++ }
++ if (cand_up)
++ // Always good - even with sand
++ memcpy(top, src_u, size * sizeof(pixel));
++ if (cand_up_right) {
++ memcpy(top + size, src_ur, top_right_size * sizeof(pixel));
++ EXTEND(top + size + top_right_size, top[size + top_right_size - 1],
++ size - top_right_size);
++ }
++ if (cand_left)
++ for (i = 0; i < size; i++)
++ left[i] = src_l[stride * i];
++ if (cand_bottom_left) {
++ for (i = size; i < size + bottom_left_size; i++)
++ left[i] = src_l[stride * i];
++ EXTEND(left + size + bottom_left_size, left[size + bottom_left_size - 1],
++ size - bottom_left_size);
++ }
+
-+.macro m_exit_qn
-+ m_exit_drain
-+ nop ; nop ; thrend
-+ nop
-+ nop
-+# >>> thrend <<<
-+.endm
++ if (s->ps.pps->constrained_intra_pred_flag == 1) {
++ if (cand_bottom_left || cand_left || cand_up_left || cand_up || cand_up_right) {
++ int size_max_x = x0 + ((2 * size) << hshift) < s->ps.sps->width ?
++ 2 * size : (s->ps.sps->width - x0) >> hshift;
++ int size_max_y = y0 + ((2 * size) << vshift) < s->ps.sps->height ?
++ 2 * size : (s->ps.sps->height - y0) >> vshift;
++ int j = size + (cand_bottom_left ? bottom_left_size : 0) - 1;
++ if (!cand_up_right) {
++ size_max_x = x0 + ((size) << hshift) < s->ps.sps->width ?
++ size : (s->ps.sps->width - x0) >> hshift;
++ }
++ if (!cand_bottom_left) {
++ size_max_y = y0 + (( size) << vshift) < s->ps.sps->height ?
++ size : (s->ps.sps->height - y0) >> vshift;
++ }
++ if (cand_bottom_left || cand_left || cand_up_left) {
++ while (j > -1 && !IS_INTRA(-1, j))
++ j--;
++ if (!IS_INTRA(-1, j)) {
++ j = 0;
++ while (j < size_max_x && !IS_INTRA(j, -1))
++ j++;
++ EXTEND_LEFT_CIP(top, j, j + 1);
++ left[-1] = top[-1];
++ }
++ } else {
++ j = 0;
++ while (j < size_max_x && !IS_INTRA(j, -1))
++ j++;
++ if (j > 0)
++ if (x0 > 0) {
++ EXTEND_LEFT_CIP(top, j, j + 1);
++ } else {
++ EXTEND_LEFT_CIP(top, j, j);
++ top[-1] = top[0];
++ }
++ left[-1] = top[-1];
++ }
++ left[-1] = top[-1];
++ if (cand_bottom_left || cand_left) {
++ a = PIXEL_SPLAT_X4(left[-1]);
++ EXTEND_DOWN_CIP(left, 0, size_max_y);
++ }
++ if (!cand_left)
++ EXTEND(left, left[-1], size);
++ if (!cand_bottom_left)
++ EXTEND(left + size, left[size - 1], size);
++ if (x0 != 0 && y0 != 0) {
++ a = PIXEL_SPLAT_X4(left[size_max_y - 1]);
++ EXTEND_UP_CIP(left, size_max_y - 1, size_max_y);
++ if (!IS_INTRA(-1, - 1))
++ left[-1] = left[0];
++ } else if (x0 == 0) {
++ EXTEND(left, 0, size_max_y);
++ } else {
++ a = PIXEL_SPLAT_X4(left[size_max_y - 1]);
++ EXTEND_UP_CIP(left, size_max_y - 1, size_max_y);
++ }
++ top[-1] = left[-1];
++ if (y0 != 0) {
++ a = PIXEL_SPLAT_X4(left[-1]);
++ EXTEND_RIGHT_CIP(top, 0, size_max_x);
++ }
++ }
++ }
++ // Infer the unavailable samples
++ if (!cand_bottom_left) {
++ if (cand_left) {
++ EXTEND(left + size, left[size - 1], size);
++ } else if (cand_up_left) {
++ EXTEND(left, left[-1], 2 * size);
++ cand_left = 1;
++ } else if (cand_up) {
++ left[-1] = top[0];
++ EXTEND(left, left[-1], 2 * size);
++ cand_up_left = 1;
++ cand_left = 1;
++ } else if (cand_up_right) {
++ EXTEND(top, top[size], size);
++ left[-1] = top[size];
++ EXTEND(left, left[-1], 2 * size);
++ cand_up = 1;
++ cand_up_left = 1;
++ cand_left = 1;
++ } else { // No samples available
++#if PRED_C
++ left[-1] = (1 << (BIT_DEPTH - 1)) | (1 << (BIT_DEPTH - 1 + PW * 8));
++#else
++ left[-1] = (1 << (BIT_DEPTH - 1));
++#endif
++ EXTEND(top, left[-1], 2 * size);
++ EXTEND(left, left[-1], 2 * size);
++ }
++ }
+
-+::mc_exit_c_qn
-+::mc_exit_y_qn
-+ m_exit_qn
++ if (!cand_left)
++ EXTEND(left, left[size], size);
++ if (!cand_up_left) {
++ left[-1] = left[0];
++ }
++ if (!cand_up)
++ EXTEND(top, left[-1], size);
++ if (!cand_up_right)
++ EXTEND(top + size, top[size - 1], size);
+
++ top[-1] = left[-1];
+
++ // Filtering process
++ // Sand can only apply to chroma_format_idc == 1 so we don't need to
++ // worry about chroma smoothing for that case
++#if !PRED_C
++ if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0 || ctx_cfmt(s) == 3)) {
++ if (mode != INTRA_DC && size != 4){
++ int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
++ int min_dist_vert_hor = FFMIN(FFABS((int)(mode - 26U)),
++ FFABS((int)(mode - 10U)));
++ if (min_dist_vert_hor > intra_hor_ver_dist_thresh[log2_size - 3]) {
++ int threshold = 1 << (BIT_DEPTH - 5);
++ if (s->ps.sps->sps_strong_intra_smoothing_enable_flag && c_idx == 0 &&
++ log2_size == 5 &&
++ FFABS(top[-1] + top[63] - 2 * top[31]) < threshold &&
++ FFABS(left[-1] + left[63] - 2 * left[31]) < threshold) {
++ // We can't just overwrite values in top because it could be
++ // a pointer into src
++ filtered_top[-1] = top[-1];
++ filtered_top[63] = top[63];
++ for (i = 0; i < 63; i++)
++ filtered_top[i] = ((64 - (i + 1)) * top[-1] +
++ (i + 1) * top[63] + 32) >> 6;
++ for (i = 0; i < 63; i++)
++ left[i] = ((64 - (i + 1)) * left[-1] +
++ (i + 1) * left[63] + 32) >> 6;
++ top = filtered_top;
++ } else {
++ filtered_left[2 * size - 1] = left[2 * size - 1];
++ filtered_top[2 * size - 1] = top[2 * size - 1];
++ for (i = 2 * size - 2; i >= 0; i--)
++ filtered_left[i] = (left[i + 1] + 2 * left[i] +
++ left[i - 1] + 2) >> 2;
++ filtered_top[-1] =
++ filtered_left[-1] = (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
++ for (i = 2 * size - 2; i >= 0; i--)
++ filtered_top[i] = (top[i + 1] + 2 * top[i] +
++ top[i - 1] + 2) >> 2;
++ left = filtered_left;
++ top = filtered_top;
++ }
++ }
++ }
++ }
+
-+# mc_interrupt_exit12()
++ switch (mode) {
++ case INTRA_PLANAR:
++ s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
++ (uint8_t *)left, stride);
++ break;
++ case INTRA_DC:
++ s->hpc.pred_dc((uint8_t *)src, (uint8_t *)top,
++ (uint8_t *)left, stride, log2_size, c_idx);
++ break;
++ default:
++ s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
++ (uint8_t *)left, stride, c_idx,
++ mode);
++ break;
++ }
++#else
++ switch (mode) {
++ case INTRA_PLANAR:
++ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
++ (uint8_t *)left, stride);
++ break;
++ case INTRA_DC:
++ s->hpc.pred_dc_c((uint8_t *)src, (uint8_t *)top,
++ (uint8_t *)left, stride, log2_size, c_idx);
++ break;
++ default:
++ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
++ (uint8_t *)left, stride, c_idx,
++ mode);
++ break;
++ }
+
-+.macro m_exit_q0
-+ m_exit_drain
-+ sacq -, 12
-+ nop ; nop ; thrend
-+ mov interrupt, 1
-+ nop
-+# >>> thrend <<<
-+.endm
++#if DUMP_PRED
++ printf("U pred @ %d, %d: mode=%d\n", x, y, mode);
++ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size);
++ printf("V pred @ %d, %d: mode=%d\n", x, y, mode);
++ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size);
++#endif
++#endif
++}
+
-+::mc_exit_c_q0
-+::mc_exit_y_q0
-+ m_exit_q0
++#define INTRA_PRED(size) \
++static void FUNC(intra_pred_ ## size)(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx) \
++{ \
++ FUNC(intra_pred)(s, lc, x0, y0, size, c_idx); \
++}
+
-+# LUMA CODE
++INTRA_PRED(2)
++INTRA_PRED(3)
++INTRA_PRED(4)
++INTRA_PRED(5)
+
-+# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
-+# For P frames we make the second x,y coordinates offset by +8
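-+# (i.e. for P the two TMUs fetch two halves of the same reference block -
-+# (x, y) and (x+8, y) - so P and B take the same code path at full rate)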
++#undef INTRA_PRED
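++// (FUNC() pastes in the template bit depth - and, for the PRED_C build, a
++// chroma suffix - so INTRA_PRED(5) above yields e.g. intra_pred_5_8(), all
++// sizes sharing the single av_always_inline intra_pred() body.)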
+
++#if !PRED_C
++static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top,
++ const uint8_t *_left, ptrdiff_t stride,
++ int trafo_size)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ const pixel *top = (const pixel *)_top;
++ const pixel *left = (const pixel *)_left;
++ int size = 1 << trafo_size;
++ for (y = 0; y < size; y++)
++ for (x = 0; x < size; x++)
++ POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] +
++ (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1);
++}
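++// (Planar is a bilinear blend of the four edge arrays: for an 8x8 block,
++// trafo_size == 3, the corner POS(0, 0) works out as
++// (7*left[0] + top[8] + 7*top[0] + left[8] + 8) >> 4.)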
++#else
++static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top,
++ const uint8_t * _left, ptrdiff_t stride,
++ int trafo_size)
++{
++ int x, y;
++ int size = 1 << trafo_size;
++ c_dst_ptr_t src = (c_dst_ptr_t)_src;
++ const c_src_ptr_t top = (c_src_ptr_t)_top;
++ const c_src_ptr_t left = (c_src_ptr_t)_left;
+
-+################################################################################
-+# mc_setup
-+#
-+# typedef struct qpu_mc_pred_y_s_s {
-+# qpu_mc_src_t next_src1;
-+# qpu_mc_src_t next_src2;
-+# uint16_t pic_h;
-+# uint16_t pic_w;
-+# uint32_t stride2;
-+# uint32_t stride1;
-+# uint32_t wdenom;
-+# uint32_t next_fn;
-+# } qpu_mc_pred_y_s_t;
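-+#
-+# Each 'unif' read in the setup below consumes the next word of this
-+# struct in order: next_src1 x_y/base, next_src2 x_y/base, width_height,
-+# stride2, stride1, wdenom and finally next_fn.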
++ for (y = 0; y < size; y++, src += stride)
++ {
++ for (x = 0; x < size; x++)
++ {
++ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] +
++ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1);
++ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] +
++ (size - 1 - y) * top[x][1] + (y + 1) * left[size][1] + size) >> (trafo_size + 1);
++ }
++ }
++}
++#endif
+
-+.macro m_setup_y, v_bit_depth
++#define PRED_PLANAR(size)\
++static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \
++ const uint8_t *left, ptrdiff_t stride) \
++{ \
++ FUNC(pred_planar)(src, top, left, stride, size + 2); \
++}
+
-+# Cannot use mul24 on x as x might be -ve, so must use shift
-+.if v_bit_depth <= 8
-+.set v_x_shift, 0
-+.set v_pmask, 0xff
-+.set v_blk_height, Y_BLK_HEIGHT_8
-+.else
-+.set v_x_shift, 1
-+.set v_pmask, 0xffff
-+.set v_blk_height, Y_BLK_HEIGHT_16
-+.endif
++PRED_PLANAR(0)
++PRED_PLANAR(1)
++PRED_PLANAR(2)
++PRED_PLANAR(3)
+
++#undef PRED_PLANAR
+
-+ # Need to save these because we need to know the frame dimensions before computing texture coordinates
-+ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y
-+ mov ra9, unif # ref_y_base
-+ mov ra1, unif # x2_y2
-+ mov ra11, unif # ref_y2_base
++#if !PRED_C
++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
++ const uint8_t *_left,
++ ptrdiff_t stride, int log2_size, int c_idx)
++{
++ int i, j, x, y;
++ int size = (1 << log2_size);
++ pixel *src = (pixel *)_src;
++ const pixel *top = (const pixel *)_top;
++ const pixel *left = (const pixel *)_left;
++ int dc = size;
++ pixel4 a;
++ for (i = 0; i < size; i++)
++ dc += left[i] + top[i];
+
-+# load constants
-+ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
-+ shl rb_ef, r0, i_shift30
++ dc >>= log2_size + 1;
+
++ a = PIXEL_SPLAT_X4(dc);
+
-+ mov ra_kff100100, 0xff100100
-+ mov rb_pmask, v_pmask
-+ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++ for (i = 0; i < size; i++)
++ for (j = 0; j < size; j+=4)
++ AV_WN4P(&POS(j, i), a);
++
++ if (c_idx == 0 && size < 32) {
++ POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2;
++ for (x = 1; x < size; x++)
++ POS(x, 0) = (top[x] + 3 * dc + 2) >> 2;
++ for (y = 1; y < size; y++)
++ POS(0, y) = (left[y] + 3 * dc + 2) >> 2;
++ }
++}
++#else
++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
++ const uint8_t *_left,
++ ptrdiff_t stride, int log2_size, int c_idx)
++{
++ unsigned int i, j;
++ const unsigned int size = (1 << log2_size);
++ c_dst_ptr_t src = (c_dst_ptr_t)_src;
++ const c_src_ptr_t top = (c_src_ptr_t)_top;
++ const c_src_ptr_t left = (c_src_ptr_t)_left;
++ unsigned int dc0 = size;
++ unsigned int dc1 = size;
+
-+# Compute part of VPM to use
++ for (i = 0; i < size; i++)
++ {
++ dc0 += left[i][0] + top[i][0];
++ dc1 += left[i][1] + top[i][1];
++ }
+
-+# Read image dimensions
-+ mov ra3, unif # width_height
-+ mov rb_xpitch, unif # stride2
-+.if v_x_shift == 0
-+ sub rb_max_x, ra3.16b, 1
-+.else
-+ sub r0, ra3.16b, 1
-+ shl rb_max_x, r0, v_x_shift
-+.endif
-+ sub rb_max_y, ra3.16a, 1
-+ mov rb_pitch, unif # stride1
++ dc0 >>= log2_size + 1;
++ dc1 >>= log2_size + 1;
+
-+# get destination pitch
-+ mov r1, vdw_setup_1(0)
-+ or rb_dma1_base, r1, rb_pitch
++ for (i = 0; i < size; i++, src += stride)
++ {
++ for (j = 0; j < size; ++j)
++ {
++ src[j][0] = dc0;
++ src[j][1] = dc1;
+
-+# Compute base address for first and second access
-+ mov r3, elem_num
-+ add r0, ra0.16b, r3 # Load x + elem_num
-+.if v_x_shift != 0
-+ shl r0, r0, v_x_shift
-+.endif
-+ max r0, r0, 0
-+ min r0, r0, rb_max_x
-+ shl ra_xshift_next, r0, 3 # Compute shifts
++ }
++ }
++}
++#endif
+
-+# X is byte offset - we can only load words - mask
++#ifndef ANGLE_CONSTS
++#define ANGLE_CONSTS
++static const int intra_pred_angle[] = {
++ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32,
++ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
++};
++static const int inv_angle[] = {
++ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482,
++ -630, -910, -1638, -4096
++};
++#endif
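++// (For the negative angles, inv_angle[mode - 11] == -round(8192 / angle):
++// e.g. -4096 for angle -2 and -256 for angle -32. It projects side
++// reference samples onto the main reference array when angle < 0.)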
+
-+ and r0, r0, -4 ; v8subs r2, r2, r2
-+ sub r2, r2, rb_pitch
-+ and r1, r0, r2
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 # Add stripe offsets
-+ add ra_base, ra9, r0
++#if !PRED_C
++static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
++ const uint8_t *_top,
++ const uint8_t *_left,
++ ptrdiff_t stride, int c_idx,
++ int mode, int size)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ const pixel *top = (const pixel *)_top;
++ const pixel *left = (const pixel *)_left;
+
-+ # r3 still contains elem_num
-+ add r0, ra1.16b, r3 # Load x
-+.if v_x_shift != 0
-+ shl r0, r0, v_x_shift
-+.endif
-+ max r0, r0, 0
-+ min r0, r0, rb_max_x
-+ shl rb_xshift2_next, r0, 3 # Compute shifts
++ int angle = intra_pred_angle[mode - 2];
++ pixel ref_array[3 * MAX_TB_SIZE + 4];
++ pixel *ref_tmp = ref_array + size;
++ const pixel *ref;
++ int last = (size * angle) >> 5;
+
-+ # r2 still contains mask
-+ and r0, r0, -4
-+ and r1, r0, r2
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 # Add stripe offsets
-+ add ra_base2, ra11, r0
++ if (mode >= 18) {
++ ref = top - 1;
++ if (angle < 0 && last < -1) {
++ for (x = 0; x <= size; x += 4)
++ AV_WN4P(&ref_tmp[x], AV_RN4P(&top[x - 1]));
++ for (x = last; x <= -1; x++)
++ ref_tmp[x] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
++ ref = ref_tmp;
++ }
+
-+# Do preloads
-+ nop ; mov r0, ra0.16a # ; r0 = y
-+ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2
++ for (y = 0; y < size; y++) {
++ int idx = ((y + 1) * angle) >> 5;
++ int fact = ((y + 1) * angle) & 31;
++ if (fact) {
++ for (x = 0; x < size; x += 4) {
++ POS(x , y) = ((32 - fact) * ref[x + idx + 1] +
++ fact * ref[x + idx + 2] + 16) >> 5;
++ POS(x + 1, y) = ((32 - fact) * ref[x + 1 + idx + 1] +
++ fact * ref[x + 1 + idx + 2] + 16) >> 5;
++ POS(x + 2, y) = ((32 - fact) * ref[x + 2 + idx + 1] +
++ fact * ref[x + 2 + idx + 2] + 16) >> 5;
++ POS(x + 3, y) = ((32 - fact) * ref[x + 3 + idx + 1] +
++ fact * ref[x + 3 + idx + 2] + 16) >> 5;
++ }
++ } else {
++ for (x = 0; x < size; x += 4)
++ AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1]));
++ }
++ }
++ if (mode == 26 && c_idx == 0 && size < 32) {
++ for (y = 0; y < size; y++)
++ POS(0, y) = av_clip_pixel(top[0] + ((left[y] - left[-1]) >> 1));
++ }
++ } else {
++ ref = left - 1;
++ if (angle < 0 && last < -1) {
++ for (x = 0; x <= size; x += 4)
++ AV_WN4P(&ref_tmp[x], AV_RN4P(&left[x - 1]));
++ for (x = last; x <= -1; x++)
++ ref_tmp[x] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
++ ref = ref_tmp;
++ }
+
-+:1
-+ sub.setf r3, r3, 1
-+ max r1, r0, 0
-+ min r1, r1, rb_max_y
-+ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
-+ add t0s, ra_base, r1 ; mov ra_y, r0
++ for (x = 0; x < size; x++) {
++ int idx = ((x + 1) * angle) >> 5;
++ int fact = ((x + 1) * angle) & 31;
++ if (fact) {
++ for (y = 0; y < size; y++) {
++ POS(x, y) = ((32 - fact) * ref[y + idx + 1] +
++ fact * ref[y + idx + 2] + 16) >> 5;
++ }
++ } else {
++ for (y = 0; y < size; y++)
++ POS(x, y) = ref[y + idx + 1];
++ }
++ }
++ if (mode == 10 && c_idx == 0 && size < 32) {
++ for (x = 0; x < size; x += 4) {
++ POS(x, 0) = av_clip_pixel(left[0] + ((top[x ] - top[-1]) >> 1));
++ POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - top[-1]) >> 1));
++ POS(x + 2, 0) = av_clip_pixel(left[0] + ((top[x + 2] - top[-1]) >> 1));
++ POS(x + 3, 0) = av_clip_pixel(left[0] + ((top[x + 3] - top[-1]) >> 1));
++ }
++ }
++ }
++}
++#else
++static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
++ const uint8_t *_top,
++ const uint8_t *_left,
++ ptrdiff_t stride, int c_idx,
++ int mode, int size)
++{
++ int x, y;
++ c_dst_ptr_t src = (c_dst_ptr_t)_src;
++ c_src_ptr_t top = (c_src_ptr_t)_top;
++ c_src_ptr_t left = (c_src_ptr_t)_left;
+
-+ max r1, r2, 0
-+ brr.anynz -, r:1b
-+ min r1, r1, rb_max_y
-+ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
-+ add t1s, ra_base2, r1 ; mov ra_y2, r2
-+# >>> .anynz 1b
++ const int angle = intra_pred_angle[mode - 2];
++ cpel ref_array[3 * MAX_TB_SIZE + 4][2];
++ c_dst_ptr_t ref_tmp = ref_array + size;
++ c_src_ptr_t ref;
++ const int last = (size * angle) >> 5;
+
-+ add rb_wt_den_p15, unif, 23 - v_bit_depth # weight denom
++ if (mode >= 18) {
++ ref = top - 1;
++ if (angle < 0 && last < -1) {
++ memcpy(ref_tmp, top - 1, (size + 1) * 2 * PW);
++ for (x = last; x <= -1; x++)
++ {
++ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
++ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
++ }
++ ref = (c_src_ptr_t)ref_tmp;
++ }
+
-+ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
++ for (y = 0; y < size; y++, src += stride) {
++ const int idx = ((y + 1) * angle) >> 5;
++ const int fact = ((y + 1) * angle) & 31;
++ if (fact) {
++ for (x = 0; x < size; ++x) {
++ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] +
++ fact * ref[x + idx + 2][0] + 16) >> 5;
++ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] +
++ fact * ref[x + idx + 2][1] + 16) >> 5;
++ }
++ } else {
++ memcpy(src, ref + idx + 1, size * 2 * PW);
++ }
++ }
++ } else {
++ ref = left - 1;
++ if (angle < 0 && last < -1) {
++ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW);
++ for (x = last; x <= -1; x++)
++ {
++ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
++ ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
++ }
++ ref = (c_src_ptr_t)ref_tmp;
++ }
+
-+ mov ra_link, unif # Next fn
++ for (x = 0; x < size; x++, src++) {
++ const int idx = ((x + 1) * angle) >> 5;
++ const int fact = ((x + 1) * angle) & 31;
++ if (fact) {
++ for (y = 0; y < size; y++) {
++ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] +
++ fact * ref[y + idx + 2][0] + 16) >> 5;
++ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] +
++ fact * ref[y + idx + 2][1] + 16) >> 5;
++ }
++ } else {
++ for (y = 0; y < size; y++)
++ {
++ src[y * stride][0] = ref[y + idx + 1][0];
++ src[y * stride][1] = ref[y + idx + 1][1];
++ }
++ }
++ }
++ }
++}
++#endif
++
++static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top,
++ const uint8_t *left,
++ ptrdiff_t stride, int c_idx, int mode)
++{
++ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 2);
++}
++
++static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top,
++ const uint8_t *left,
++ ptrdiff_t stride, int c_idx, int mode)
++{
++ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 3);
++}
++
++static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top,
++ const uint8_t *left,
++ ptrdiff_t stride, int c_idx, int mode)
++{
++ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 4);
++}
+
-+# touch vertical context to keep simulator happy
-+ mov ra8, 0 ; mov rb8, 0
-+ bra -, ra_link
-+ mov ra9, 0 ; mov rb9, 0
-+ mov ra10, 0 ; mov rb10, 0
-+ mov ra11, 0 ; mov rb11, 0
-+# >>> ra_link
-+.endm
++static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top,
++ const uint8_t *left,
++ ptrdiff_t stride, int c_idx, int mode)
++{
++ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 5);
++}
+
-+::mc_setup_y_q0
-+ m_setup_q0
-+::mc_setup_y_qn
-+ m_setup_y 8
++#undef cpel
++#undef c_src_ptr_t
++#undef c_dst_ptr_t
+
-+################################################################################
-+#
-+# Start of per-block setup code
-+# P and B blocks share the same setup code to save on Icache space
++#undef EXTEND_LEFT_CIP
++#undef EXTEND_RIGHT_CIP
++#undef EXTEND_UP_CIP
++#undef EXTEND_DOWN_CIP
++#undef IS_INTRA
++#undef MVF_PU
++#undef MVF
++#undef PU
++#undef EXTEND
++#undef MIN_TB_ADDR_ZS
++#undef POS
++#undef PW
+
-+# luma_setup_delay3 done in delay slots of branch that got us here
++#ifndef INCLUDED_ONCE
++#define INCLUDED_ONCE
++#endif
+
-+# get base addresses and per-channel shifts for *next* invocation
-+# per-channel shifts were calculated on the *previous* invocation
+diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
+new file mode 100644
+index 0000000000..c16d9931bd
+--- /dev/null
++++ b/libavcodec/rpi_mailbox.c
+@@ -0,0 +1,145 @@
++/*
++Copyright (c) 2012, Broadcom Europe Ltd.
++All rights reserved.
+
-+# 1st 3 instructions of per_block-setup in branch delay
-+#
-+# typedef struct qpu_mc_pred_y_p_s {
-+# qpu_mc_src_t next_src1;
-+# qpu_mc_src_t next_src2;
-+# uint16_t h;
-+# uint16_t w;
-+# uint32_t mymx21;
-+# uint32_t wo1;
-+# uint32_t wo2;
-+# uint32_t dst_addr;
-+# uint32_t next_fn;
-+# } qpu_mc_pred_y_p_t;
-+#
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
+
-+.macro m_luma_setup, v_bit_depth
-+# Hack - QASM may well have label pasting but I have no idea how...
-+.if v_bit_depth == 8
-+ brr ra_link, r:per_block_setup_8
-+.elif v_bit_depth == 10
-+ brr ra_link, r:per_block_setup_10
-+.endif
-+ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack??
-+ add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0
-+ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
-+.endm
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
+
-+.macro m_per_block_setup, v_bit_depth
++#include <stdio.h>
++#include <string.h>
++#include <stdlib.h>
++#include <fcntl.h>
++#include <unistd.h>
++#include <assert.h>
++#include <stdint.h>
++#include <sys/mman.h>
+
-+.if v_bit_depth <= 8
-+.set v_x_shift, 0
-+.set v_x_mul, 1
-+# Shifts to get width & height in the right place in rb_dma0
-+.set v_dma_h_shift, 7
-+.set v_dma_wh_shift, i_shift16
-+.else
-+.set v_x_shift, 1
-+.set v_x_mul, 2
-+# Shifts to get width & height in the right place in rb_dma0
-+.set v_dma_h_shift, 8
-+.set v_dma_wh_shift, 15
-+.endif
++#include <sys/ioctl.h>
+
-+.if v_x_shift != 0
-+ shl r0, r0, v_x_shift
-+.endif
-+ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
-+ min r0, r0, rb_max_x
++#define MAJOR_NUM 100
++#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
++#define DEVICE_FILE_NAME "/dev/vcio"
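++// (/dev/vcio is the firmware mailbox property channel; 100 is its
++// historical char-device major, matching the mknod hint in mbox_open().)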
+
-+ shl ra_xshift_next, r0, 3 # Compute shifts
-+ and r0, r0, -4
-+ sub r2, r5, rb_pitch ; mov ra_base_next, unif # src1.base
-+ and r1, r0, r2 ; mov ra_y_next, ra0.16a
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y
-+ add ra_base_next, ra_base_next, r0 # [ra1 delay]
++#include "rpi_mailbox.h"
++//#include
+
-+ add r0, ra1.16b, r3 # Load x2
-+.if v_x_shift != 0
-+ shl r0, r0, v_x_shift
-+.endif
-+ max r0, r0, r5 ; mov ra_y2_next, ra1.16a
-+ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base
-+ shl rb_xshift2_next, r0, 3 # Compute shifts
-+ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height
-+ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes
-+ add rb_base2_next, rb_base2_next, r0
++/*
++ * use ioctl to send mbox property message
++ */
+
-+# get width,height of block (unif load above), r1 = width * pel_size
-+ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width)
-+ add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height
-+ add rb_lcount, r0, 7
-+ shl r0, r0, v_dma_h_shift
-+ add r0, r0, r1 # Combine width and height of destination area
-+ shl r0, r0, v_dma_wh_shift # Shift into bits 16 upwards of the vdw_setup0 register
-+ add rb_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets
++static int mbox_property(int file_desc, void *buf)
++{
++ int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
+
-+# get filter coefficients and discard unused B frame values
-+ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight
-+ shl ra8, r0, 3 ; mov r3, ra_k255
++ if (ret_val < 0) {
++ printf("ioctl_set_msg failed:%d\n", ret_val);
++ }
+
-+# Pack the 1st 4 filter coefs for H & V tightly
-+# Coeffs are all abs values here as that means mul24 works (no sign extend from .8)
++#ifdef DEBUG
++ unsigned *p = buf; int i; unsigned size = *(unsigned *)buf;
++ for (i=0; i<size/4; i++)
++ printf("%04x: 0x%08x\n", i*sizeof *p, p[i]);
++#endif
++ return ret_val;
++}
+
-+# >>> branch ra_link
+
++unsigned mbox_mem_lock(int file_desc, unsigned handle)
++{
++ int i=0;
++ unsigned p[32];
++ p[i++] = 0; // size (filled in below)
++ p[i++] = 0x00000000; // process request
++ p[i++] = 0x3000d; // tag id: lock memory
++ p[i++] = 4; // size of the buffer
++ p[i++] = 4; // size of the data
++ p[i++] = handle;
++ p[i++] = 0x00000000; // end tag
++ p[0] = i*sizeof *p; // actual size
++
++ mbox_property(file_desc, p);
++ return p[5];
++}
++
++unsigned mbox_mem_unlock(int file_desc, unsigned handle)
++{
++ int i=0;
++ unsigned p[32];
++ p[i++] = 0; // size (filled in below)
++ p[i++] = 0x00000000; // process request
++ p[i++] = 0x3000e; // tag id: unlock memory
++ p[i++] = 4; // size of the buffer
++ p[i++] = 4; // size of the data
++ p[i++] = handle;
++ p[i++] = 0x00000000; // end tag
++ p[0] = i*sizeof *p; // actual size
++
++ mbox_property(file_desc, p);
++ return p[5];
++}
++
++#define GET_VCIMAGE_PARAMS 0x30044 // tag id (value assumed)
++
++int mbox_get_image_params(int fd, VC_IMAGE_T * img)
++{
++ uint32_t buf[sizeof(*img) / sizeof(uint32_t) + 32];
++ uint32_t * p = buf;
++ void * rimg;
++ int rv;
++
++ *p++ = 0; // size (filled in below)
++ *p++ = 0; // process request
++ *p++ = GET_VCIMAGE_PARAMS;
++ *p++ = sizeof(*img); // buffer size
++ *p++ = sizeof(*img); // data size
++ rimg = p;
++ memcpy(p, img, sizeof(*img));
++ p += sizeof(*img) / sizeof(*p);
++ *p++ = 0; // end tag
++ buf[0] = (p - buf) * sizeof(*p);
++
++ rv = mbox_property(fd, buf);
++ memcpy(img, rimg, sizeof(*img));
+
-+# r5 = 0
-+# ra_wt_mul_l1 = weight L1
-+# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred)
-+# rb_wt_off = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb_wt_den_p15 - 1)
-+# rb_wt_den_p15 = weight denom + 6 + 9
-+# rb_wt_mul_l0 = weight L0
-+.endm
++ return rv;
++}
+
-+:per_block_setup_8
-+ m_per_block_setup 8
++int mbox_open() {
++ int file_desc;
+
++ // open a char device file used for communicating with kernel mbox driver
++ file_desc = open(DEVICE_FILE_NAME, 0);
++ if (file_desc < 0) {
++ printf("Can't open device file: %s\n", DEVICE_FILE_NAME);
++ printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM);
++ }
++ return file_desc;
++}
+
++void mbox_close(int file_desc) {
++ close(file_desc);
++}
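++
++// Typical call sequence (illustrative sketch - error handling elided;
++// vc_handle would come from vcsm_vc_hdl_from_hdl(), as in rpi_qpu.c):
++//
++// int mb = mbox_open();
++// unsigned bus_addr = mbox_mem_lock(mb, vc_handle); // pin + bus address
++// /* ... hand bus_addr to the VPU/QPU ... */
++// mbox_mem_unlock(mb, vc_handle);
++// mbox_close(mb);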
+
-+################################################################################
-+# mc_filter(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
-+# In a P block, y2_x2 should be y_x+8
-+# At this point we have already issued two pairs of texture requests for the current block
+diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
+new file mode 100644
+index 0000000000..b3168788d2
+--- /dev/null
++++ b/libavcodec/rpi_mailbox.h
+@@ -0,0 +1,58 @@
++#ifndef RPI_MAILBOX_H
++#define RPI_MAILBOX_H
+
-+.macro m_filter_y_pxx, v_bit_depth
-+ m_luma_setup v_bit_depth
++/* The image structure. */
++typedef struct vc_image_extra_uv_s {
++ void *u, *v;
++ int vpitch;
++} VC_IMAGE_EXTRA_UV_T;
+
-+ shl ra_wt_mul_l0, ra_wt_mul_l0, 1
++typedef union {
++ VC_IMAGE_EXTRA_UV_T uv;
++// VC_IMAGE_EXTRA_RGBA_T rgba;
++// VC_IMAGE_EXTRA_PAL_T pal;
++// VC_IMAGE_EXTRA_TF_T tf;
++// VC_IMAGE_EXTRA_BAYER_T bayer;
++// VC_IMAGE_EXTRA_MSBAYER_T msbayer;
++// VC_IMAGE_EXTRA_CODEC_T codec;
++// VC_IMAGE_EXTRA_OPENGL_T opengl;
++} VC_IMAGE_EXTRA_T;
+
-+# r5 = 0 (loop count)
+
-+:1
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
++typedef struct VC_IMAGE_T {
++ unsigned short type; /* should restrict to 16 bits */
++ unsigned short info; /* format-specific info; zero for VC02 behaviour */
++ unsigned short width; /* width in pixels */
++ unsigned short height; /* height in pixels */
++ int pitch; /* pitch of image_data array in bytes */
++ int size; /* number of bytes available in image_data array */
++ void *image_data; /* pixel data */
++ VC_IMAGE_EXTRA_T extra; /* extra data like palette pointer */
++ void *metadata; /* metadata header for the image */
++ void *pool_object; /* nonNULL if image was allocated from a vc_pool */
++ int mem_handle; /* the mem handle for relocatable memory storage */
++ int metadata_size; /* size of metadata of each channel in bytes */
++ int channel_offset; /* offset of consecutive channels in bytes */
++ uint32_t video_timestamp;/* 90000 Hz RTP times domain - derived from audio timestamp */
++ uint8_t num_channels; /* number of channels (2 for stereo) */
++ uint8_t current_channel;/* the channel this header is currently pointing to */
++ uint8_t linked_multichann_flag;/* Indicates the header has the linked-multichannel structure*/
++ uint8_t is_channel_linked; /* Tracks whether the above structure is being used to link the header
++ into a linked-multichannel image */
++ uint8_t channel_index; /* index of the channel this header represents while
++ it is being linked. */
++ uint8_t _dummy[3]; /* pad struct to 64 bytes */
++} VC_IMAGE_T;
+
-+# N.B. Whilst y == y2 as far as this loop is concerned we will start
-+# the grab for the next block before we finish with this block and that
-+# might be B where y != y2 so we must do full processing on both y and y2
++typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1];
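++// (The typedef above is a compile-time size check: the array size is 1 when
++// sizeof(VC_IMAGE_T) == 64 and negative (an error) otherwise - a pre-C11
++// stand-in for static_assert.)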
+
-+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
-+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
-+ shr r0, r4, ra_xshift ; mov r3, rb_pitch
+
-+ max r2, ra_y, 0 # y
-+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
++extern int mbox_open(void);
++extern void mbox_close(int file_desc);
+
-+ max r2, ra_y2, 0
-+ min r2, r2, rb_max_y ; mov ra7, ra8
-+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
-+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
++extern unsigned mbox_mem_lock(int file_desc, unsigned handle);
++extern unsigned mbox_mem_unlock(int file_desc, unsigned handle);
+
-+ add.setf -, rb_ef, rb_ef ; mov ra8, ra9
++int mbox_get_image_params(int fd, VC_IMAGE_T * img);
+
-+# apply horizontal filter
-+ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
-+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
-+ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
-+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
-+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
-+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++#endif
+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+new file mode 100644
+index 0000000000..3dfc35fa5c
+--- /dev/null
++++ b/libavcodec/rpi_qpu.c
+@@ -0,0 +1,939 @@
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++#include <stdint.h>
++#include <semaphore.h>
++#include "libavutil/avassert.h"
+
-+ sub.setf -, r5, 8 ; mov ra9, ra10
-+ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
-+ brr.anyn -, r:1b
-+ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
-+ mov ra10, ra11 ; mov rb10, rb11
-+ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
-+ # >>> .anyn 1b
++#include "config.h"
+
-+ # apply vertical filter and write to VPM
-+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
-+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
-+ add r1, r1, r0 ; mul24 r0, ra8, rb4
-+ add r1, r1, r0 ; mul24 r0, ra9, rb5
-+ sub r1, r1, r0 ; mul24 r0, ra10, rb6
-+ add r1, r1, r0 ; mul24 r0, ra11, rb7
-+ sub r1, r1, r0
-+# At this point r1 is a 22-bit signed quantity: 8 (original sample),
-+# +6, +6 (each pass), +1 (the passes can overflow slightly), +1 (sign)
-+# The top 8 bits have rubbish in them as mul24 is unsigned
-+# The low 6 bits need to be discarded before weighting
-+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish
-+ asr r1, r1, 14
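-+# (x256 == <<8: mul24 drops the rubbish in the old top 8 bits and lifts
-+# the sign bit towards the top, so the asr by 14 is a sign-extending >>6)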
-+ nop ; mul24 r1, r1, ra_wt_mul_l0
-+ add r1, r1, rb_wt_off ; mov r3, ra_blk_height # ; r3 = block height for outside loop
++#include <pthread.h>
++#include <time.h>
+
-+ shl r1, r1, 8 ; v8subs r0, ra_height, r3
-+ brr.anyn -, r:1b
-+ asr r1, r1, rb_wt_den_p15
-+ min r1, r1, ra_pmax ; mov -, vw_wait
-+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++#include <interface/vcsm/user-vcsm.h>
+
-+# >>> branch.anyn yloop
++#include "rpi_mailbox.h"
++#include "rpi_qpu.h"
++#include "rpi_hevc_shader.h"
++#include "rpi_hevc_transform8.h"
++#include "rpi_hevc_transform10.h"
++#include "libavutil/rpi_sand_fns.h"
+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height (currently always 16)
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
++#include "interface/vmcs_host/vc_vchi_gpuserv.h"
++#pragma GCC diagnostic pop
+
-+# If looping again then we consumed 16 height last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc rb_dma0, rb_lcount based on new segment height
++// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No)
++#define RPI_TRACE_TIME_VPU_QPU_WAIT 0
+
-+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++// Add profile flags to all QPU requests - generates output in "vcdbg log msg"
++// Beware this is expensive and will probably throw off all other timing by >10%
++#define RPI_TRACE_QPU_PROFILE_ALL 0
+
-+# DMA out
-+ bra.anyz -, ra_link
-+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
-+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
-+ shl r1, r1, i_shift23
-+# >>> .anyz ra_link
++// QPU "noflush" flags
++// a mixture of flushing & profiling
+
-+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+ add rb_lcount, rb_lcount, r0
-+ brr -, r:1b
-+ add rb_dma0, rb_dma0, r1
-+ add rb_dest, rb_dest, r2
-+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
-+# >>> 1b
-+.endm
++#define QPU_FLAGS_NO_FLUSH_VPU 1 // If unset VPU cache will be flushed
++#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2 // Clear & Enable detailed QPU profiling registers
++#define QPU_FLAGS_PROF_OUTPUT_COUNTS 4 // Print the results
++#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independant of the profiling
++#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed)
+
-+::mc_filter_y_pxx
-+ m_filter_y_pxx 8
++#define vcos_verify_ge0(x) ((x)>=0)
+
++// Size in 32bit words
++#define QPU_CODE_SIZE 4098
++#define VPU_CODE_SIZE 2048
+
-+################################################################################
++static const short rpi_transMatrix2even[32][16] = { // Even rows first
++{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
++{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90},
++{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89},
++{87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87},
++{83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83},
++{80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80},
++{75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75},
++{70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70},
++{64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64},
++{57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57},
++{50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50},
++{43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43},
++{36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36},
++{25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25},
++{18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18},
++{ 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9},
++// Odd rows
++{90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4},
++{90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
++{88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22},
++{85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31},
++{82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38},
++{78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46},
++{73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54},
++{67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61},
++{61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67},
++{54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73},
++{46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78},
++{38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82},
++{31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85},
++{22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88},
++{13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90},
++{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90}
++};
+
-+# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
-+# In a P block, only the first half of coefficients contain used information.
-+# At this point we have already issued two pairs of texture requests for the current block
-+# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time?
-+# Or possibly by taking advantage of symmetry?
++// Code/constants on GPU
++struct GPU
++{
++ unsigned int qpu_code[QPU_CODE_SIZE];
++ unsigned int vpu_code8[VPU_CODE_SIZE];
++ unsigned int vpu_code10[VPU_CODE_SIZE];
++ short transMatrix2even[16*16*2];
++};
++
++#define CFE_ENTS_PER_A 8
++// If we have a sliced frame 2048 wide @ 64 per slice then there are 32 slices
++// in a line, and we want to flush luma + chroma + a couple of extra entries,
++// so ~70 entries are needed - allow 128
++#define CFE_ENT_COUNT 128
++#define CFE_A_COUNT (CFE_ENT_COUNT / CFE_ENTS_PER_A)
+
-+.macro m_filter_y_bxx, v_bit_depth
-+ m_luma_setup v_bit_depth
++struct rpi_cache_flush_env_s {
++// unsigned int n;
++// struct vcsm_user_clean_invalid_s a[CFE_A_COUNT];
++ struct vcsm_user_clean_invalid2_s v;
++};
+
-+:1
-+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
-+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
-+ shr r0, r4, ra_xshift ; mov r3, rb_pitch
++#define WAIT_COUNT_MAX 16
+
-+ max r2, ra_y, 0 # y
-+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
++typedef struct trace_time_one_s
++{
++ int count;
++ int64_t start[WAIT_COUNT_MAX];
++ int64_t total[WAIT_COUNT_MAX];
++} trace_time_one_t;
+
-+ max r2, ra_y2, 0
-+ min r2, r2, rb_max_y ; mov ra7, ra8
-+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
-+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
++typedef struct trace_time_wait_s
++{
++ unsigned int jcount;
++ int64_t start0;
++ int64_t last_update;
++ trace_time_one_t active;
++ trace_time_one_t wait;
++} trace_time_wait_t;
+
-+ add.setf -, rb_ef, rb_ef ; mov ra8, ra9
++typedef struct vq_wait_s
++{
++ sem_t sem;
++ struct vq_wait_s * next;
++} vq_wait_t;
+
-+# apply horizontal filter
-+ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
-+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
-+ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
-+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
-+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
-+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++#define VQ_WAIT_POOL_SIZE 16
++typedef struct vq_wait_pool_s
++{
++ vq_wait_t * head;
++ vq_wait_t pool[VQ_WAIT_POOL_SIZE];
++} vq_wait_pool_t;
+
-+ sub.setf -, r5, 8 ; mov ra9, ra10
-+ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
-+ brr.anyn -, r:1b
-+ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
-+ mov ra10, ra11 ; mov rb10, rb11
-+ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
-+ # >>> .anyn 1b
++static void vq_wait_pool_init(vq_wait_pool_t * const pool);
++static void vq_wait_pool_deinit(vq_wait_pool_t * const pool);
+
-+ # apply vertical filter and write to VPM
-+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
-+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
-+ add r1, r1, r0 ; mul24 r0, ra8, rb4
-+ add r1, r1, r0 ; mul24 r0, ra9, rb5
-+ sub r1, r1, r0 ; mul24 r0, ra10, rb6
-+ add r1, r1, r0 ; mul24 r0, ra11, rb7
-+ sub r1, r1, r0 ; mov r2, rb_wt_off
-+# As with P-pred r1 is a 22-bit signed quantity in 32-bits
-+# Top 8 bits are bad - low 6 bits should be discarded
-+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
++typedef struct gpu_env_s
++{
++ int open_count;
++ int init_count;
++ int mb;
++ int vpu_i_cache_flushed;
++ GPU_MEM_PTR_T code_gm_ptr;
++ vq_wait_pool_t wait_pool;
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ trace_time_wait_t ttw;
++#endif
++} gpu_env_t;
+
-+ asr r1, r1, 14
-+ nop ; mul24 r0, r1, ra_wt_mul_l0
-+ add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0
++// Stop more than one thread trying to allocate memory or use the processing resources at once
++static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
++static gpu_env_t * gpu = NULL;
+
-+ add r1, r1, r0 ; mov r3, ra_blk_height
-+ shl r1, r1, 8 ; v8subs r0, ra_height, r3
-+ brr.anyn -, r:1b
-+ asr r1, r1, rb_wt_den_p15
-+ min r1, r1, ra_pmax ; mov -, vw_wait
-+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+# >>> branch.anyn 1b
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height (currently always 16)
++static int64_t ns_time(void)
++{
++ struct timespec ts;
++ clock_gettime(CLOCK_MONOTONIC, &ts);
++ return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec;
++}
+
-+# If looping again then we consumed 16 height last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc rb_dma0, rb_lcount based on new segment height
+
-+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000
+
-+# DMA out
-+ bra.anyz -, ra_link
-+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
-+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
-+ shl r1, r1, i_shift23
-+# >>> .anyz ra_link
++#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U)
++#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000)
++#define T_ARG(t) T_SEC(t), T_MS(t)
++#define T_FMT "%u.%03u"
+
-+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+ add rb_lcount, rb_lcount, r0
-+ brr -, r:1b
-+ add rb_dma0, rb_dma0, r1
-+ add rb_dest, rb_dest, r2
-+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
-+# >>> 1b
-+.endm
++static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix)
++{
++ // Update totals for levels that are still pending
++ for (int i = 0; i < tto->count; ++i) {
++ tto->total[i] += now - tto->start[i];
++ tto->start[i] = now;
++ }
+
-+::mc_filter_y_bxx
-+ m_filter_y_bxx 8
++ printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n",
++ prefix,
++ T_ARG(now - start0 - tto->total[0]),
++ T_ARG(tto->total[0]),
++ T_ARG(tto->total[1]),
++ T_ARG(tto->total[2]),
++ T_ARG(tto->total[3]));
++}
+
-+################################################################################
-+#
-+# typedef struct qpu_mc_pred_y_p00_s {
-+# qpu_mc_src_t next_src1;
-+# uint16_t h;
-+# uint16_t w;
-+# uint32_t wo1;
-+# uint32_t dst_addr;
-+# uint32_t next_fn;
-+# } qpu_mc_pred_y_p00_t;
+
-+.macro m_filter_y_p00, v_bit_depth
++static void tto_start(trace_time_one_t * const tto, const int64_t now)
++{
++ av_assert0(tto->count < WAIT_COUNT_MAX);
++ tto->start[tto->count++] = now;
++}
+
-+.if v_bit_depth <= 8
-+.set v_x_shift, 0
-+.set v_x_mul, 1
-+# Shifts to get width & height in the right place in rb_dma0
-+.set v_dma_h_shift, 7
-+.set v_dma_wh_shift, i_shift16
-+.else
-+.set v_x_shift, 1
-+.set v_x_mul, 2
-+# Shifts to get width & height in the right place in rb_dma0
-+.set v_dma_h_shift, 8
-+.set v_dma_wh_shift, 15
-+.endif
++static void tto_end(trace_time_one_t * const tto, const int64_t now)
++{
++ const int n = --tto->count;
++ av_assert0(n >= 0);
++ tto->total[n] += now - tto->start[n];
++}
+
-+ mov ra0, unif ; mov r3, elem_num # y_x
-+ mov ra_xshift, ra_xshift_next # [ra0 delay]
-+ add r0, ra0.16b, r3
-+.if v_x_shift != 0
-+ shl r0, r0, v_x_shift
-+.endif
++static void ttw_print(trace_time_wait_t * const ttw, const int64_t now)
++{
++ printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0));
++ tto_print(&ttw->active, now, ttw->start0, "Active");
++ tto_print(&ttw->wait, now, ttw->start0, " Wait");
++}
+
-+ max r0, r0, 0
-+ min r0, r0, rb_max_x
++#endif
+
-+ shl ra_xshift_next, r0, 3 # Compute shifts
-+ and r0, r0, -4 ; v8subs r2, r2, r2
-+ sub r2, r2, rb_pitch ; mov ra_base_next, unif # src1.base
-+ and r1, r0, r2 ; mov ra_y_next, ra0.16a
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 ; mov ra_width_height, unif # Add stripe offsets ; width_height
-+ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write
++// GPU memory alloc fns (internal)
+
-+# get width,height of block (unif load above)
-+# Compute vdw_setup1(dst_pitch-width)
-+ shl r1, ra_width, v_x_shift
-+ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
-+ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
-+ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
-+ add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif # Combine width and height of destination area ; weight_offset
-+ shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif # Shift into bits 16 upwards of the vdw_setup0 register ; dest addr
-+ add rb_dma0, r0, rb_dma0_base
++// GPU_MEM_PTR_T alloc fns
++static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
++ p->numbytes = (numbytes + 255) & ~255; // Round up to a multiple of 256
++ p->vcsm_handle = vcsm_malloc_cache(p->numbytes, VCSM_CACHE_TYPE_HOST | 0x80, (char *)"Video Frame" );
++ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
++ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
++ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
++ av_assert0(p->vcsm_handle);
++ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
++ av_assert0(p->vc_handle);
++ p->arm = vcsm_lock(p->vcsm_handle);
++ av_assert0(p->arm);
++ p->vc = mbox_mem_lock(mb, p->vc_handle);
++ av_assert0(p->vc);
++// printf("***** %s, %d\n", __func__, numbytes);
+
-+ shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 # Offset calc ; r5 = 0
-+ # For B l1 & L0 offsets should be identical so it doesn't matter which we use
-+ asr rb_wt_off, r0, 1 ; mov ra_link, unif # ; link
++ return 0;
++}
++
++static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
++ p->numbytes = numbytes;
++ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE | 0x80, (char *)"Video Frame" );
++ av_assert0(p->vcsm_handle);
++ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
++ av_assert0(p->vc_handle);
++ p->arm = vcsm_lock(p->vcsm_handle);
++ av_assert0(p->arm);
++ p->vc = mbox_mem_lock(mb, p->vc_handle);
++ av_assert0(p->vc);
++// printf("***** %s, %d\n", __func__, numbytes);
++ return 0;
++}
+
-+:1
-+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
-+ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
-+ shr r0, r4, ra_xshift ; mov r3, rb_pitch
++static void gpu_free_internal(const int mb, GPU_MEM_PTR_T * const p) {
++ mbox_mem_unlock(mb, p->vc_handle);
++ vcsm_unlock_ptr(p->arm);
++ vcsm_free(p->vcsm_handle);
++ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again
++// printf("***** %s\n", __func__);
++}
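++
++// (Each GPU_MEM_PTR_T holds two views of one buffer: p->arm, the ARM-side
++// mapping from vcsm_lock(), and p->vc, the VideoCore bus address from
++// mbox_mem_lock(); freeing must undo both, in the order above.)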
+
-+ max r2, ra_y, 0 # y
-+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
+
-+ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
-+ shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height
-+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++// GPU init, free, lock, unlock
+
-+ brr.anyn -, r:1b
-+ asr r1, r1, rb_wt_den_p15
-+ min r1, r1, ra_pmax ; mov -, vw_wait
-+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+# >>> branch.anyn 1b
++static void gpu_term(void)
++{
++ gpu_env_t * const ge = gpu;
+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height (currently always 16)
++ // We have to hope that eveything has terminated...
++ gpu = NULL;
+
-+# If looping again then we consumed 16 height last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc rb_dma0, rb_lcount based on new segment height
++ vc_gpuserv_deinit();
+
-+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++ gpu_free_internal(ge->mb, &ge->code_gm_ptr);
+
-+# DMA out
-+ bra.anyz -, ra_link
-+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
-+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
-+ shl r1, r1, i_shift23
-+# >>> .anyz ra_link
++ vcsm_exit();
+
-+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+ add rb_lcount, rb_lcount, r0
-+ brr -, r:1b
-+ add rb_dma0, rb_dma0, r1
-+ add rb_dest, rb_dest, r2
-+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
-+# >>> 1b
-+.endm
++ mbox_close(ge->mb);
+
-+::mc_filter_y_p00
-+ m_filter_y_p00 8
++ vq_wait_pool_deinit(&ge->wait_pool);
+
-+################################################################################
++ free(ge);
++}
+
-+.macro m_filter_y_b00, v_bit_depth
-+# luma setup does a fair bit more than we need, calculating filter coeffs
-+# that we will never use, but it saves I-cache to reuse it (also simple!)
-+ m_luma_setup v_bit_depth
+
-+# Fix up vals that were expecting a filter (somewhat icky)
-+ mov r0, 7
-+ sub rb_i_tmu, rb_i_tmu, r0
-+ sub rb_lcount, rb_lcount, r0
-+ mov r0, 8 ; mov r1, ra_wt_off_mul_l0
-+ shl rb_wt_off, rb_wt_off, r0
-+ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
++// Connect to QPU, returns 0 on success.
++static int gpu_init(gpu_env_t ** const gpu) {
++ volatile struct GPU* ptr;
++ gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t));
++ *gpu = NULL;
+
-+:1
-+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
-+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
-+ shr r0, r4, ra_xshift ; mov r3, rb_pitch
++ if (ge == NULL)
++ return -1;
+
-+ max r2, ra_y, 0 # y
-+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
++ if ((ge->mb = mbox_open()) < 0)
++ {
++ free(ge); // Don't leak the env on failure
++ return -1;
++ }
+
-+ max r2, ra_y2, 0
-+ min r2, r2, rb_max_y
-+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
-+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
-+ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
++ vq_wait_pool_init(&ge->wait_pool);
+
-+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
-+ add r1, r0, r1
-+ shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height
-+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++ vcsm_init();
+
-+ brr.anyn -, r:1b
-+ asr r1, r1, rb_wt_den_p15
-+ min r1, r1, ra_pmax ; mov -, vw_wait
-+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+# >>> branch.anyn 1b
++ gpu_malloc_uncached_internal(ge->mb, sizeof(struct GPU), &ge->code_gm_ptr);
++ ptr = (volatile struct GPU*)ge->code_gm_ptr.arm;
+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height (currently always 16)
++ // Zero everything so we have zeros between the code bits
++ memset((void *)ptr, 0, sizeof(*ptr));
+
-+# If looping again then we consumed 16 height last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc rb_dma0, rb_lcount based on new segment height
++ // Now copy over the QPU code into GPU memory
++ {
++ int num_bytes = (char *)mc_end - (char *)ff_hevc_rpi_shader;
++ av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
++ memcpy((void*)ptr->qpu_code, ff_hevc_rpi_shader, num_bytes);
++ }
++ // And the VPU code
++ {
++ int num_bytes = sizeof(rpi_hevc_transform8);
++ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
++ memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes);
++ }
++ {
++ int num_bytes = sizeof(rpi_hevc_transform10);
++ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
++ memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes);
++ }
++ // And the transform coefficients
++ memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
+
-+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++ *gpu = ge;
++ return 0;
++}
+
-+# DMA out
-+ bra.anyz -, ra_link
-+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
-+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
-+ shl r1, r1, i_shift23
-+# >>> .anyz ra_link
+
-+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+ add rb_lcount, rb_lcount, r0
-+ brr -, r:1b
-+ add rb_dma0, rb_dma0, r1
-+ add rb_dest, rb_dest, r2
-+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
-+# >>> 1b
-+.endm
+
-+::mc_filter_y_b00
-+ m_filter_y_b00 8
++static void gpu_unlock(void) {
++ pthread_mutex_unlock(&gpu_mutex);
++}
+
-+################################################################################
-+################################################################################
-+# 10 BIT
++// Make sure we have exclusive access to the mailbox. The gpu env must
++// already exist here - gpu_lock_ref below also creates it on first use.
++static gpu_env_t * gpu_lock(void) {
++ pthread_mutex_lock(&gpu_mutex);
+
-+::mc_setup_c10_q0
-+ m_setup_q0
-+::mc_setup_c10_qn
-+ m_setup_c 10
++ av_assert0(gpu != NULL);
++ return gpu;
++}
+
-+::mc_filter_c10_p
-+ m_filter_c_p 0, 10
++static gpu_env_t * gpu_lock_ref(void)
++{
++ pthread_mutex_lock(&gpu_mutex);
+
-+::mc_filter_c10_p_l1
-+ m_filter_c_p 1, 10
++ if (gpu == NULL) {
++ int rv = gpu_init(&gpu);
++ if (rv != 0) {
++ gpu_unlock();
++ return NULL;
++ }
++ }
+
++ ++gpu->open_count;
++ return gpu;
++}
+
-+::mc_filter_c10_b
-+ m_filter_c_b 10
++static void gpu_unlock_unref(gpu_env_t * const ge)
++{
++ if (--ge->open_count == 0)
++ gpu_term();
+
-+# Even if these fns are the same as for other bit depths we want our own copy
-+# to keep the code we are using in a single lump to avoid (direct map) cache
-+# thrashing
-+.set v_quads10, N_QPU_16 / 4
++ gpu_unlock();
++}
+
-+::mc_sync10_q0
-+ m_sync_q 0, v_quads10
-+::mc_sync10_q1
-+ m_sync_q 1, v_quads10
-+::mc_sync10_q2
-+ m_sync_q 2, v_quads10
-+::mc_sync10_q3
-+ m_sync_q 3, v_quads10
-+::mc_sync10_q4
-+ m_sync_q 4, v_quads10
-+::mc_sync10_q5
-+ m_sync_q 5, v_quads10
-+::mc_sync10_q6
-+ m_sync_q 6, v_quads10
-+::mc_sync10_q7
-+ m_sync_q 7, v_quads10
-+::mc_sync10_q8
-+ m_sync_q 8, v_quads10
-+::mc_sync10_q9
-+ m_sync_q 9, v_quads10
-+::mc_sync10_q10
-+ m_sync_q 10, v_quads10
-+::mc_sync10_q11
-+ m_sync_q 11, v_quads10
++static inline gpu_env_t * gpu_ptr(void)
++{
++ av_assert0(gpu != NULL);
++ return gpu;
++}
+
-+::mc_exit_y10_q0
-+::mc_exit_c10_q0
-+ m_exit_q0
++// Public gpu fns
+
-+::mc_exit_y10_qn
-+::mc_exit_c10_qn
-+ m_exit_qn
++// Allocate memory on GPU
++// Fills in structure containing ARM pointer, videocore handle, videocore memory address, numbytes
++// Returns 0 on success.
++// This allocates memory that will not be cached in ARM's data cache.
++// Therefore safe to use without data cache flushing.
++int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
++{
++ int r;
++ gpu_env_t * const ge = gpu_lock_ref();
++ if (ge == NULL)
++ return -1;
++ r = gpu_malloc_uncached_internal(ge->mb, numbytes, p);
++ gpu_unlock();
++ return r;
++}
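+
++// Example (illustrative only): a minimal allocation round trip. As the
++// mapping is uncached on the ARM side no cache maintenance is needed.
++//
++//   GPU_MEM_PTR_T gm;
++//   if (gpu_malloc_uncached(64 * 1024, &gm) == 0) {
++//     memset(gm.arm, 0, gm.numbytes); // CPU writes via the ARM mapping
++//     // ... pass gm.vc (the VC bus address of the same bytes) to GPU code ...
++//     gpu_free(&gm);
++//   }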
+
-+::mc_setup_y10_q0
-+ m_setup_q0
-+::mc_setup_y10_qn
-+ m_setup_y 10
++// This allocates data that will be
++// Cached in ARM L2
++// Uncached in VPU L2
++// Explicit cache maintenance (see the cache flush fns below) is therefore
++// needed around any GPU access.
++int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
++{
++ int r;
++ gpu_env_t * const ge = gpu_lock_ref();
++ if (ge == NULL)
++ return -1;
++ r = gpu_malloc_cached_internal(ge->mb, numbytes, p);
++ gpu_unlock();
++ return r;
++}
+
-+:per_block_setup_10
-+ m_per_block_setup 10
++void gpu_free(GPU_MEM_PTR_T * const p) {
++ gpu_env_t * const ge = gpu_lock();
++ gpu_free_internal(ge->mb, p);
++ gpu_unlock_unref(ge);
++}
+
-+::mc_filter_y10_pxx
-+ m_filter_y_pxx 10
++unsigned int vpu_get_fn(const unsigned int bit_depth) {
++ // Make sure that the gpu is initialized
++ av_assert0(gpu != NULL);
++ switch (bit_depth){
++ case 8:
++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8);
++ case 10:
++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10);
++ default:
++ av_assert0(0);
++ }
++ return 0;
++}
+
-+::mc_filter_y10_p00
-+ m_filter_y_p00 10
++unsigned int vpu_get_constants(void) {
++ av_assert0(gpu != NULL);
++ return gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even);
++}
+
-+::mc_filter_y10_bxx
-+ m_filter_y_bxx 10
++int gpu_get_mailbox(void)
++{
++ av_assert0(gpu);
++ return gpu->mb;
++}
+
-+::mc_filter_y10_b00
-+ m_filter_y_b00 10
++void gpu_ref(void)
++{
++ gpu_lock_ref();
++ gpu_unlock();
++}
+
++void gpu_unref(void)
++{
++ gpu_env_t * const ge = gpu_lock();
++ gpu_unlock_unref(ge);
++}
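+
++// Example (illustrative only): hold a ref for the lifetime of a decoder so
++// the GPU env is built once and torn down on the final unref.
++//
++//   gpu_ref();    // decoder init
++//   ...
++//   gpu_unref();  // decoder close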
+
++// ----------------------------------------------------------------------------
++//
++// Cache flush functions
+
-+::mc_end
-+# Do not add code here because mc_end must appear after all other code.
-diff --git a/libavcodec/rpi_shader_cmd.h b/libavcodec/rpi_shader_cmd.h
-new file mode 100644
-index 0000000000..9f8983da52
---- /dev/null
-+++ b/libavcodec/rpi_shader_cmd.h
-@@ -0,0 +1,128 @@
-+#ifndef RPI_SHADER_CMD_H
-+#define RPI_SHADER_CMD_H
++#define CACHE_EL_MAX 16
+
-+#pragma pack(push, 4)
++rpi_cache_flush_env_t * rpi_cache_flush_init(void)
++{
++ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t) +
++ sizeof(struct vcsm_user_clean_invalid2_block_s) * CACHE_EL_MAX);
++ if (rfe == NULL)
++ return NULL;
+
-+#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y
-+// If mixed then we are just confused and get a lot of warnings....
-+typedef const uint8_t * qpu_mc_src_addr_t;
-+typedef uint8_t * qpu_mc_dst_addr_t;
-+#else
-+typedef uint32_t qpu_mc_src_addr_t;
-+typedef uint32_t qpu_mc_dst_addr_t;
-+#endif
++ rfe->v.op_count = 0;
++ return rfe;
++}
+
-+typedef struct qpu_mc_src_s
++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe)
+{
-+ int16_t y;
-+ int16_t x;
-+ qpu_mc_src_addr_t base;
-+} qpu_mc_src_t;
-+
++ if (rfe != NULL)
++ free(rfe);
++}
+
-+typedef struct qpu_mc_pred_c_p_s {
-+ qpu_mc_src_t next_src;
-+ uint16_t h;
-+ uint16_t w;
-+ uint32_t coeffs_x;
-+ uint32_t coeffs_y;
-+ uint32_t wo_u;
-+ uint32_t wo_v;
-+ qpu_mc_dst_addr_t dst_addr_c;
-+ uint32_t next_fn;
-+} qpu_mc_pred_c_p_t;
++int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe)
++{
++ int rc = 0;
++ if (rfe->v.op_count != 0) {
++ if (vcsm_clean_invalid2(&rfe->v) != 0)
++ {
++ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid2 failed: errno=%d\n", errno);
++ rc = -1;
++ }
++ rfe->v.op_count = 0;
++ }
++ return rc;
++}
+
-+typedef struct qpu_mc_pred_c_b_s {
-+ qpu_mc_src_t next_src1;
-+ uint16_t h;
-+ uint16_t w;
-+ uint32_t coeffs_x1;
-+ uint32_t coeffs_y1;
-+ uint32_t weight_u1;
-+ uint32_t weight_v1;
-+ qpu_mc_src_t next_src2;
-+ uint32_t coeffs_x2;
-+ uint32_t coeffs_y2;
-+ uint32_t wo_u2;
-+ uint32_t wo_v2;
-+ qpu_mc_dst_addr_t dst_addr_c;
-+ uint32_t next_fn;
-+} qpu_mc_pred_c_b_t;
++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe)
++{
++ int rc = rpi_cache_flush_execute(rfe);
+
-+typedef struct qpu_mc_pred_c_s_s {
-+ qpu_mc_src_t next_src1;
-+ uint32_t pic_cw; // C Width (== Y width / 2)
-+ uint32_t pic_ch; // C Height (== Y Height / 2)
-+ uint32_t stride2;
-+ uint32_t stride1;
-+ uint32_t wdenom;
-+ qpu_mc_src_t next_src2;
-+ uint32_t next_fn;
-+} qpu_mc_pred_c_s_t;
++ free(rfe);
++ return rc;
++}
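+
++// Example (illustrative only): batch maintenance on an ARM-cached buffer
++// ("gm" from gpu_malloc_cached) around a GPU pass; one vcsm call covers all
++// the accumulated ranges.
++//
++//   rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
++//   if (rfe != NULL) {
++//     rpi_cache_flush_add_gm_range(rfe, &gm, RPI_CACHE_FLUSH_MODE_WRITEBACK, 0, gm.numbytes);
++//     rpi_cache_flush_finish(rfe); // executes the ops then frees rfe
++//   }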
+
-+typedef struct qpu_mc_pred_c_s {
-+ union {
-+ qpu_mc_pred_c_p_t p;
-+ qpu_mc_pred_c_b_t b;
-+ qpu_mc_pred_c_s_t s;
-+ };
-+} qpu_mc_pred_c_t;
++inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride)
++{
++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
+
++ av_assert0(rfe->v.op_count <= CACHE_EL_MAX);
+
-+typedef struct qpu_mc_pred_y_p_s {
-+ qpu_mc_src_t next_src1;
-+ qpu_mc_src_t next_src2;
-+ uint16_t h;
-+ uint16_t w;
-+ uint32_t mymx21;
-+ uint32_t wo1;
-+ uint32_t wo2;
-+ qpu_mc_dst_addr_t dst_addr;
-+ uint32_t next_fn;
-+} qpu_mc_pred_y_p_t;
++ b->invalidate_mode = mode;
++ b->block_count = blocks;
++ b->start_address = gm->arm + offset0;
++ b->block_size = block_size;
++ b->inter_block_stride = block_stride;
++}
+
-+typedef struct qpu_mc_pred_y_p00_s {
-+ qpu_mc_src_t next_src1;
-+ uint16_t h;
-+ uint16_t w;
-+ uint32_t wo1;
-+ qpu_mc_dst_addr_t dst_addr;
-+ uint32_t next_fn;
-+} qpu_mc_pred_y_p00_t;
++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
++ const unsigned int offset, const unsigned int size)
++{
++ // Deal with empty pointer trivially
++ if (gm == NULL || size == 0)
++ return;
+
-+typedef struct qpu_mc_pred_y_s_s {
-+ qpu_mc_src_t next_src1;
-+ qpu_mc_src_t next_src2;
-+ uint16_t pic_h;
-+ uint16_t pic_w;
-+ uint32_t stride2;
-+ uint32_t stride1;
-+ uint32_t wdenom;
-+ uint32_t next_fn;
-+} qpu_mc_pred_y_s_t;
++ av_assert0(offset <= gm->numbytes);
++ av_assert0(size <= gm->numbytes);
++ av_assert0(offset + size <= gm->numbytes);
+
-+// Only a useful structure in that it allows us to return something other than a void *
-+typedef struct qpu_mc_pred_y_s {
-+ union {
-+ qpu_mc_pred_y_p_t p;
-+ qpu_mc_pred_y_p00_t p00;
-+ qpu_mc_pred_y_s_t s;
-+ };
-+} qpu_mc_pred_y_t;
++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0);
++}
+
-+typedef union qpu_mc_pred_cmd_u {
-+ qpu_mc_pred_y_t y;
-+ qpu_mc_pred_c_t c;
-+ uint32_t data[1];
-+} qpu_mc_pred_cmd_t;
++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode)
++{
++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0);
++}
+
-+#define QPU_MC_PRED_N_Y8 12
-+#define QPU_MC_PRED_N_C8 12
+
-+#define QPU_MC_PRED_N_Y10 12
-+#define QPU_MC_PRED_N_C10 12
++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode)
++{
++#if !RPI_ONE_BUF
++#error Fixme! (NIF)
++#endif
++ if (gpu_is_buf1(frame)) {
++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode);
++ }
++ else
++ {
++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode);
++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode);
++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode);
++ }
++}
+
-+#pragma pack(pop)
++// Flush an area of a frame
++// Width, height, x0, y0 in luma pels
++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode,
++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
++ const unsigned int uv_shift, const int do_luma, const int do_chroma)
++{
++ const unsigned int y_offset = frame->linesize[0] * y0;
++ const unsigned int y_size = frame->linesize[0] * height;
++ // Round UV up/down to get everything
++ const unsigned int uv_rnd = (1U << uv_shift) >> 1;
++ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift);
++ const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset;
+
++#if 0
++ // *** frame->height is cropped height so not good
++ // As all unsigned they will also reject -ve
++ // Test individually as well as added to reject overflow
++ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped
++ av_assert0(n <= (unsigned int)frame->height);
++ av_assert0(start_line + n <= (unsigned int)frame->height);
+#endif
+
-diff --git a/libavcodec/rpi_shader_template.c b/libavcodec/rpi_shader_template.c
-new file mode 100644
-index 0000000000..2d763f54ef
---- /dev/null
-+++ b/libavcodec/rpi_shader_template.c
-@@ -0,0 +1,66 @@
-+#ifdef RPI
-+
-+#include "hevc.h"
-+#include "hevcdec.h"
-+#include "libavutil/rpi_sand_fns.h"
-+#include "rpi_shader_cmd.h"
-+#include "rpi_shader_template.h"
++ if (!gpu_is_buf1(frame))
++ {
++ if (do_luma) {
++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size);
++ }
++ if (do_chroma) {
++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size);
++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size);
++ }
++ }
++ else if (!av_rpi_is_sand_frame(frame))
++ {
++ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
++ if (do_luma) {
++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size);
++ }
++ if (do_chroma) {
++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size);
++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size);
++ }
++ }
++ else
++ {
++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
++ const unsigned int xshl = av_rpi_sand_frame_xshl(frame);
++ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1);
++ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C
++ av_assert0(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX);
+
-+typedef struct shader_track_s
-+{
-+ const union qpu_mc_pred_cmd_u *qpu_mc_curr;
-+ const struct qpu_mc_src_s *last_l0;
-+ const struct qpu_mc_src_s *last_l1;
-+ uint32_t width; // pic_width * PW
-+ uint32_t height;
-+ uint32_t stride2;
-+ uint32_t stride1;
-+ uint32_t wdenom;
-+} shader_track_t;
++ if (do_chroma)
++ {
++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
++ b->invalidate_mode = mode;
++ b->block_count = block_count;
++ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1);
++ b->block_size = uv_size;
++ b->inter_block_stride = stride1 * stride2;
++ }
++ if (do_luma)
++ {
++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
++ b->invalidate_mode = mode;
++ b->block_count = block_count;
++ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0);
++ b->block_size = y_size;
++ b->inter_block_stride = stride1 * stride2;
++ }
++ }
++}
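+
++// Example (illustrative only): given an open rfe, invalidate a 64x64 luma
++// region at (x0, y0) of a 4:2:0 frame (uv_shift == 1) before the CPU reads
++// GPU output:
++//
++//   rpi_cache_flush_add_frame_block(rfe, frame, RPI_CACHE_FLUSH_MODE_INVALIDATE,
++//                                   x0, y0, 64, 64, 1, 1, 1);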
+
-+static int wtoidx(const unsigned int w)
++// One-shot flush of a complete gm block with the given mode: init, add, finish
++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode)
+{
-+ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
-+ return pel_weight[w];
++ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init();
++ rpi_cache_flush_add_gm_ptr(rfe, p, mode);
++ rpi_cache_flush_finish(rfe);
+}
+
-+static const int fctom(uint32_t x)
++
++// ----------------------------------------------------------------------------
++
++
++// Wait abstractions - mostly so we can easily add profile code
++static void vq_wait_pool_init(vq_wait_pool_t * const wp)
+{
-+ int rv;
-+ // As it happens we can take the 2nd filter term & divide it by 8
-+ // (dropping fractions) to get the fractional move
-+ rv = 8 - ((x >> 11) & 0xf);
-+ av_assert2(rv >= 0 && rv <= 7);
-+ return rv;
++ unsigned int i;
++ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
++ sem_init(&wp->pool[i].sem, 0, 0);
++ wp->pool[i].next = wp->pool + i + 1;
++ }
++ wp->head = wp->pool + 0;
++ wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL;
+}
+
-+static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr)
++static void vq_wait_pool_deinit(vq_wait_pool_t * const wp)
+{
-+ return (x << shl) >> shr;
++ unsigned int i;
++ wp->head = NULL;
++ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
++ sem_destroy(&wp->pool[i].sem);
++ wp->pool[i].next = NULL;
++ }
+}
+
-+static inline int woff_p(HEVCContext *const s, int32_t x)
++
++// Wait objs are pooled (rather than created on demand) so we don't pay for
++// sem_init on every use
++static vq_wait_t * vq_wait_new(void)
+{
-+ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8);
++ gpu_env_t * const ge = gpu_lock_ref();
++ vq_wait_t * const wait = ge->wait_pool.head;
++ ge->wait_pool.head = wait->next;
++ wait->next = NULL;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ tto_start(&ge->ttw.active, ns_time());
++#endif
++
++ gpu_unlock();
++ return wait;
+}
+
-+static inline int woff_b(HEVCContext *const s, int32_t x)
++static void vq_wait_delete(vq_wait_t * const wait)
+{
-+ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8);
++ gpu_env_t * const ge = gpu_lock();
++ wait->next = ge->wait_pool.head;
++ ge->wait_pool.head = wait;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ {
++ trace_time_wait_t * const ttw = &ge->ttw;
++ const int64_t now = ns_time();
++ ++ttw->jcount;
++ tto_end(&ttw->wait, now);
++
++ if (ttw->start0 == 0)
++ {
++ ttw->start0 = ttw->active.start[0];
++ ttw->last_update = ttw->start0;
++ }
++ if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD)
++ {
++ ttw->last_update += WAIT_TIME_PRINT_PERIOD;
++ ttw_print(ttw, now);
++ }
++ }
++#endif
++ gpu_unlock_unref(ge);
+}
+
-+static inline int wweight(int32_t x)
++static void vq_wait_wait(vq_wait_t * const wait)
+{
-+ return ext(x, 16, 16);
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ {
++ const int64_t now = ns_time();
++ gpu_env_t * const ge = gpu_lock();
++ tto_start(&ge->ttw.wait, now);
++ gpu_unlock();
++ }
++#endif
++
++ while (sem_wait(&wait->sem) == -1 && errno == EINTR)
++ /* loop */;
+}
+
++static void vq_wait_post(vq_wait_t * const wait)
++{
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ {
++ gpu_env_t *const ge = gpu_lock();
++ tto_end(&ge->ttw.active, ns_time());
++ gpu_unlock();
++ }
++#endif
+
-+#define PW 1
-+#include "rpi_shader_template_fn.h"
++ sem_post(&wait->sem);
++}
+
-+#undef PW
-+#define PW 2
-+#include "rpi_shader_template_fn.h"
+
-+#endif
+
-diff --git a/libavcodec/rpi_shader_template.h b/libavcodec/rpi_shader_template.h
-new file mode 100644
-index 0000000000..ecf5b8185a
---- /dev/null
-+++ b/libavcodec/rpi_shader_template.h
-@@ -0,0 +1,24 @@
-+#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H
-+#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H
++// Header comments were wrong for these two
++#define VPU_QPU_MASK_QPU 1
++#define VPU_QPU_MASK_VPU 2
+
-+#ifdef RPI
-+struct HEVCContext;
-+struct HEVCRpiInterPredEnv;
++#define VPU_QPU_JOB_MAX 4
++struct vpu_qpu_job_env_s
++{
++ unsigned int n;
++ unsigned int mask;
++ struct gpu_job_s j[VPU_QPU_JOB_MAX];
++};
+
-+void rpi_shader_c8(struct HEVCContext *const s,
-+ const struct HEVCRpiInterPredEnv *const ipe_y,
-+ const struct HEVCRpiInterPredEnv *const ipe_c);
++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
+
-+void rpi_shader_c16(struct HEVCContext *const s,
-+ const struct HEVCRpiInterPredEnv *const ipe_y,
-+ const struct HEVCRpiInterPredEnv *const ipe_c);
++vpu_qpu_job_env_t * vpu_qpu_job_new(void)
++{
++ vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t));
++ return vqj;
++}
+
-+void rpi_sand_dump8(const char * const name,
-+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
++void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj)
++{
++ memset(vqj, 0, sizeof(*vqj));
++ free(vqj);
++}
+
-+void rpi_sand_dump16(const char * const name,
-+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
++static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj)
++{
++ struct gpu_job_s * const j = vqj->j + vqj->n++;
++ av_assert0(vqj->n <= VPU_QPU_JOB_MAX);
++ return j;
++}
+
-+#endif
-+#endif
++void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code,
++ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5)
++{
++ if (vpu_code != 0) {
++ struct gpu_job_s *const j = new_job(vqj);
++ vqj->mask |= VPU_QPU_MASK_VPU;
+
-diff --git a/libavcodec/rpi_shader_template_fn.h b/libavcodec/rpi_shader_template_fn.h
-new file mode 100644
-index 0000000000..b5ac2ceed6
---- /dev/null
-+++ b/libavcodec/rpi_shader_template_fn.h
-@@ -0,0 +1,477 @@
-+#define STRCAT(x,y) x##y
++ j->command = EXECUTE_VPU;
++ // The bottom two bits of the execute address contain no-flush flags
++ // b0 will flush the VPU I-cache if unset so we nearly always want that set
++ // as we never reload code
++ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed;
++ j->u.v.q[1] = r0;
++ j->u.v.q[2] = r1;
++ j->u.v.q[3] = r2;
++ j->u.v.q[4] = r3;
++ j->u.v.q[5] = r4;
++ j->u.v.q[6] = r5;
++ gpu->vpu_i_cache_flushed = 1;
++ }
++}
+
-+#if PW == 1
-+#define pixel uint8_t
-+#define FUNC(f) STRCAT(f, 8)
-+#elif PW == 2
-+#define pixel uint16_t
-+#define FUNC(f) STRCAT(f, 16)
++// flags are QPU_FLAGS_xxx
++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail)
++{
++ if (n != 0) {
++ struct gpu_job_s *const j = new_job(vqj);
++ vqj->mask |= VPU_QPU_MASK_QPU;
++
++ j->command = EXECUTE_QPU;
++ j->u.q.jobs = n;
++#if RPI_TRACE_QPU_PROFILE_ALL
++ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS;
+#else
-+#error Unexpected PW
++ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU;
+#endif
-+
-+#define PATCH_STRIDE (16 * PW)
-+
-+static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
-+{
-+ for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) {
-+ const pixel s = *(const pixel *)src;
-+ pixel * d = (pixel *)dst;
-+ for (unsigned int j = 0; j < w; j += PW) {
-+ *d++ = s;
-+ }
-+ }
++ j->u.q.timeout = 5000;
++ memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
++ }
+}
+
-+static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
++// Convert callback to sem post
++static void vpu_qpu_job_callback_wait(void * v)
+{
-+ for (unsigned int i = 0; i != h; ++i, dst += stride) {
-+ memcpy(dst, src, w);
-+ }
++ vq_wait_post(v);
+}
+
-+static void FUNC(get_patch_y)(const shader_track_t * const st,
-+ uint8_t * dst, const unsigned int dst_stride,
-+ const qpu_mc_src_t *src,
-+ unsigned int _w, unsigned int _h)
++void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h)
+{
-+ int x = src->x * PW;
-+ int y = src->y;
-+ int w = _w * PW;
-+ int h = _h;
-+ int dl = 0;
-+ int dr = 0;
-+ int dt = 0;
-+ int db = 0;
-+
-+ if (x < 0) {
-+ if (-x >= w)
-+ x = PW - w;
-+ dl = -x;
-+ w += x;
-+ x = 0;
-+ }
-+ if (x + w > st->width) {
-+ if (x >= st->width)
-+ x = st->width - PW;
-+ dr = (x + w) - st->width;
-+ w = st->width - x;
-+ }
++ vq_wait_t * wait;
+
-+ // Y
-+ if (y < 0) {
-+ if (-y >= h)
-+ y = 1 - h;
-+ dt = -y;
-+ h += y;
-+ y = 0;
-+ }
-+ if (y + h > st->height) {
-+ if (y >= st->height)
-+ y = st->height - 1;
-+ db = (y + h) - st->height;
-+ h = st->height - y;
-+ }
++ if (vqj->mask == 0) {
++ *wait_h = NULL;
++ return;
++ }
+
-+ dst += dl + dt * dst_stride;
-+ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
++ // We are going to want a sync object
++ wait = vq_wait_new();
+
-+ // Edge dup
-+ if (dl != 0)
-+ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride);
-+ if (dr != 0)
-+ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride);
-+ w += dl + dr;
-+ dst -= dl;
++ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
++ // If we only posted one thing or only QPU jobs
++ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
++ {
++ struct gpu_job_s * const j = vqj->j + (vqj->n - 1);
++ av_assert0(j->callback.func == 0);
+
-+ if (dt != 0)
-+ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride);
-+ if (db != 0)
-+ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride);
-+}
++ j->callback.func = vpu_qpu_job_callback_wait;
++ j->callback.cookie = wait;
++ }
++ else
++ {
++ struct gpu_job_s *const j = new_job(vqj);
+
++ j->command = EXECUTE_SYNC;
++ j->u.s.mask = vqj->mask;
++ j->callback.func = vpu_qpu_job_callback_wait;
++ j->callback.cookie = wait;
++ }
+
++ vqj->mask = 0;
++ *wait_h = wait;
++}
+
-+static void FUNC(get_patch_c)(const shader_track_t * const st,
-+ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride,
-+ const qpu_mc_src_t *src,
-+ unsigned int _w, unsigned int _h)
++int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj)
+{
-+ int x = src->x * PW;
-+ int y = src->y;
-+ int w = _w * PW;
-+ int h = _h;
-+ int dl = 0;
-+ int dr = 0;
-+ int dt = 0;
-+ int db = 0;
-+ const int width = st->width;
-+ const int height = st->height;
++ return vqj->n == 0 ? 0 : vc_gpuserv_execute_code(vqj->n, vqj->j);
++}
+
-+ if (x < 0) {
-+ if (-x >= w)
-+ x = PW - w;
-+ dl = -x;
-+ w += x;
-+ x = 0;
-+ }
-+ if (x + w > width) {
-+ if (x >= width)
-+ x = width - PW;
-+ dr = (x + w) - width;
-+ w = width - x;
-+ }
++// Simple wrapper of start + delete
++int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj)
++{
++ int rv;
++ rv = vpu_qpu_job_start(vqj);
++ vpu_qpu_job_delete(vqj);
++ return rv;
++}
+
-+ // Y
-+ if (y < 0) {
-+ if (-y >= h)
-+ y = 1 - h;
-+ dt = -y;
-+ h += y;
-+ y = 0;
-+ }
-+ if (y + h > height) {
-+ if (y >= height)
-+ y = height - 1;
-+ db = (y + h) - height;
-+ h = height - y;
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
++{
++ if (wait_h != NULL)
++ {
++ vq_wait_t * const wait = *wait_h;
++ if (wait != NULL) {
++ *wait_h = NULL;
++ vq_wait_wait(wait);
++ vq_wait_delete(wait);
+ }
++ }
++}
+
-+ dst_u += dl + dt * dst_stride;
-+ dst_v += dl + dt * dst_stride;
-+ FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
++int vpu_qpu_init(void)
++{
++ gpu_env_t * const ge = gpu_lock_ref();
++ if (ge == NULL)
++ return -1;
+
-+ // Edge dup
-+ if (dl != 0)
-+ {
-+ FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride);
-+ FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride);
-+ }
-+ if (dr != 0)
-+ {
-+ FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride);
-+ FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride);
-+ }
-+ w += dl + dr;
-+ dst_u -= dl;
-+ dst_v -= dl;
++ if (ge->init_count++ == 0)
++ {
++ vc_gpuserv_init();
++ }
+
-+ if (dt != 0)
-+ {
-+ FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride);
-+ FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride);
-+ }
-+ if (db != 0)
-+ {
-+ FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride);
-+ FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride);
-+ }
++ gpu_unlock();
++ return 0;
+}
+
-+// w, y, w, h in pixels
-+// stride1, stride2 in bytes
-+void FUNC(rpi_sand_dump)(const char * const name,
-+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c)
++void vpu_qpu_term(void)
+{
-+ const int mask = stride2 == 0 ? ~0 : stride1 - 1;
-+
-+ printf("%s (%d,%d) %dx%d\n", name, x, y, w, h);
++ gpu_env_t * const ge = gpu_lock();
+
-+ if (is_c) {
-+ x *= 2;
-+ w *= 2;
-+ }
++ if (--ge->init_count == 0) {
++ vc_gpuserv_deinit();
+
-+ for (int i = y; i != y + h; ++i) {
-+ for (int j = x; j != x + w; ++j) {
-+ const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2;
-+ char sep = is_c && (j & 1) == 0 ? ':' : ' ';
-+#if PW == 1
-+ if (j < 0 || i < 0)
-+ printf("..%c", sep);
-+ else
-+ printf("%02x%c", *(const pixel*)p, sep);
-+#else
-+ if (j < 0 || i < 0)
-+ printf("...%c", sep);
-+ else
-+ printf("%03x%c", *(const pixel*)p, sep);
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ ttw_print(&ge->ttw, ns_time());
+#endif
-+ }
-+ printf("\n");
-+ }
++ }
++
++ gpu_unlock_unref(ge);
++}
++
++uint32_t qpu_fn(const int * const mc_fn)
++{
++ return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)ff_hevc_rpi_shader) + offsetof(struct GPU, qpu_code);
+}
+
+
-+void FUNC(rpi_shader_c)(HEVCContext *const s,
-+ const HEVCRpiInterPredEnv *const ipe_y,
-+ const HEVCRpiInterPredEnv *const ipe_c)
++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth)
+{
-+ for (int c_idx = 0; c_idx < 2; ++c_idx)
-+ {
-+ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c;
-+ shader_track_t tracka[QPU_N_MAX] = {{NULL}};
-+ unsigned int exit_n = 0;
++ // Dummy values we can catch with emulation
++ qf->y_pxx = ~1U;
++ qf->y_bxx = ~2U;
++ qf->y_p00 = ~3U;
++ qf->y_b00 = ~4U;
++ qf->c_pxx = ~5U;
++ qf->c_bxx = ~6U;
+
-+ if (ipe == NULL || !ipe->used) {
-+ continue;
-+ }
++ switch (bit_depth) {
++ case 8:
++ qf->y_pxx = qpu_fn(mc_filter_y_pxx);
++ qf->y_bxx = qpu_fn(mc_filter_y_bxx);
++ qf->y_p00 = qpu_fn(mc_filter_y_p00);
++ qf->y_b00 = qpu_fn(mc_filter_y_b00);
++ qf->c_pxx = qpu_fn(mc_filter_c_p);
++ qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1);
++ qf->c_bxx = qpu_fn(mc_filter_c_b);
++ break;
++ case 10:
++ qf->c_pxx = qpu_fn(mc_filter_c10_p);
++ qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1);
++ qf->c_bxx = qpu_fn(mc_filter_c10_b);
++ qf->y_pxx = qpu_fn(mc_filter_y10_pxx);
++ qf->y_bxx = qpu_fn(mc_filter_y10_bxx);
++ qf->y_p00 = qpu_fn(mc_filter_y10_p00);
++ qf->y_b00 = qpu_fn(mc_filter_y10_b00);
++ break;
++ default:
++ return -1;
++ }
++ return 0;
++}
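+
++// Example (illustrative only): fetch the shader entry points once the GPU
++// env is up; the returned values are VC addresses of the QPU code blobs.
++//
++//   HEVCRpiQpu qf;
++//   if (rpi_hevc_qpu_init_fn(&qf, 8) != 0)
++//     goto fail; // unsupported bit depth
++//   // qf.y_pxx etc. are now usable as QPU code addresses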
+
-+ do {
-+ for (unsigned int i = 0; i != ipe->n; ++i) {
-+ const HEVCRpiInterPredQ * const q = ipe->q + i;
-+ shader_track_t * const st = tracka + i;
-+ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr;
+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+new file mode 100644
+index 0000000000..9389047f8e
+--- /dev/null
++++ b/libavcodec/rpi_qpu.h
+@@ -0,0 +1,208 @@
++#ifndef RPI_QPU_H
++#define RPI_QPU_H
+
-+ for (;;) {
-+ const uint32_t link = (cmd == q->qpu_mc_base) ? q->code_setup : ((uint32_t *)cmd)[-1];
++#define RPI_ONE_BUF 1
+
-+ if (link == q->code_setup) {
-+ if (c_idx == 0) {
-+ // Luma
-+ const qpu_mc_pred_y_s_t *const c = &cmd->y.s;
++typedef struct gpu_mem_ptr_s {
++ unsigned char *arm; // Pointer to memory mapped on ARM side
++ int vc_handle; // Videocore handle of relocatable memory
++ int vcsm_handle; // Handle for use by VCSM
++ int vc; // Address for use in GPU code
++ int numbytes; // Size of memory block
++} GPU_MEM_PTR_T;
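+
++// All the fields describe the same physical block: "arm" is the CPU view,
++// "vc" the GPU view. Illustrative invariant (not checked anywhere):
++//
++//   offset = ptr - p->arm;      // byte offset of ptr within the block
++//   vc_addr = p->vc + offset;   // the same byte as seen by the GPU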
+
-+ st->height = c->pic_h;
-+ st->width = c->pic_w * PW;
-+ st->stride1 = c->stride1;
-+ st->stride2 = c->stride2;
-+ st->wdenom = c->wdenom;
-+ st->last_l0 = &c->next_src1;
-+ st->last_l1 = &c->next_src2;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else {
-+ // Chroma
-+ const qpu_mc_pred_c_s_t *const c = &cmd->c.s;
++// General GPU functions
++extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
++extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
++extern void gpu_free(GPU_MEM_PTR_T * const p);
+
-+ st->height = c->pic_ch;
-+ st->width = c->pic_cw * PW;
-+ st->stride1 = c->stride1;
-+ st->stride2 = c->stride2;
-+ st->wdenom = c->wdenom;
-+ st->last_l0 = &c->next_src1;
-+ st->last_l1 = &c->next_src2;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ }
-+ else if (link == s->qpu.y_pxx) {
-+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
-+ const int w1 = FFMIN(c->w, 8);
-+ const int w2 = c->w - w1;
++#include "libavutil/frame.h"
++#if !RPI_ONE_BUF
++static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
++ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[0]);
++ return p->vc;
++}
+
-+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
++ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
++ return p->vc;
++}
+
-+ FUNC(get_patch_y)(st,
-+ patch_y1, PATCH_STRIDE,
-+ st->last_l0,
-+ 16, c->h + 7);
-+ if (w2 > 0) {
-+ FUNC(get_patch_y)(st,
-+ patch_y2, PATCH_STRIDE,
-+ st->last_l1,
-+ 16, c->h + 7);
-+ }
++static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
++ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[2]);
++ return p->vc;
++}
+
-+ // wo[offset] = offset*2+1
-+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
-+ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
-+ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1);
-+ if (w2 > 0) {
-+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
-+ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
-+ c->h, st->wdenom, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2);
-+ }
-+ st->last_l0 = &c->next_src1;
-+ st->last_l1 = &c->next_src2;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else if (link == s->qpu.y_bxx) {
-+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
++ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[0]);
++}
+
-+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
++ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[1]);
++}
+
-+ FUNC(get_patch_y)(st,
-+ patch_y1, PATCH_STRIDE,
-+ st->last_l0,
-+ 16, c->h + 7);
-+ FUNC(get_patch_y)(st,
-+ patch_y2, PATCH_STRIDE,
-+ st->last_l1,
-+ 16, c->h + 7);
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
++ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[2]);
++}
+
-+ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
-+ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
-+ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w);
++#else
+
-+ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
-+ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3,
-+ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2),
-+ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w);
-+ st->last_l0 = &c->next_src1;
-+ st->last_l1 = &c->next_src2;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else if (link == s->qpu.y_p00) {
-+ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00;
++static inline int gpu_is_buf1(const AVFrame * const frame)
++{
++ return frame->buf[1] == NULL;
++}
+
-+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame)
++{
++ return av_buffer_get_opaque(frame->buf[0]);
++}
+
-+ FUNC(get_patch_y)(st,
-+ patch_y1, PATCH_STRIDE,
-+ st->last_l0,
-+ 16, c->h + 7);
++static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n)
++{
++ return av_buffer_pool_opaque(frame->buf[n]);
++}
+
-+ // wo[offset] = offset*2+1
-+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0](
-+ (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE,
-+ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w);
++static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n)
++{
++ const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n);
++ return gm->vc + (frame->data[n] - gm->arm);
++}
+
-+ st->last_l0 = &c->next_src1;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else if (link == s->qpu.y_b00) {
-+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
+
-+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
++static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
++ return get_vc_address3(frame, 0);
++}
+
-+ av_assert0(c->w <= 16 && c->h <= 64);
++static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
++ return get_vc_address3(frame, 1);
++}
+
-+ FUNC(get_patch_y)(st,
-+ patch_y1, PATCH_STRIDE,
-+ st->last_l0,
-+ 16, c->h);
-+ FUNC(get_patch_y)(st,
-+ patch_y2, PATCH_STRIDE,
-+ st->last_l1,
-+ 16, c->h);
++static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
++ return get_vc_address3(frame, 2);
++}
++
++#if 0
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
++ if (gpu_is_buf1(frame))
++ {
++ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
++ g.numbytes = frame->data[1] - frame->data[0];
++ return g;
++ }
++ else
++ return *gpu_buf3_gmem(frame, 0);
++}
++
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
++ if (gpu_is_buf1(frame))
++ {
++ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
++ g.arm += frame->data[1] - frame->data[0];
++ g.vc += frame->data[1] - frame->data[0];
++ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
++ return g;
++ }
++ else
++ return *gpu_buf3_gmem(frame, 1);
++}
+
-+ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0](
-+ patch_y3, patch_y1, PATCH_STRIDE,
-+ c->h, 0, 0, c->w);
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
++ if (gpu_is_buf1(frame))
++ {
++ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
++ g.arm += frame->data[2] - frame->data[0];
++ g.vc += frame->data[2] - frame->data[0];
++ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
++ return g;
++ }
++ else
++ return *gpu_buf3_gmem(frame, 2);
++}
++#endif
++#endif
+
-+ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0](
-+ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3,
-+ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2),
-+ 0, woff_b(s, c->wo2), 0, 0, c->w);
-+ st->last_l0 = &c->next_src1;
-+ st->last_l1 = &c->next_src2;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else if (link == s->qpu.c_pxx) {
-+ const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
-+ const int mx = fctom(c->coeffs_x);
-+ const int my = fctom(c->coeffs_y);
++// Cache flush stuff
+
-+ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ uint8_t patch_u3[8 * 16 * PW];
-+ uint8_t patch_v3[8 * 16 * PW];
++struct rpi_cache_flush_env_s;
++typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t;
+
-+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
++rpi_cache_flush_env_t * rpi_cache_flush_init(void);
++// Free env without flushing
++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe);
++// Do the accumulated flush & clear but do not free the env
++int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe);
++// Do the accumulated flush & free the env
++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe);
+
-+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
-+ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
-+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
-+ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
++typedef enum
++{
++ RPI_CACHE_FLUSH_MODE_INVALIDATE = 1,
++ RPI_CACHE_FLUSH_MODE_WRITEBACK = 2,
++ RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3
++} rpi_cache_flush_mode_t;
+
-+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode);
++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode,
++ const unsigned int offset, const unsigned int size);
++void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride);
++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode);
++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode,
++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
++ const unsigned int uv_shift, const int do_luma, const int do_chroma);
+
-+ st->last_l0 = &c->next_src;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else if (link == s->qpu.c_pxx_l1) {
-+ const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
-+ const int mx = fctom(c->coeffs_x);
-+ const int my = fctom(c->coeffs_y);
++// init, add, finish for one gm ptr
++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode);
+
-+ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ uint8_t patch_u3[8 * 16 * PW];
-+ uint8_t patch_v3[8 * 16 * PW];
+
-+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
++// QPU specific functions
+
-+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
-+ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
-+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
-+ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
++typedef struct HEVCRpiQpu {
++ uint32_t c_pxx;
++ uint32_t c_pxx_l1;
++ uint32_t c_bxx;
++ uint32_t y_pxx;
++ uint32_t y_bxx;
++ uint32_t y_p00;
++ uint32_t y_b00;
++} HEVCRpiQpu;
+
-+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth);
+
-+ st->last_l1 = &c->next_src;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else if (link == s->qpu.c_bxx) {
-+ const qpu_mc_pred_c_b_t *const c = &cmd->c.b;
-+ const int mx1 = fctom(c->coeffs_x1);
-+ const int my1 = fctom(c->coeffs_y1);
-+ const int mx2 = fctom(c->coeffs_x2);
-+ const int my2 = fctom(c->coeffs_y2);
++uint32_t qpu_fn(const int * const mc_fn);
+
-+ uint8_t patch_u1[PATCH_STRIDE * 72];
-+ uint8_t patch_v1[PATCH_STRIDE * 72];
-+ uint8_t patch_u2[PATCH_STRIDE * 72];
-+ uint8_t patch_v2[PATCH_STRIDE * 72];
-+ uint8_t patch_u3[8 * 16 * PW];
-+ uint8_t patch_v3[8 * 16 * PW];
-+ uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE];
-+ uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE];
++#define QPU_N_GRP 4
++#define QPU_N_MAX 12
+
-+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
-+ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
++#define QPU_MAIL_EL_VALS 2
+
-+ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
-+ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+ c->h, mx1, my1, c->w);
-+ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
-+ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+ c->h, mx1, my1, c->w);
++struct vq_wait_s;
++typedef struct vq_wait_s * vpu_qpu_wait_h;
+
-+ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
-+ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4,
-+ c->h, st->wdenom, c->weight_u1, wweight(c->wo_u2),
-+ 0, woff_b(s, c->wo_u2), mx2, my2, c->w);
-+ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
-+ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4,
-+ c->h, st->wdenom, c->weight_v1, wweight(c->wo_v2),
-+ 0, woff_b(s, c->wo_v2), mx2, my2, c->w);
++// VPU specific functions
+
-+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++struct vpu_qpu_job_env_s;
++typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h;
+
-+ st->last_l0 = &c->next_src1;
-+ st->last_l1 = &c->next_src2;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else if (link == q->code_sync) {
-+ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1);
-+ break;
-+ }
-+ else if (link == q->code_exit) {
-+ // We expect exit to occur without other sync
-+ av_assert0(i == exit_n);
-+ ++exit_n;
-+ break;
-+ }
-+ else {
-+ av_assert0(0);
-+ }
-+ }
++vpu_qpu_job_h vpu_qpu_job_new(void);
++void vpu_qpu_job_delete(const vpu_qpu_job_h vqj);
++void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code,
++ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);
++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail);
++void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h);
++int vpu_qpu_job_start(const vpu_qpu_job_h vqj);
++int vpu_qpu_job_finish(const vpu_qpu_job_h vqj);
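+
++// Example (illustrative only): a typical submit/wait round trip, where
++// "mail" is assumed to hold QPU_MAIL_EL_VALS control words per QPU job
++// (uniforms address then code address, both VC bus addresses).
++//
++//   vpu_qpu_wait_h sync;
++//   vpu_qpu_job_h vqj = vpu_qpu_job_new();
++//   vpu_qpu_job_add_qpu(vqj, n_jobs, mail);
++//   vpu_qpu_job_add_sync_this(vqj, &sync);
++//   if (vpu_qpu_job_finish(vqj) == 0)
++//     vpu_qpu_wait(&sync); // blocks until the job callback posts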
+
-+ st->qpu_mc_curr = cmd;
-+ }
-+ } while (exit_n == 0);
-+ }
-+}
++extern unsigned int vpu_get_fn(const unsigned int bit_depth);
++extern unsigned int vpu_get_constants(void);
+
-+#undef FUNC
-+#undef pixel
++// Waits for the previously posted job to complete and will NULL out *wait_h after use
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h);
++int vpu_qpu_init(void);
++void vpu_qpu_term(void);
++
++extern int gpu_get_mailbox(void);
++void gpu_ref(void);
++void gpu_unref(void);
+
++#endif
diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c
new file mode 100644
-index 0000000000..97d58abc0a
+index 0000000000..185288da5a
--- /dev/null
+++ b/libavcodec/rpi_zc.c
-@@ -0,0 +1,745 @@
-+#include "config.h"
-+#ifdef RPI
+@@ -0,0 +1,741 @@
+#include "libavcodec/avcodec.h"
+#include "rpi_qpu.h"
+#include "rpi_mailbox.h"
@@ -32156,8 +40432,6 @@ index 0000000000..97d58abc0a
+ }
+}
+
-+#endif // RPI
-+
diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h
new file mode 100644
index 0000000000..26fb3be999
@@ -32285,6 +40559,51 @@ index 13668c2105..bebf9024ec 100644
return 0;
}
+diff --git a/libavcodec/utils.c b/libavcodec/utils.c
+index 9551f312e7..a1f68b8e30 100644
+--- a/libavcodec/utils.c
++++ b/libavcodec/utils.c
+@@ -1277,6 +1277,40 @@ AVCodec *avcodec_find_decoder(enum AVCodecID id)
+ return find_encdec(id, 0);
+ }
+
++static int codec_supports_format(const AVCodec * const p, const enum AVPixelFormat fmt)
++{
++ const enum AVPixelFormat *pf = p->pix_fmts;
++
++ // Assume good if we lack info
++ if (pf == NULL)
++ return 1;
++ if (fmt == AV_PIX_FMT_NONE)
++ return 0;
++
++ for (; *pf != AV_PIX_FMT_NONE; ++pf) {
++ if (*pf == fmt)
++ return 1;
++ }
++ return 0;
++}
++
++AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt)
++{
++ AVCodec *p, *experimental = NULL;
++ p = first_avcodec;
++ id= remap_deprecated_codec_id(id);
++ while (p) {
++ if (av_codec_is_decoder(p) && p->id == id && codec_supports_format(p, fmt)) {
++ if (p->capabilities & AV_CODEC_CAP_EXPERIMENTAL && !experimental) {
++ experimental = p;
++ } else
++ return p;
++ }
++ p = p->next;
++ }
++ return experimental;
++}
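++
++// Example (illustrative only): prefer a decoder that can emit a given pixel
++// format natively, falling back to the generic lookup otherwise.
++//
++//   AVCodec *c = avcodec_find_decoder_by_id_and_fmt(AV_CODEC_ID_HEVC,
++//                                                   AV_PIX_FMT_SAND128);
++//   if (c == NULL)
++//     c = avcodec_find_decoder(AV_CODEC_ID_HEVC);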
++
+ AVCodec *avcodec_find_decoder_by_name(const char *name)
+ {
+ AVCodec *p;
diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c
index f0f849b326..cd97974748 100644
--- a/libavfilter/avfilter.c
@@ -32310,21 +40629,8 @@ index ad5aedd5f7..0d2df8b870 100644
frame->format);
break;
case AVMEDIA_TYPE_AUDIO:
-diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
-index 53cbcfb543..f93f06fcfb 100644
---- a/libavformat/mpegts.c
-+++ b/libavformat/mpegts.c
-@@ -701,7 +701,7 @@ static const StreamType ISO_types[] = {
- #endif
- { 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 },
- { 0x1c, AVMEDIA_TYPE_AUDIO, AV_CODEC_ID_AAC },
-- { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 },
-+ { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264_MVC },
- { 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000 },
- { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC },
- { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS },
diff --git a/libavformat/utils.c b/libavformat/utils.c
-index 1a7996c4fd..154942fe74 100644
+index 1a7996c4fd..271e70ed84 100644
--- a/libavformat/utils.c
+++ b/libavformat/utils.c
@@ -750,7 +750,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in
@@ -32336,27 +40642,111 @@ index 1a7996c4fd..154942fe74 100644
continue;
s->streams[i]->pts_wrap_reference = pts_wrap_reference;
s->streams[i]->pts_wrap_behavior = pts_wrap_behavior;
+@@ -2940,6 +2940,40 @@ static int has_codec_parameters(AVStream *st, const char **errmsg_ptr)
+ return 1;
+ }
+
++#if CONFIG_HEVC_RPI_DECODER && CONFIG_HEVC_DECODER
++// This should be quite general purpose but avoid possible conflicts
++// by limiting usage to cases where we know it works.
++static int try_fallback_decoder(AVCodecContext * const avctx, const AVCodec *const old_codec, AVDictionary ** const opts)
++{
++ // Only try fallback if we know it is supported (HEVC only)
++ const AVCodec *const new_codec = old_codec->id != AV_CODEC_ID_HEVC ? NULL :
++ avcodec_find_decoder_by_id_and_fmt(old_codec->id, AV_PIX_FMT_NONE);
++ int err;
++
++ // Failed to find fallback or we are already at the fallback
++ if (new_codec == NULL || new_codec == old_codec)
++ {
++ return AVERROR_DECODER_NOT_FOUND;
++ }
++
++ // * This may be dodgy - header says to not use this fn,
++ // especially if we are going to reopen the context...
++ // (but it does seem to work for our cases)
++ if (avcodec_is_open(avctx)) {
++ avcodec_close(avctx);
++ }
++
++ if ((err = avcodec_open2(avctx, new_codec, opts)) < 0)
++ {
++ return err;
++ }
++
++ return 0;
++}
++#else
++#define try_fallback_decoder(avctx, old_codec, opts) (AVERROR_DECODER_NOT_FOUND)
++#endif
++
+ /* returns 1 or 0 if or if not decoded data was returned, or a negative error */
+ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt,
+ AVDictionary **options)
+@@ -2974,7 +3008,11 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt,
+ av_dict_set(options ? options : &thread_opt, "threads", "1", 0);
+ if (s->codec_whitelist)
+ av_dict_set(options ? options : &thread_opt, "codec_whitelist", s->codec_whitelist, 0);
+- ret = avcodec_open2(avctx, codec, options ? options : &thread_opt);
++ if ((ret = avcodec_open2(avctx, codec, options ? options : &thread_opt)) == AVERROR_DECODER_NOT_FOUND)
++ {
++ // Try fallback if it looks worth a try
++ ret = try_fallback_decoder(avctx, codec, options ? options : &thread_opt);
++ }
+ if (!options)
+ av_dict_free(&thread_opt);
+ if (ret < 0) {
+@@ -3005,6 +3043,14 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt,
+ if (avctx->codec_type == AVMEDIA_TYPE_VIDEO ||
+ avctx->codec_type == AVMEDIA_TYPE_AUDIO) {
+ ret = avcodec_send_packet(avctx, &pkt);
++
++ // If we are going to want to fall back we should know here
++ if (ret == AVERROR_DECODER_NOT_FOUND) {
++ if ((ret = try_fallback_decoder(avctx, avctx->codec, options)) < 0)
++ break;
++ continue;
++ }
++
+ if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF)
+ break;
+ if (ret >= 0)
+@@ -3601,9 +3647,20 @@ FF_ENABLE_DEPRECATION_WARNINGS
+ // Try to just open decoders, in case this is enough to get parameters.
+ if (!has_codec_parameters(st, NULL) && st->request_probe <= 0) {
+ if (codec && !avctx->codec)
+- if (avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt) < 0)
+- av_log(ic, AV_LOG_WARNING,
+- "Failed to open codec in %s\n",__FUNCTION__);
++ {
++ int err;
++
++ if ((err = avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt)) < 0)
++ {
++ if (err == AVERROR_DECODER_NOT_FOUND) {
++ err = try_fallback_decoder(avctx, codec, options ? &options[i] : &thread_opt);
++ }
++ if (err < 0) {
++ av_log(ic, AV_LOG_WARNING,
++ "Failed to open codec in %s\n",__FUNCTION__);
++ }
++ }
++ }
+ }
+ if (!options)
+ av_dict_free(&thread_opt);
diff --git a/libavutil/Makefile b/libavutil/Makefile
-index 65e285a701..afb3effa2e 100644
+index 65e285a701..2ca778c59f 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
-@@ -62,6 +62,8 @@ HEADERS = adler32.h \
- rational.h \
- replaygain.h \
- ripemd.h \
-+ rpi_sand_fns.h \
-+ rpi_sand_fn_pw.h \
- samplefmt.h \
- sha.h \
- sha512.h \
-@@ -140,6 +142,7 @@ OBJS = adler32.o \
- reverse.o \
- rc4.o \
- ripemd.o \
-+ rpi_sand_fns.o \
- samplefmt.o \
- sha.o \
- sha512.o \
+@@ -165,6 +165,7 @@ OBJS-$(CONFIG_QSV) += hwcontext_qsv.o
+ OBJS-$(CONFIG_LIBDRM) += hwcontext_drm.o
+ OBJS-$(CONFIG_LZO) += lzo.o
+ OBJS-$(CONFIG_OPENCL) += opencl.o opencl_internal.o
++OBJS-$(CONFIG_RPI) += rpi_sand_fns.o
+ OBJS-$(CONFIG_VAAPI) += hwcontext_vaapi.o
+ OBJS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.o
+ OBJS-$(CONFIG_VDPAU) += hwcontext_vdpau.o
diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile
index 5da44b0542..b74b7c4e2f 100644
--- a/libavutil/arm/Makefile
@@ -32441,22 +40831,33 @@ index 73b6bd0b14..d907de3f1c 100644
* @}
*/
diff --git a/libavutil/frame.c b/libavutil/frame.c
-index d5fd2932e3..1851e3655f 100644
+index d5fd2932e3..151a33a24d 100644
--- a/libavutil/frame.c
+++ b/libavutil/frame.c
-@@ -25,6 +25,7 @@
+@@ -16,6 +16,8 @@
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
++#include "config.h"
++
+ #include "channel_layout.h"
+ #include "avassert.h"
+ #include "buffer.h"
+@@ -25,6 +27,9 @@
#include "imgutils.h"
#include "mem.h"
#include "samplefmt.h"
++#if CONFIG_RPI
+#include "rpi_sand_fns.h"
++#endif
static AVFrameSideData *frame_new_side_data(AVFrame *frame,
-@@ -833,6 +834,12 @@ int av_frame_apply_cropping(AVFrame *frame, int flags)
+@@ -833,6 +838,12 @@ int av_frame_apply_cropping(AVFrame *frame, int flags)
(frame->crop_top + frame->crop_bottom) >= frame->height)
return AVERROR(ERANGE);
-+#ifdef RPI
++#if CONFIG_RPI
+ // Sand cannot be cropped - do not try
+ if (av_rpi_is_sand_format(frame->format))
+ return 0;
@@ -32727,12 +41128,11 @@ index 0000000000..52d52a2a83
+
diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
new file mode 100644
-index 0000000000..ec4cfadf8a
+index 0000000000..b8bfad915e
--- /dev/null
+++ b/libavutil/rpi_sand_fns.c
-@@ -0,0 +1,99 @@
+@@ -0,0 +1,96 @@
+#include "config.h"
-+#ifdef RPI
+#include
+#include <stdint.h>
+#include <string.h>
@@ -32828,17 +41228,14 @@ index 0000000000..ec4cfadf8a
+ }
+}
+
-+#endif // RPI
-+
diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h
new file mode 100644
-index 0000000000..aa880d0f63
+index 0000000000..ebaa2b6d08
--- /dev/null
+++ b/libavutil/rpi_sand_fns.h
-@@ -0,0 +1,129 @@
+@@ -0,0 +1,131 @@
+#ifndef AVUTIL_RPI_SAND_FNS
+#define AVUTIL_RPI_SAND_FNS
-+#ifdef RPI
+
+#include "libavutil/frame.h"
+
@@ -32891,9 +41288,13 @@ index 0000000000..aa880d0f63
+
+static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame)
+{
-+ // * We could repl;ace thios with a fixed 128 whic would allow the compiler
-+ // to optimize a whole lot better
++#ifdef RPI_ZC_SAND128_ONLY
++ // If we are sure we only support 128 byte sand formats, replace the
++ // var with a constant, which should allow for better optimisation
++ return 128;
++#else
+ return frame->linesize[0];
++#endif
+}
+
+static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame)
@@ -32963,7 +41364,6 @@ index 0000000000..aa880d0f63
+}
+
+#endif
-+#endif
+
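For readers new to the sand layout: the header above only exposes strides and copy helpers, but the addressing rule they imply is simple. The sketch below assumes the conventional sand geometry for the 8-bit (SAND128) variant — vertical stripes stride1 bytes wide and stride2 rows tall, stored stripe after stripe — and is an illustration, not a function from this patch:

    // Sketch of sand ("column") addressing, assuming the conventional
    // layout: rows within a stripe are contiguous with pitch stride1, and
    // whole stripes of stride1*stride2 bytes follow one another in memory.
    #include <stdint.h>
    #include "libavutil/frame.h"
    #include "libavutil/rpi_sand_fns.h"

    static inline uint8_t *sand_pos8(const AVFrame *frame,
                                     unsigned int x, unsigned int y)
    {
        const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); // stripe width
        const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); // stripe height
        return frame->data[0]
            + (x / stride1) * stride1 * stride2  // skip whole stripes
            + y * stride1                        // row within the stripe
            + (x % stride1);                     // byte within the row
    }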
diff --git a/libswscale/input.c b/libswscale/input.c
index bb2f4933ec..de5a17bc7f 100644
@@ -32995,14 +41395,14 @@ index bb2f4933ec..de5a17bc7f 100644
if (c->chrSrcHSubSample) {
switch (srcFormat) {
diff --git a/libswscale/utils.c b/libswscale/utils.c
-index dcab707de6..403558db3c 100644
+index dcab707de6..5b24de889a 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -256,6 +256,10 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = {
[AV_PIX_FMT_P010BE] = { 1, 1 },
[AV_PIX_FMT_P016LE] = { 1, 0 },
[AV_PIX_FMT_P016BE] = { 1, 0 },
-+#ifdef RPI
++#if CONFIG_RPI
+ [AV_PIX_FMT_SAND128] = { 1, 0 },
+ [AV_PIX_FMT_SAND64_10] = { 1, 0 },
+#endif
@@ -33544,17 +41944,16 @@ index 0000000000..fc14f2a3c2
+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
diff --git a/pi-util/conf_pi1.sh b/pi-util/conf_pi1.sh
new file mode 100755
-index 0000000000..ec25b81c31
+index 0000000000..59c0d3959e
--- /dev/null
+++ b/pi-util/conf_pi1.sh
-@@ -0,0 +1,31 @@
+@@ -0,0 +1,30 @@
+echo "Configure for Pi1"
+
+RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
+RPI_OPT_VC=`pwd`/../firmware/opt/vc
+
+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
-+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1"
+RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib"
+#RPI_KEEPS="-save-temps=obj"
+RPI_KEEPS=""
@@ -33566,8 +41965,8 @@ index 0000000000..ec25b81c31
+ --target-os=linux\
+ --disable-stripping\
+ --enable-mmal\
-+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\
-+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\
++ --extra-cflags="-g $RPI_KEEPS $RPI_INCLUDES"\
++ --extra-cxxflags="$RPI_INCLUDES"\
+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\
+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
@@ -33581,18 +41980,18 @@ index 0000000000..ec25b81c31
+# -Wa,-ahls
diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh
new file mode 100755
-index 0000000000..f8e5e75375
+index 0000000000..4de256bc8a
--- /dev/null
+++ b/pi-util/conf_pi2.sh
-@@ -0,0 +1,30 @@
+@@ -0,0 +1,32 @@
+echo "Configure for Pi2/3"
+
+RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
+RPI_OPT_VC=`pwd`/../firmware/opt/vc
+
+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
-+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1"
+RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib"
++RPI_DEFINES="-D__VCCOREVER__=0x4000000"
+#RPI_KEEPS="-save-temps=obj"
+RPI_KEEPS=""
+
@@ -33603,12 +42002,14 @@ index 0000000000..f8e5e75375
+ --disable-stripping\
+ --disable-thumb\
+ --enable-mmal\
-+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\
-+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\
++ --enable-rpi\
++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\
++ --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\
+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\
+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
+
++# --enable-decoder=hevc_rpi\
+# --enable-extra-warnings\
+# --arch=armv71\
+# --enable-shared\
@@ -33617,10 +42018,10 @@ index 0000000000..f8e5e75375
+# -Wa,-ahls
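The switch from the old -DRPI=1 define to a configure option is what lets the rest of this patch use #if CONFIG_RPI guards and the OBJS-$(CONFIG_RPI) Makefile hook above. Assuming FFmpeg's usual configure behaviour, --enable-rpi should emit a 0/1 macro into config.h:

    /* Expected effect of --enable-rpi (FFmpeg configure convention:
     * CONFIG_* macros are always defined, as 0 or 1), so the guard
     * compiles cleanly either way, unlike the old #ifdef RPI: */
    #include "config.h"
    #if CONFIG_RPI
    /* RPi-specific code, compiled only when configured in */
    #endif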
diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py
new file mode 100755
-index 0000000000..70f7be22bb
+index 0000000000..e9556f0837
--- /dev/null
+++ b/pi-util/ffconf.py
-@@ -0,0 +1,174 @@
+@@ -0,0 +1,175 @@
+#!/usr/bin/env python
+
+import string
@@ -33634,7 +42035,7 @@ index 0000000000..70f7be22bb
+
+ffmpeg_exec = "./ffmpeg"
+
-+def testone(fileroot, srcname, es_file, md5_file):
++def testone(fileroot, srcname, es_file, md5_file, vcodec):
+ tmp_root = "/tmp"
+
+ names = srcname.split('/')
@@ -33656,7 +42057,7 @@ index 0000000000..70f7be22bb
+
+ # Unaligned needed for cropping conformance
+ rstr = subprocess.call(
-+ [ffmpeg_exec, "-flags", "unaligned", "-vcodec", "hevc", "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file],
++ [ffmpeg_exec, "-flags", "unaligned", "-vcodec", vcodec, "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file],
+ stdout=flog, stderr=subprocess.STDOUT)
+
+ try:
@@ -33720,7 +42121,7 @@ index 0000000000..70f7be22bb
+ return True
+ return False
+
-+def doconf(csva, tests, test_root):
++def doconf(csva, tests, test_root, vcodec):
+ unx_failures = []
+ unx_success = []
+ failures = 0
@@ -33732,7 +42133,7 @@ index 0000000000..70f7be22bb
+ print "==== ", name,
+ sys.stdout.flush()
+
-+ rv = testone(os.path.join(test_root, name), name, a[2], a[3])
++ rv = testone(os.path.join(test_root, name), name, a[2], a[3], vcodec=vcodec)
+ if (rv == 0):
+ successes += 1
+ else:
@@ -33783,6 +42184,7 @@ index 0000000000..70f7be22bb
+ argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test")
+ argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir")
+ argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename")
++ argp.add_argument("--vcodec", default="hevc_rpi", help="vcodec name to use")
+ args = argp.parse_args()
+
+ if args.csvgen:
@@ -33793,7 +42195,7 @@ index 0000000000..70f7be22bb
+ csva = [a for a in csv.reader(csvfile, ConfCSVDialect())]
+
+
-+ doconf(csva, args.tests, args.test_root)
++ doconf(csva, args.tests, args.test_root, args.vcodec)
+
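Going by the defaults visible above, a typical conformance run after building would look like:

    pi-util/ffconf.py --test_root /opt/conform/h265.2016 --vcodec hevc_rpi

which decodes every stream listed in pi-util/conf_h265.2016.csv to an MD5 with the chosen decoder, compares it against the reference checksum, and reports unexpected failures and unexpected successes separately.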
diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py
new file mode 100755
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch
new file mode 100644
index 00000000000..1d1fd1690ea
--- /dev/null
+++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch
@@ -0,0 +1,283 @@
+From 8f170986cda0695f28eb2cd4e863aaae0e14d19f Mon Sep 17 00:00:00 2001
+From: Hendrik Leppkes
+Date: Sat, 9 Jan 2016 16:34:09 +0100
+Subject: [PATCH 1/4] avcodec: add h264_mvc codec id and profiles
+
+---
+ libavcodec/avcodec.h | 3 +++
+ libavcodec/codec_desc.c | 7 +++++++
+ libavcodec/profiles.c | 1 +
+ libavformat/mpegts.c | 2 +-
+ 4 files changed, 12 insertions(+), 1 deletion(-)
+
+diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
+index 6c4b011b5c..8f1f5a3e53 100644
+--- a/libavcodec/avcodec.h
++++ b/libavcodec/avcodec.h
+@@ -449,6 +449,8 @@ enum AVCodecID {
+ AV_CODEC_ID_GDV,
+ AV_CODEC_ID_FITS,
+
++ AV_CODEC_ID_H264_MVC,
++
+ /* various PCM "codecs" */
+ AV_CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs
+ AV_CODEC_ID_PCM_S16LE = 0x10000,
+@@ -3318,6 +3320,7 @@ typedef struct AVCodecContext {
+ #define FF_PROFILE_H264_HIGH_444_PREDICTIVE 244
+ #define FF_PROFILE_H264_HIGH_444_INTRA (244|FF_PROFILE_H264_INTRA)
+ #define FF_PROFILE_H264_CAVLC_444 44
++#define FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH 138
+
+ #define FF_PROFILE_VC1_SIMPLE 0
+ #define FF_PROFILE_VC1_MAIN 1
+diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
+index 478b7c0ffc..ff10f3b2bc 100644
+--- a/libavcodec/codec_desc.c
++++ b/libavcodec/codec_desc.c
+@@ -1700,6 +1700,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
+ .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ },
++ {
++ .id = AV_CODEC_ID_H264_MVC,
++ .type = AVMEDIA_TYPE_VIDEO,
++ .name = "h264_mvc",
++ .long_name = NULL_IF_CONFIG_SMALL("H264 MVC"),
++ .props = AV_CODEC_PROP_LOSSY,
++ },
+
+ /* various PCM "codecs" */
+ {
+diff --git a/libavcodec/profiles.c b/libavcodec/profiles.c
+index 30498efedf..9d3cf4b535 100644
+--- a/libavcodec/profiles.c
++++ b/libavcodec/profiles.c
+@@ -72,6 +72,7 @@ const AVProfile ff_h264_profiles[] = {
+ { FF_PROFILE_H264_CAVLC_444, "CAVLC 4:4:4" },
+ { FF_PROFILE_H264_MULTIVIEW_HIGH, "Multiview High" },
+ { FF_PROFILE_H264_STEREO_HIGH, "Stereo High" },
++ { FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH, "Multiview High Depth" },
+ { FF_PROFILE_UNKNOWN },
+ };
+
+diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
+index 53cbcfb543..f93f06fcfb 100644
+--- a/libavformat/mpegts.c
++++ b/libavformat/mpegts.c
+@@ -701,7 +701,7 @@ static const StreamType ISO_types[] = {
+ #endif
+ { 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 },
+ { 0x1c, AVMEDIA_TYPE_AUDIO, AV_CODEC_ID_AAC },
+- { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 },
++ { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264_MVC },
+ { 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000 },
+ { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC },
+ { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS },
+--
+2.14.1
+
+
+From 00de72f97e8f69f5d4c614bff956ec726f97fa2e Mon Sep 17 00:00:00 2001
+From: Hendrik Leppkes
+Date: Sat, 9 Jan 2016 16:34:40 +0100
+Subject: [PATCH 2/4] h264_parser: add support for parsing h264 mvc NALUs
+
+---
+ libavcodec/allcodecs.c | 1 +
+ libavcodec/h264.h | 2 ++
+ libavcodec/h264_parser.c | 34 ++++++++++++++++++++++++++++++----
+ 3 files changed, 33 insertions(+), 4 deletions(-)
+
+diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
+index 5361a22141..a5289a5e14 100644
+--- a/libavcodec/allcodecs.c
++++ b/libavcodec/allcodecs.c
+@@ -732,6 +732,7 @@ static void register_all(void)
+ REGISTER_PARSER(H261, h261);
+ REGISTER_PARSER(H263, h263);
+ REGISTER_PARSER(H264, h264);
++ REGISTER_PARSER(H264_MVC, h264_mvc);
+ REGISTER_PARSER(HEVC, hevc);
+ REGISTER_PARSER(MJPEG, mjpeg);
+ REGISTER_PARSER(MLP, mlp);
+diff --git a/libavcodec/h264.h b/libavcodec/h264.h
+index 86df5eb9b3..22c4f1d82a 100644
+--- a/libavcodec/h264.h
++++ b/libavcodec/h264.h
+@@ -41,7 +41,9 @@ enum {
+ H264_NAL_END_STREAM = 11,
+ H264_NAL_FILLER_DATA = 12,
+ H264_NAL_SPS_EXT = 13,
++ H264_NAL_SPS_SUBSET = 15,
+ H264_NAL_AUXILIARY_SLICE = 19,
++ H264_NAL_SLICE_EXT = 20,
+ };
+
+ #endif /* AVCODEC_H264_H */
+diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
+index 053325c26b..855c74896e 100644
+--- a/libavcodec/h264_parser.c
++++ b/libavcodec/h264_parser.c
+@@ -62,6 +62,7 @@ typedef struct H264ParseContext {
+ int parse_last_mb;
+ int64_t reference_dts;
+ int last_frame_num, last_picture_structure;
++ int is_mvc;
+ } H264ParseContext;
+
+
+@@ -109,14 +110,18 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
+ } else if (state <= 5) {
+ int nalu_type = buf[i] & 0x1F;
+ if (nalu_type == H264_NAL_SEI || nalu_type == H264_NAL_SPS ||
+- nalu_type == H264_NAL_PPS || nalu_type == H264_NAL_AUD) {
++ nalu_type == H264_NAL_PPS || nalu_type == H264_NAL_AUD ||
++ nalu_type == H264_NAL_SPS_SUBSET) {
+ if (pc->frame_start_found) {
+ i++;
+ goto found;
+ }
+ } else if (nalu_type == H264_NAL_SLICE || nalu_type == H264_NAL_DPA ||
+- nalu_type == H264_NAL_IDR_SLICE) {
++ nalu_type == H264_NAL_IDR_SLICE || (p->is_mvc && nalu_type == H264_NAL_SLICE_EXT)) {
+ state += 8;
++
++ if (nalu_type == H264_NAL_SLICE_EXT)
++ i += 3; // skip mvc extension
+ continue;
+ }
+ state = 7;
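The i += 3 above skips the MVC extension that follows the 1-byte NAL header of a slice extension (type 20). For reference, per H.264 Annex H those three bytes pack the fields below; this is a spec-based sketch, not code from the patch:

    // Layout of nal_unit_header_mvc_extension (H.264 Annex H): the three
    // bytes skipped above. b points just past the 1-byte NAL header.
    #include <stdint.h>

    typedef struct MvcNalExt {
        uint8_t  non_idr_flag;     //  1 bit
        uint8_t  priority_id;      //  6 bits
        uint16_t view_id;          // 10 bits
        uint8_t  temporal_id;      //  3 bits
        uint8_t  anchor_pic_flag;  //  1 bit
        uint8_t  inter_view_flag;  //  1 bit
    } MvcNalExt;

    static MvcNalExt parse_mvc_ext(const uint8_t b[3])
    {
        MvcNalExt e;
        // b[0] = svc_extension_flag(1) non_idr_flag(1) priority_id(6)
        e.non_idr_flag    = (b[0] >> 6) & 1;
        e.priority_id     =  b[0] & 0x3F;
        // b[1],b[2] = view_id(10) temporal_id(3) anchor(1) inter_view(1) reserved(1)
        e.view_id         = (uint16_t)((b[1] << 2) | (b[2] >> 6));
        e.temporal_id     = (b[2] >> 3) & 7;
        e.anchor_pic_flag = (b[2] >> 2) & 1;
        e.inter_view_flag = (b[2] >> 1) & 1;
        return e;
    }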
+@@ -594,7 +599,8 @@ static int h264_parse(AVCodecParserContext *s,
+ }
+ }
+
+- parse_nal_units(s, avctx, buf, buf_size);
++ if (!p->is_mvc)
++ parse_nal_units(s, avctx, buf, buf_size);
+
+ if (avctx->framerate.num)
+ avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
+@@ -651,7 +657,7 @@ static int h264_split(AVCodecContext *avctx,
+ if ((state & 0xFFFFFF00) != 0x100)
+ break;
+ nalu_type = state & 0x1F;
+- if (nalu_type == H264_NAL_SPS) {
++ if (nalu_type == H264_NAL_SPS || nalu_type == H264_NAL_SPS_SUBSET) {
+ has_sps = 1;
+ } else if (nalu_type == H264_NAL_PPS)
+ has_pps = 1;
+@@ -703,3 +709,23 @@ AVCodecParser ff_h264_parser = {
+ .parser_close = h264_close,
+ .split = h264_split,
+ };
++
++static av_cold int init_mvc(AVCodecParserContext *s)
++{
++ H264ParseContext *p = s->priv_data;
++ int ret = init(s);
++ if (ret < 0)
++ return ret;
++
++ p->is_mvc = 1;
++ return 0;
++}
++
++AVCodecParser ff_h264_mvc_parser = {
++ .codec_ids = { AV_CODEC_ID_H264_MVC },
++ .priv_data_size = sizeof(H264ParseContext),
++ .parser_init = init_mvc,
++ .parser_parse = h264_parse,
++ .parser_close = h264_close,
++ .split = h264_split,
++};
+--
+2.14.1
+
+
+From bbf5daa149ccc2c462be1bd5f6f710eba0e82094 Mon Sep 17 00:00:00 2001
+From: Hendrik Leppkes
+Date: Tue, 28 Nov 2017 16:12:12 +0000
+Subject: [PATCH 3/4] h264_parser: force grabbing a new timestamp until a frame
+ start was found
+
+---
+ libavcodec/h264_parser.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
+index 855c74896e..90a99a19a8 100644
+--- a/libavcodec/h264_parser.c
++++ b/libavcodec/h264_parser.c
+@@ -587,6 +587,9 @@ static int h264_parse(AVCodecParserContext *s,
+ } else {
+ next = h264_find_frame_end(p, buf, buf_size, avctx);
+
++ if (next == END_NOT_FOUND && pc->frame_start_found == 0)
++ s->fetch_timestamp = 1;
++
+ if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
+ *poutbuf = NULL;
+ *poutbuf_size = 0;
+--
+2.14.1
+
+
+From 3a0ebb0f7473a9a5ab93e01f7261862a3d324e50 Mon Sep 17 00:00:00 2001
+From: popcornmix
+Date: Tue, 28 Nov 2017 18:32:08 +0000
+Subject: [PATCH 4/4] extract_extradata_bsf: Support H264_MVC
+
+---
+ libavcodec/extract_extradata_bsf.c | 8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+diff --git a/libavcodec/extract_extradata_bsf.c b/libavcodec/extract_extradata_bsf.c
+index ed6509c681..188e62a42d 100644
+--- a/libavcodec/extract_extradata_bsf.c
++++ b/libavcodec/extract_extradata_bsf.c
+@@ -56,7 +56,7 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt,
+ HEVC_NAL_VPS, HEVC_NAL_SPS, HEVC_NAL_PPS,
+ };
+ static const int extradata_nal_types_h264[] = {
+- H264_NAL_SPS, H264_NAL_PPS,
++ H264_NAL_SPS, H264_NAL_SPS_SUBSET, H264_NAL_PPS,
+ };
+
+ ExtractExtradataContext *s = ctx->priv_data;
+@@ -88,14 +88,14 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt,
+ if (nal->type == HEVC_NAL_SPS) has_sps = 1;
+ if (nal->type == HEVC_NAL_VPS) has_vps = 1;
+ } else {
+- if (nal->type == H264_NAL_SPS) has_sps = 1;
++ if (nal->type == H264_NAL_SPS || nal->type == H264_NAL_SPS_SUBSET) has_sps = 1;
+ }
+ }
+ }
+
+ if (extradata_size &&
+ ((ctx->par_in->codec_id == AV_CODEC_ID_HEVC && has_sps && has_vps) ||
+- (ctx->par_in->codec_id == AV_CODEC_ID_H264 && has_sps))) {
++ ((ctx->par_in->codec_id == AV_CODEC_ID_H264 || ctx->par_in->codec_id == AV_CODEC_ID_H264_MVC) && has_sps))) {
+ AVBufferRef *filtered_buf;
+ uint8_t *extradata, *filtered_data;
+
+@@ -247,6 +247,7 @@ static const struct {
+ } extract_tab[] = {
+ { AV_CODEC_ID_CAVS, extract_extradata_mpeg4 },
+ { AV_CODEC_ID_H264, extract_extradata_h2645 },
++ { AV_CODEC_ID_H264_MVC, extract_extradata_h2645 },
+ { AV_CODEC_ID_HEVC, extract_extradata_h2645 },
+ { AV_CODEC_ID_MPEG1VIDEO, extract_extradata_mpeg12 },
+ { AV_CODEC_ID_MPEG2VIDEO, extract_extradata_mpeg12 },
+@@ -306,6 +307,7 @@ fail:
+ static const enum AVCodecID codec_ids[] = {
+ AV_CODEC_ID_CAVS,
+ AV_CODEC_ID_H264,
++ AV_CODEC_ID_H264_MVC,
+ AV_CODEC_ID_HEVC,
+ AV_CODEC_ID_MPEG1VIDEO,
+ AV_CODEC_ID_MPEG2VIDEO,
+--
+2.14.1
+
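With the extract_tab and codec_ids entries above in place, h264_mvc packets can be run through the same bitstream filter as H.264/HEVC. A hedged usage sketch of the public av_bsf API of this era follows; the function name get_extradata and the error handling are illustrative, not from the patch:

    // Sketch: run one packet through extract_extradata; after this patch
    // the BSF accepts AV_CODEC_ID_H264_MVC and counts SPS_SUBSET as an SPS.
    #include "libavcodec/avcodec.h"

    static int get_extradata(const AVCodecParameters *par, AVPacket *pkt)
    {
        const AVBitStreamFilter *f = av_bsf_get_by_name("extract_extradata");
        AVBSFContext *bsf = NULL;
        int ret;

        if (!f)
            return AVERROR_BSF_NOT_FOUND;
        if ((ret = av_bsf_alloc(f, &bsf)) < 0)
            return ret;
        if ((ret = avcodec_parameters_copy(bsf->par_in, par)) < 0 ||
            (ret = av_bsf_init(bsf)) < 0 ||
            (ret = av_bsf_send_packet(bsf, pkt)) < 0 ||
            (ret = av_bsf_receive_packet(bsf, pkt)) < 0)
            goto done;
        // Extracted parameter sets arrive as AV_PKT_DATA_NEW_EXTRADATA
        // side data on the returned packet.
    done:
        av_bsf_free(&bsf);
        return ret;
    }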