-+
-+#include "rpi_mailbox.h"
-+#include "rpi_qpu.h"
-+#include "rpi_shader.h"
-+#include "rpi_hevc_transform8.h"
-+#include "rpi_hevc_transform10.h"
-+#include "libavutil/rpi_sand_fns.h"
-+
-+#pragma GCC diagnostic push
-+// Many many redundant decls in the header files
-+#pragma GCC diagnostic ignored "-Wredundant-decls"
-+#include "interface/vmcs_host/vc_vchi_gpuserv.h"
-+#pragma GCC diagnostic pop
-+
-+// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No)
-+#define RPI_TRACE_TIME_VPU_QPU_WAIT 0
-+
-+// Add profile flags to all QPU requests - generates output in "vcdbg log msg"
-+// Beware this is expensive and will probably throw off all other timing by >10%
-+#define RPI_TRACE_QPU_PROFILE_ALL 0
-+
-+// QPU "noflush" flags
-+// a mixture of flushing & profiling
-+
-+#define QPU_FLAGS_NO_FLUSH_VPU 1 // If unset VPU cache will be flushed
-+#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2 // Clear & Enable detailed QPU profiling registers
-+#define QPU_FLAGS_PROF_OUTPUT_COUNTS 4 // Print the results
-+#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independent of the profiling
-+#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed)
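-+// e.g. the profiling path below combines
-+// QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS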
-+
-+#define vcos_verify_ge0(x) ((x)>=0)
-+
-+// Sizes in 32-bit words
-+#define QPU_CODE_SIZE 4098
-+#define VPU_CODE_SIZE 2048
-+
-+static const short rpi_transMatrix2even[32][16] = { // Even rows first
-+{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
-+{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90},
-+{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89},
-+{87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87},
-+{83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83},
-+{80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80},
-+{75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75},
-+{70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70},
-+{64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64},
-+{57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57},
-+{50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50},
-+{43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43},
-+{36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36},
-+{25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25},
-+{18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18},
-+{ 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9},
-+// Odd rows
-+{90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4},
-+{90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
-+{88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22},
-+{85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31},
-+{82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38},
-+{78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46},
-+{73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54},
-+{67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61},
-+{61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67},
-+{54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73},
-+{46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78},
-+{38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82},
-+{31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85},
-+{22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88},
-+{13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90},
-+{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90}
-+};
-+
-+// Code/constants on GPU
-+struct GPU
-+{
-+ unsigned int qpu_code[QPU_CODE_SIZE];
-+ unsigned int vpu_code8[VPU_CODE_SIZE];
-+ unsigned int vpu_code10[VPU_CODE_SIZE];
-+ short transMatrix2even[16*16*2];
-+};
-+
-+#define CFE_ENTS_PER_A 8
-+// If we have a sliced frame 2048 wide @ 64 per slice then there are 32 slices
-+// in a line & we want to flush luma + chroma + a couple of bits so ents ~ 70
-+// allow 128
-+#define CFE_ENT_COUNT 128
-+#define CFE_A_COUNT (CFE_ENT_COUNT / CFE_ENTS_PER_A)
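-+// (i.e. 32 slices * 2 planes = 64 ents, plus a little spare ~ 70, rounded up to 128)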
-+
-+struct rpi_cache_flush_env_s {
-+// unsigned int n;
-+// struct vcsm_user_clean_invalid_s a[CFE_A_COUNT];
-+ struct vcsm_user_clean_invalid2_s v;
-+};
-+
-+#define WAIT_COUNT_MAX 16
-+
-+typedef struct trace_time_one_s
-+{
-+ int count;
-+ int64_t start[WAIT_COUNT_MAX];
-+ int64_t total[WAIT_COUNT_MAX];
-+} trace_time_one_t;
-+
-+typedef struct trace_time_wait_s
-+{
-+ unsigned int jcount;
-+ int64_t start0;
-+ int64_t last_update;
-+ trace_time_one_t active;
-+ trace_time_one_t wait;
-+} trace_time_wait_t;
-+
-+typedef struct vq_wait_s
-+{
-+ sem_t sem;
-+ struct vq_wait_s * next;
-+} vq_wait_t;
-+
-+#define VQ_WAIT_POOL_SIZE 16
-+typedef struct vq_wait_pool_s
-+{
-+ vq_wait_t * head;
-+ vq_wait_t pool[VQ_WAIT_POOL_SIZE];
-+} vq_wait_pool_t;
-+
-+static void vq_wait_pool_init(vq_wait_pool_t * const pool);
-+static void vq_wait_pool_deinit(vq_wait_pool_t * const pool);
-+
-+typedef struct gpu_env_s
-+{
-+ int open_count;
-+ int init_count;
-+ int mb;
-+ int vpu_i_cache_flushed;
-+ GPU_MEM_PTR_T code_gm_ptr;
-+ vq_wait_pool_t wait_pool;
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+ trace_time_wait_t ttw;
-+#endif
-+} gpu_env_t;
-+
-+// Stop more than one thread trying to allocate memory or use the processing resources at once
-+static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
-+static gpu_env_t * gpu = NULL;
-+
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+
-+static int64_t ns_time(void)
-+{
-+ struct timespec ts;
-+ clock_gettime(CLOCK_MONOTONIC, &ts);
-+ return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec;
-+}
-+
-+
-+#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000
-+
-+#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U)
-+#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000)
-+#define T_ARG(t) T_SEC(t), T_MS(t)
-+#define T_FMT "%u.%03u"
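-+// e.g. t = 2345678901ns formats as "2.345" (seconds.milliseconds)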
-+
-+static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix)
-+{
-+ // Update totals for levels that are still pending
-+ for (int i = 0; i < tto->count; ++i) {
-+ tto->total[i] += now - tto->start[i];
-+ tto->start[i] = now;
-+ }
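-+ // total[0] holds time with at least one job outstanding, total[1..3] time
-+ // with 2..4 outstanding, so idle time is the elapsed time minus total[0]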
-+
-+ printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n",
-+ prefix,
-+ T_ARG(now - start0 - tto->total[0]),
-+ T_ARG(tto->total[0]),
-+ T_ARG(tto->total[1]),
-+ T_ARG(tto->total[2]),
-+ T_ARG(tto->total[3]));
-+}
-+
-+
-+static void tto_start(trace_time_one_t * const tto, const int64_t now)
-+{
-+ av_assert0(tto->count < WAIT_COUNT_MAX);
-+ tto->start[tto->count++] = now;
-+}
-+
-+static void tto_end(trace_time_one_t * const tto, const int64_t now)
-+{
-+ const int n = --tto->count;
-+ av_assert0(n >= 0);
-+ tto->total[n] += now - tto->start[n];
-+}
-+
-+static void ttw_print(trace_time_wait_t * const ttw, const int64_t now)
-+{
-+ printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0));
-+ tto_print(&ttw->active, now, ttw->start0, "Active");
-+ tto_print(&ttw->wait, now, ttw->start0, " Wait");
-+}
-+
-+#endif
-+
-+// GPU memory alloc fns (internal)
-+
-+// GPU_MEM_PTR_T alloc fns
-+static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
-+ p->numbytes = (numbytes + 255) & ~255; // Round up
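-+ // (e.g. numbytes = 1000 rounds to 1024, the next 256-byte multiple)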
-+ p->vcsm_handle = vcsm_malloc_cache(p->numbytes, VCSM_CACHE_TYPE_HOST | 0x80, (char *)"Video Frame" );
-+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
-+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
-+ av_assert0(p->vcsm_handle);
-+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-+ av_assert0(p->vc_handle);
-+ p->arm = vcsm_lock(p->vcsm_handle);
-+ av_assert0(p->arm);
-+ p->vc = mbox_mem_lock(mb, p->vc_handle);
-+ av_assert0(p->vc);
-+// printf("***** %s, %d\n", __func__, numbytes);
-+
-+ return 0;
-+}
-+
-+static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
-+ p->numbytes = numbytes;
-+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE | 0x80, (char *)"Video Frame" );
-+ av_assert0(p->vcsm_handle);
-+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-+ av_assert0(p->vc_handle);
-+ p->arm = vcsm_lock(p->vcsm_handle);
-+ av_assert0(p->arm);
-+ p->vc = mbox_mem_lock(mb, p->vc_handle);
-+ av_assert0(p->vc);
-+// printf("***** %s, %d\n", __func__, numbytes);
-+ return 0;
-+}
-+
-+static void gpu_free_internal(const int mb, GPU_MEM_PTR_T * const p) {
-+ mbox_mem_unlock(mb, p->vc_handle);
-+ vcsm_unlock_ptr(p->arm);
-+ vcsm_free(p->vcsm_handle);
-+ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again
-+// printf("***** %s\n", __func__);
-+}
-+
-+
-+// GPU init, free, lock, unlock
-+
-+static void gpu_term(void)
-+{
-+ gpu_env_t * const ge = gpu;
-+
-+ // We have to hope that everything has terminated...
-+ gpu = NULL;
-+
-+ vc_gpuserv_deinit();
-+
-+ gpu_free_internal(ge->mb, &ge->code_gm_ptr);
-+
-+ vcsm_exit();
-+
-+ mbox_close(ge->mb);
-+
-+ vq_wait_pool_deinit(&ge->wait_pool);
-+
-+ free(ge);
-+}
-+
-+
-+// Connect to QPU, returns 0 on success.
-+static int gpu_init(gpu_env_t ** const gpu) {
-+ volatile struct GPU* ptr;
-+ gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t));
-+ *gpu = NULL;
-+
-+ if (ge == NULL)
-+ return -1;
-+
-+ if ((ge->mb = mbox_open()) < 0) {
-+ free(ge);
-+ return -1;
-+ }
-+
-+ vq_wait_pool_init(&ge->wait_pool);
-+
-+ vcsm_init();
-+
-+ gpu_malloc_uncached_internal(ge->mb, sizeof(struct GPU), &ge->code_gm_ptr);
-+ ptr = (volatile struct GPU*)ge->code_gm_ptr.arm;
-+
-+ // Zero everything so we have zeros between the code bits
-+ memset((void *)ptr, 0, sizeof(*ptr));
-+
-+ // Now copy over the QPU code into GPU memory
-+ {
-+ int num_bytes = (char *)mc_end - (char *)rpi_shader;
-+ av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
-+ memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
-+ }
-+ // And the VPU code
-+ {
-+ int num_bytes = sizeof(rpi_hevc_transform8);
-+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
-+ memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes);
-+ }
-+ {
-+ int num_bytes = sizeof(rpi_hevc_transform10);
-+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
-+ memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes);
-+ }
-+ // And the transform coefficients
-+ memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
-+
-+ *gpu = ge;
-+ return 0;
-+}
-+
-+
-+
-+static void gpu_unlock(void) {
-+ pthread_mutex_unlock(&gpu_mutex);
-+}
-+
-+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
-+static gpu_env_t * gpu_lock(void) {
-+ pthread_mutex_lock(&gpu_mutex);
-+
-+ av_assert0(gpu != NULL);
-+ return gpu;
-+}
-+
-+static gpu_env_t * gpu_lock_ref(void)
-+{
-+ pthread_mutex_lock(&gpu_mutex);
-+
-+ if (gpu == NULL) {
-+ int rv = gpu_init(&gpu);
-+ if (rv != 0) {
-+ gpu_unlock();
-+ return NULL;
-+ }
-+ }
-+
-+ ++gpu->open_count;
-+ return gpu;
-+}
-+
-+static void gpu_unlock_unref(gpu_env_t * const ge)
-+{
-+ if (--ge->open_count == 0)
-+ gpu_term();
-+
-+ gpu_unlock();
-+}
-+
-+static inline gpu_env_t * gpu_ptr(void)
-+{
-+ av_assert0(gpu != NULL);
-+ return gpu;
-+}
-+
-+// Public gpu fns
-+
-+// Allocate memory on GPU
-+// Fills in structure containing ARM pointer, videocore handle, videocore memory address, numbytes
-+// Returns 0 on success.
-+// This allocates memory that will not be cached in ARM's data cache.
-+// Therefore safe to use without data cache flushing.
-+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
-+{
-+ int r;
-+ gpu_env_t * const ge = gpu_lock_ref();
-+ if (ge == NULL)
-+ return -1;
-+ r = gpu_malloc_uncached_internal(ge->mb, numbytes, p);
-+ gpu_unlock();
-+ return r;
-+}
-+
-+// This allocates data that will be
-+// Cached in ARM L2
-+// Uncached in VPU L2
-+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
-+{
-+ int r;
-+ gpu_env_t * const ge = gpu_lock_ref();
-+ if (ge == NULL)
-+ return -1;
-+ r = gpu_malloc_cached_internal(ge->mb, numbytes, p);
-+ gpu_unlock();
-+ return r;
-+}
-+
-+void gpu_free(GPU_MEM_PTR_T * const p) {
-+ gpu_env_t * const ge = gpu_lock();
-+ gpu_free_internal(ge->mb, p);
-+ gpu_unlock_unref(ge);
-+}
-+
-+unsigned int vpu_get_fn(const unsigned int bit_depth) {
-+ // Make sure that the gpu is initialized
-+ av_assert0(gpu != NULL);
-+ switch (bit_depth){
-+ case 8:
-+ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8);
-+ case 10:
-+ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10);
-+ default:
-+ av_assert0(0);
-+ }
-+ return 0;
-+}
-+
-+unsigned int vpu_get_constants(void) {
-+ av_assert0(gpu != NULL);
-+ return gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even);
-+}
-+
-+int gpu_get_mailbox(void)
-+{
-+ av_assert0(gpu);
-+ return gpu->mb;
-+}
-+
-+void gpu_ref(void)
-+{
-+ gpu_lock_ref();
-+ gpu_unlock();
-+}
-+
-+void gpu_unref(void)
-+{
-+ gpu_env_t * const ge = gpu_lock();
-+ gpu_unlock_unref(ge);
-+}
-+
-+// ----------------------------------------------------------------------------
-+//
-+// Cache flush functions
-+
-+#define CACHE_EL_MAX 16
-+
-+rpi_cache_flush_env_t * rpi_cache_flush_init(void)
-+{
-+ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t) +
-+ sizeof(struct vcsm_user_clean_invalid2_block_s) * CACHE_EL_MAX);
-+ if (rfe == NULL)
-+ return NULL;
-+
-+ rfe->v.op_count = 0;
-+ return rfe;
-+}
-+
-+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe)
-+{
-+ if (rfe != NULL)
-+ free(rfe);
-+}
-+
-+int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe)
-+{
-+ int rc = 0;
-+ if (rfe->v.op_count != 0) {
-+ if (vcsm_clean_invalid2(&rfe->v) != 0)
-+ {
-+ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid2 failed: errno=%d\n", errno);
-+ rc = -1;
-+ }
-+ rfe->v.op_count = 0;
-+ }
-+ return rc;
-+}
-+
-+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe)
-+{
-+ int rc = rpi_cache_flush_execute(rfe);
-+
-+ free(rfe);
-+ return rc;
-+}
-+
-+inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
-+ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride)
-+{
-+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
-+
-+ av_assert0(rfe->v.op_count <= CACHE_EL_MAX);
-+
-+ b->invalidate_mode = mode;
-+ b->block_count = blocks;
-+ b->start_address = gm->arm + offset0;
-+ b->block_size = block_size;
-+ b->inter_block_stride = block_stride;
-+}
-+
-+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
-+ const unsigned int offset, const unsigned int size)
-+{
-+ // Deal with empty pointer trivially
-+ if (gm == NULL || size == 0)
-+ return;
-+
-+ av_assert0(offset <= gm->numbytes);
-+ av_assert0(size <= gm->numbytes);
-+ av_assert0(offset + size <= gm->numbytes);
-+
-+ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0);
-+}
-+
-+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode)
-+{
-+ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0);
-+}
-+
-+
-+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode)
-+{
-+#if !RPI_ONE_BUF
-+#error Fixme! (NIF)
-+#endif
-+ if (gpu_is_buf1(frame)) {
-+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode);
-+ }
-+ else
-+ {
-+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode);
-+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode);
-+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode);
-+ }
-+}
-+
-+// Flush an area of a frame
-+// Width, height, x0, y0 in luma pels
-+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode,
-+ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
-+ const unsigned int uv_shift, const int do_luma, const int do_chroma)
-+{
-+ const unsigned int y_offset = frame->linesize[0] * y0;
-+ const unsigned int y_size = frame->linesize[0] * height;
-+ // Round UV up/down to get everything
-+ const unsigned int uv_rnd = (1U << uv_shift) >> 1;
-+ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift);
-+ const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset;
-+
-+#if 0
-+ // *** frame->height is cropped height so not good
-+ // As all unsigned they will also reject -ve
-+ // Test individually as well as added to reject overflow
-+ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped
-+ av_assert0(n <= (unsigned int)frame->height);
-+ av_assert0(start_line + n <= (unsigned int)frame->height);
-+#endif
-+
-+ if (!gpu_is_buf1(frame))
-+ {
-+ if (do_luma) {
-+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size);
-+ }
-+ if (do_chroma) {
-+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size);
-+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size);
-+ }
-+ }
-+ else if (!av_rpi_is_sand_frame(frame))
-+ {
-+ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
-+ if (do_luma) {
-+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size);
-+ }
-+ if (do_chroma) {
-+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size);
-+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size);
-+ }
-+ }
-+ else
-+ {
-+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
-+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
-+ const unsigned int xshl = av_rpi_sand_frame_xshl(frame);
-+ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1);
-+ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C
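-+ // e.g. with stride1 = 128, xshl = 0, x0 = 70, width = 100: xleft = 0 and
-+ // block_count = (170 + 127) / 128 = 2 slices to flush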
-+ av_assert0(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX);
-+
-+ if (do_chroma)
-+ {
-+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
-+ b->invalidate_mode = mode;
-+ b->block_count = block_count;
-+ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1);
-+ b->block_size = uv_size;
-+ b->inter_block_stride = stride1 * stride2;
-+ }
-+ if (do_luma)
-+ {
-+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
-+ b->invalidate_mode = mode;
-+ b->block_count = block_count;
-+ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0);
-+ b->block_size = y_size;
-+ b->inter_block_stride = stride1 * stride2;
-+ }
-+ }
-+}
-+
-+// Call this to clean and invalidate a region of memory
-+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode)
-+{
-+ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init();
-+ if (rfe == NULL)
-+ return;
-+ rpi_cache_flush_add_gm_ptr(rfe, p, mode);
-+ rpi_cache_flush_finish(rfe);
-+}
-+
-+
-+// ----------------------------------------------------------------------------
-+
-+
-+// Wait abstractions - mostly so we can easily add profile code
-+static void vq_wait_pool_init(vq_wait_pool_t * const wp)
-+{
-+ unsigned int i;
-+ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
-+ sem_init(&wp->pool[i].sem, 0, 0);
-+ wp->pool[i].next = wp->pool + i + 1;
-+ }
-+ wp->head = wp->pool + 0;
-+ wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL;
-+}
-+
-+static void vq_wait_pool_deinit(vq_wait_pool_t * const wp)
-+{
-+ unsigned int i;
-+ wp->head = NULL;
-+ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
-+ sem_destroy(&wp->pool[i].sem);
-+ wp->pool[i].next = NULL;
-+ }
-+}
-+
-+
-+// Wait objects are pooled so that we don't pay for sem_init on every job
-+static vq_wait_t * vq_wait_new(void)
-+{
-+ gpu_env_t * const ge = gpu_lock_ref();
-+ vq_wait_t * const wait = ge->wait_pool.head;
-+ ge->wait_pool.head = wait->next;
-+ wait->next = NULL;
-+
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+ tto_start(&ge->ttw.active, ns_time());
-+#endif
-+
-+ gpu_unlock();
-+ return wait;
-+}
-+
-+static void vq_wait_delete(vq_wait_t * const wait)
-+{
-+ gpu_env_t * const ge = gpu_lock();
-+ wait->next = ge->wait_pool.head;
-+ ge->wait_pool.head = wait;
-+
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+ {
-+ trace_time_wait_t * const ttw = &ge->ttw;
-+ const int64_t now = ns_time();
-+ ++ttw->jcount;
-+ tto_end(&ttw->wait, now);
-+
-+ if (ttw->start0 == 0)
-+ {
-+ ttw->start0 = ttw->active.start[0];
-+ ttw->last_update = ttw->start0;
-+ }
-+ if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD)
-+ {
-+ ttw->last_update += WAIT_TIME_PRINT_PERIOD;
-+ ttw_print(ttw, now);
-+ }
-+ }
-+#endif
-+ gpu_unlock_unref(ge);
-+}
-+
-+static void vq_wait_wait(vq_wait_t * const wait)
-+{
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+ {
-+ const int64_t now = ns_time();
-+ gpu_env_t * const ge = gpu_lock();
-+ tto_start(&ge->ttw.wait, now);
-+ gpu_unlock();
-+ }
-+#endif
-+
-+ while (sem_wait(&wait->sem) == -1 && errno == EINTR)
-+ /* loop */;
-+}
-+
-+static void vq_wait_post(vq_wait_t * const wait)
-+{
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+ {
-+ gpu_env_t *const ge = gpu_lock();
-+ tto_end(&ge->ttw.active, ns_time());
-+ gpu_unlock();
-+ }
-+#endif
-+
-+ sem_post(&wait->sem);
-+}
-+
-+
-+
-+// Header comments were wrong for these two
-+#define VPU_QPU_MASK_QPU 1
-+#define VPU_QPU_MASK_VPU 2
-+
-+#define VPU_QPU_JOB_MAX 4
-+struct vpu_qpu_job_env_s
-+{
-+ unsigned int n;
-+ unsigned int mask;
-+ struct gpu_job_s j[VPU_QPU_JOB_MAX];
-+};
-+
-+typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
-+
-+vpu_qpu_job_env_t * vpu_qpu_job_new(void)
-+{
-+ vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t));
-+ return vqj;
-+}
-+
-+void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj)
-+{
-+ memset(vqj, 0, sizeof(*vqj));
-+ free(vqj);
-+}
-+
-+static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj)
-+{
-+ struct gpu_job_s * const j = vqj->j + vqj->n++;
-+ av_assert0(vqj->n <= VPU_QPU_JOB_MAX);
-+ return j;
-+}
-+
-+void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code,
-+ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5)
-+{
-+ if (vpu_code != 0) {
-+ struct gpu_job_s *const j = new_job(vqj);
-+ vqj->mask |= VPU_QPU_MASK_VPU;
-+
-+ j->command = EXECUTE_VPU;
-+ // The bottom two bits of the execute address contain no-flush flags
-+ // b0 will flush the VPU I-cache if unset so we nearly always want that set
-+ // as we never reload code
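-+ // (i.e. after the first job has run, q[0] is always vpu_code | 1)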
-+ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed;
-+ j->u.v.q[1] = r0;
-+ j->u.v.q[2] = r1;
-+ j->u.v.q[3] = r2;
-+ j->u.v.q[4] = r3;
-+ j->u.v.q[5] = r4;
-+ j->u.v.q[6] = r5;
-+ gpu->vpu_i_cache_flushed = 1;
-+ }
-+}
-+
-+// flags are QPU_FLAGS_xxx
-+void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail)
-+{
-+ if (n != 0) {
-+ struct gpu_job_s *const j = new_job(vqj);
-+ vqj->mask |= VPU_QPU_MASK_QPU;
-+
-+ j->command = EXECUTE_QPU;
-+ j->u.q.jobs = n;
-+#if RPI_TRACE_QPU_PROFILE_ALL
-+ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS;
-+#else
-+ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU;
-+#endif
-+ j->u.q.timeout = 5000;
-+ memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
-+ }
-+}
-+
-+// Convert callback to sem post
-+static void vpu_qpu_job_callback_wait(void * v)
-+{
-+ vq_wait_post(v);
-+}
-+
-+void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h)
-+{
-+ vq_wait_t * wait;
-+
-+ if (vqj->mask == 0) {
-+ *wait_h = NULL;
-+ return;
-+ }
-+
-+ // We are going to want a sync object
-+ wait = vq_wait_new();
-+
-+ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
-+ // If we only posted one thing or only QPU jobs
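-+ // e.g. a lone VPU job gets the callback attached to it directly; a VPU job
-+ // followed by a QPU job needs the extra EXECUTE_SYNC entry below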
-+ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
-+ {
-+ struct gpu_job_s * const j = vqj->j + (vqj->n - 1);
-+ av_assert0(j->callback.func == 0);
-+
-+ j->callback.func = vpu_qpu_job_callback_wait;
-+ j->callback.cookie = wait;
-+ }
-+ else
-+ {
-+ struct gpu_job_s *const j = new_job(vqj);
-+
-+ j->command = EXECUTE_SYNC;
-+ j->u.s.mask = vqj->mask;
-+ j->callback.func = vpu_qpu_job_callback_wait;
-+ j->callback.cookie = wait;
-+ }
-+
-+ vqj->mask = 0;
-+ *wait_h = wait;
-+}
-+
-+int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj)
-+{
-+ return vqj->n == 0 ? 0 : vc_gpuserv_execute_code(vqj->n, vqj->j);
-+}
-+
-+// Simple wrapper of start + delete
-+int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj)
-+{
-+ int rv;
-+ rv = vpu_qpu_job_start(vqj);
-+ vpu_qpu_job_delete(vqj);
-+ return rv;
-+}
-+
-+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
-+{
-+ if (wait_h != NULL)
-+ {
-+ vq_wait_t * const wait = *wait_h;
-+ if (wait != NULL) {
-+ *wait_h = NULL;
-+ vq_wait_wait(wait);
-+ vq_wait_delete(wait);
-+ }
-+ }
-+}
-+
-+int vpu_qpu_init(void)
-+{
-+ gpu_env_t * const ge = gpu_lock_ref();
-+ if (ge == NULL)
-+ return -1;
-+
-+ if (ge->init_count++ == 0)
-+ {
-+ vc_gpuserv_init();
-+ }
-+
-+ gpu_unlock();
-+ return 0;
-+}
-+
-+void vpu_qpu_term(void)
-+{
-+ gpu_env_t * const ge = gpu_lock();
-+
-+ if (--ge->init_count == 0) {
-+ vc_gpuserv_deinit();
-+
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+ ttw_print(&ge->ttw, ns_time());
-+#endif
-+ }
-+
-+ gpu_unlock_unref(ge);
-+}
-+
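-+// Translate a label in the ARM-side rpi_shader[] array into the VideoCore bus
-+// address of that code: VC base of the GPU struct + offset of qpu_code within
-+// it + byte offset of mc_fn from the start of the array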
-+uint32_t qpu_fn(const int * const mc_fn)
-+{
-+ return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)rpi_shader) + offsetof(struct GPU, qpu_code);
-+}
-+
-+
-+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth)
-+{
-+ // Dummy values we can catch with emulation
-+ qf->y_pxx = ~1U;
-+ qf->y_bxx = ~2U;
-+ qf->y_p00 = ~3U;
-+ qf->y_b00 = ~4U;
-+ qf->c_pxx = ~5U;
-+ qf->c_bxx = ~6U;
-+
-+ switch (bit_depth) {
-+ case 8:
-+ qf->y_pxx = qpu_fn(mc_filter_y_pxx);
-+ qf->y_bxx = qpu_fn(mc_filter_y_bxx);
-+ qf->y_p00 = qpu_fn(mc_filter_y_p00);
-+ qf->y_b00 = qpu_fn(mc_filter_y_b00);
-+ qf->c_pxx = qpu_fn(mc_filter_c_p);
-+ qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1);
-+ qf->c_bxx = qpu_fn(mc_filter_c_b);
-+ break;
-+ case 10:
-+ qf->c_pxx = qpu_fn(mc_filter_c10_p);
-+ qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1);
-+ qf->c_bxx = qpu_fn(mc_filter_c10_b);
-+ qf->y_pxx = qpu_fn(mc_filter_y10_pxx);
-+ qf->y_bxx = qpu_fn(mc_filter_y10_bxx);
-+ qf->y_p00 = qpu_fn(mc_filter_y10_p00);
-+ qf->y_b00 = qpu_fn(mc_filter_y10_b00);
-+ break;
-+ default:
-+ return -1;
-+ }
-+ return 0;
-+}
-+
-+#endif // RPI
-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-new file mode 100644
-index 0000000000..9389047f8e
---- /dev/null
-+++ b/libavcodec/rpi_qpu.h
-@@ -0,0 +1,208 @@
-+#ifndef RPI_QPU_H
-+#define RPI_QPU_H
-+
-+#define RPI_ONE_BUF 1
-+
-+typedef struct gpu_mem_ptr_s {
-+ unsigned char *arm; // Pointer to memory mapped on ARM side
-+ int vc_handle; // Videocore handle of relocatable memory
-+ int vcsm_handle; // Handle for use by VCSM
-+ int vc; // Address for use in GPU code
-+ int numbytes; // Size of memory block
-+} GPU_MEM_PTR_T;
-+
-+// General GPU functions
-+extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
-+extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
-+extern void gpu_free(GPU_MEM_PTR_T * const p);
-+
-+#include "libavutil/frame.h"
-+#if !RPI_ONE_BUF
-+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
-+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[0]);
-+ return p->vc;
-+}
-+
-+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
-+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-+ return p->vc;
-+}
-+
-+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
-+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[2]);
-+ return p->vc;
-+}
-+
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
-+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[0]);
-+}
-+
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
-+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[1]);
-+}
-+
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
-+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[2]);
-+}
-+
-+#else
-+
-+static inline int gpu_is_buf1(const AVFrame * const frame)
-+{
-+ return frame->buf[1] == NULL;
-+}
-+
-+static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame)
-+{
-+ return av_buffer_get_opaque(frame->buf[0]);
-+}
-+
-+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n)
-+{
-+ return av_buffer_pool_opaque(frame->buf[n]);
-+}
-+
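-+// Convert an ARM-side plane pointer back to a VideoCore bus address by adding
-+// the plane's offset within the allocation to the allocation's vc address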
-+static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n)
-+{
-+ const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n);
-+ return gm->vc + (frame->data[n] - gm->arm);
-+}
-+
-+
-+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
-+ return get_vc_address3(frame, 0);
-+}
-+
-+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
-+ return get_vc_address3(frame, 1);
-+}
-+
-+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
-+ return get_vc_address3(frame, 2);
-+}
-+
-+#if 0
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
-+ if (gpu_is_buf1(frame))
-+ {
-+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
-+ g.numbytes = frame->data[1] - frame->data[0];
-+ return g;
-+ }
-+ else
-+ return *gpu_buf3_gmem(frame, 0);
-+}
-+
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
-+ if (gpu_is_buf1(frame))
-+ {
-+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
-+ g.arm += frame->data[1] - frame->data[0];
-+ g.vc += frame->data[1] - frame->data[0];
-+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
-+ return g;
-+ }
-+ else
-+ return *gpu_buf3_gmem(frame, 1);
-+}
-+
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
-+ if (gpu_is_buf1(frame))
-+ {
-+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
-+ g.arm += frame->data[2] - frame->data[0];
-+ g.vc += frame->data[2] - frame->data[0];
-+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
-+ return g;
-+ }
-+ else
-+ return *gpu_buf3_gmem(frame, 2);
-+}
-+#endif
-+#endif
-+
-+// Cache flush stuff
-+
-+struct rpi_cache_flush_env_s;
-+typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t;
-+
-+rpi_cache_flush_env_t * rpi_cache_flush_init(void);
-+// Free env without flushing
-+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe);
-+// Do the accumulated flush & clear but do not free the env
-+int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe);
-+// Do the accumulated flush & free the env
-+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe);
-+
-+typedef enum
-+{
-+ RPI_CACHE_FLUSH_MODE_INVALIDATE = 1,
-+ RPI_CACHE_FLUSH_MODE_WRITEBACK = 2,
-+ RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3
-+} rpi_cache_flush_mode_t;
-+
-+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode);
-+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode,
-+ const unsigned int offset, const unsigned int size);
-+void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
-+ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride);
-+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode);
-+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode,
-+ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
-+ const unsigned int uv_shift, const int do_luma, const int do_chroma);
-+
-+// init, add, finish for one gm ptr
-+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode);
-+
-+
-+// QPU specific functions
-+
-+typedef struct HEVCRpiQpu {
-+ uint32_t c_pxx;
-+ uint32_t c_pxx_l1;
-+ uint32_t c_bxx;
-+ uint32_t y_pxx;
-+ uint32_t y_bxx;
-+ uint32_t y_p00;
-+ uint32_t y_b00;
-+} HEVCRpiQpu;
-+
-+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth);
-+
-+uint32_t qpu_fn(const int * const mc_fn);
-+
-+#define QPU_N_GRP 4
-+#define QPU_N_MAX 12
-+
-+#define QPU_MAIL_EL_VALS 2
-+
-+struct vpu_qpu_wait_s;
-+typedef struct vq_wait_s * vpu_qpu_wait_h;
-+
-+// VPU specific functions
-+
-+struct vpu_qpu_job_env_s;
-+typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h;
-+
-+vpu_qpu_job_h vpu_qpu_job_new(void);
-+void vpu_qpu_job_delete(const vpu_qpu_job_h vqj);
-+void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code,
-+ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);
-+void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail);
-+void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h);
-+int vpu_qpu_job_start(const vpu_qpu_job_h vqj);
-+int vpu_qpu_job_finish(const vpu_qpu_job_h vqj);
-+
-+extern unsigned int vpu_get_fn(const unsigned int bit_depth);
-+extern unsigned int vpu_get_constants(void);
-+
-+// Waits for the previously posted code to complete and nulls out *wait_h after use
-+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h);
-+int vpu_qpu_init(void);
-+void vpu_qpu_term(void);
-+
-+extern int gpu_get_mailbox(void);
-+void gpu_ref(void);
-+void gpu_unref(void);
-+
-+#endif
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
-new file mode 100644
-index 0000000000..2c6541a8fb
---- /dev/null
-+++ b/libavcodec/rpi_shader.c
-@@ -0,0 +1,1570 @@
-+#include "rpi_shader.h"
-+
-+#ifdef _MSC_VER
-+ #include <stdint.h>
-+ /* cast through uintptr_t to avoid warnings */
-+ #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))
-+#else
-+ #define POINTER_TO_UINT(X) ((unsigned int)(X))
-+#endif
-+
-+#ifdef __cplusplus
-+extern "C" { /* the types are probably wrong... */
-+#endif
-+#ifdef __cplusplus
-+}
-+#endif
-+
-+#ifdef _MSC_VER
-+__declspec(align(8))
-+#elif defined(__GNUC__)
-+__attribute__((aligned(8)))
-+#endif
-+unsigned int rpi_shader[] = {
++unsigned int ff_hevc_rpi_shader[] = {
+// ::mc_setup_c_q0
+// ::mc_start
+/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
@@ -28869,2552 +17987,21710 @@ index 0000000000..2c6541a8fb
+/* [0x00002db0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+// ::mc_end
+};
-+#ifdef __HIGHC__
-+#pragma Align_to(8, rpi_shader)
++#ifdef __HIGHC__
++#pragma Align_to(8, ff_hevc_rpi_shader)
++#endif
+diff --git a/libavcodec/rpi_hevc_shader.h b/libavcodec/rpi_hevc_shader.h
+new file mode 100644
+index 0000000000..ddb351782d
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader.h
+@@ -0,0 +1,63 @@
++#ifndef rpi_hevc_shader_H
++#define rpi_hevc_shader_H
++
++extern unsigned int ff_hevc_rpi_shader[];
++
++#define mc_setup_c_q0 (ff_hevc_rpi_shader + 0)
++#define mc_start (ff_hevc_rpi_shader + 0)
++#define mc_setup_c_qn (ff_hevc_rpi_shader + 2)
++#define mc_filter_c_p (ff_hevc_rpi_shader + 142)
++#define mc_filter_c_p_l1 (ff_hevc_rpi_shader + 272)
++#define mc_filter_c_b (ff_hevc_rpi_shader + 402)
++#define mc_sync_q0 (ff_hevc_rpi_shader + 590)
++#define mc_sync_q1 (ff_hevc_rpi_shader + 608)
++#define mc_sync_q2 (ff_hevc_rpi_shader + 620)
++#define mc_sync_q3 (ff_hevc_rpi_shader + 632)
++#define mc_sync_q4 (ff_hevc_rpi_shader + 644)
++#define mc_sync_q5 (ff_hevc_rpi_shader + 662)
++#define mc_sync_q6 (ff_hevc_rpi_shader + 674)
++#define mc_sync_q7 (ff_hevc_rpi_shader + 686)
++#define mc_sync_q8 (ff_hevc_rpi_shader + 698)
++#define mc_sync_q9 (ff_hevc_rpi_shader + 716)
++#define mc_sync_q10 (ff_hevc_rpi_shader + 728)
++#define mc_sync_q11 (ff_hevc_rpi_shader + 740)
++#define mc_exit_c_qn (ff_hevc_rpi_shader + 752)
++#define mc_exit_y_qn (ff_hevc_rpi_shader + 752)
++#define mc_exit_c_q0 (ff_hevc_rpi_shader + 770)
++#define mc_exit_y_q0 (ff_hevc_rpi_shader + 770)
++#define mc_setup_y_q0 (ff_hevc_rpi_shader + 790)
++#define mc_setup_y_qn (ff_hevc_rpi_shader + 792)
++#define mc_filter_y_pxx (ff_hevc_rpi_shader + 1032)
++#define mc_filter_y_bxx (ff_hevc_rpi_shader + 1162)
++#define mc_filter_y_p00 (ff_hevc_rpi_shader + 1292)
++#define mc_filter_y_b00 (ff_hevc_rpi_shader + 1382)
++#define mc_setup_c10_q0 (ff_hevc_rpi_shader + 1462)
++#define mc_setup_c10_qn (ff_hevc_rpi_shader + 1464)
++#define mc_filter_c10_p (ff_hevc_rpi_shader + 1600)
++#define mc_filter_c10_p_l1 (ff_hevc_rpi_shader + 1728)
++#define mc_filter_c10_b (ff_hevc_rpi_shader + 1856)
++#define mc_sync10_q0 (ff_hevc_rpi_shader + 2042)
++#define mc_sync10_q1 (ff_hevc_rpi_shader + 2060)
++#define mc_sync10_q2 (ff_hevc_rpi_shader + 2072)
++#define mc_sync10_q3 (ff_hevc_rpi_shader + 2084)
++#define mc_sync10_q4 (ff_hevc_rpi_shader + 2096)
++#define mc_sync10_q5 (ff_hevc_rpi_shader + 2114)
++#define mc_sync10_q6 (ff_hevc_rpi_shader + 2126)
++#define mc_sync10_q7 (ff_hevc_rpi_shader + 2138)
++#define mc_sync10_q8 (ff_hevc_rpi_shader + 2150)
++#define mc_sync10_q9 (ff_hevc_rpi_shader + 2168)
++#define mc_sync10_q10 (ff_hevc_rpi_shader + 2180)
++#define mc_sync10_q11 (ff_hevc_rpi_shader + 2192)
++#define mc_exit_c10_q0 (ff_hevc_rpi_shader + 2204)
++#define mc_exit_y10_q0 (ff_hevc_rpi_shader + 2204)
++#define mc_exit_c10_qn (ff_hevc_rpi_shader + 2224)
++#define mc_exit_y10_qn (ff_hevc_rpi_shader + 2224)
++#define mc_setup_y10_q0 (ff_hevc_rpi_shader + 2242)
++#define mc_setup_y10_qn (ff_hevc_rpi_shader + 2244)
++#define mc_filter_y10_pxx (ff_hevc_rpi_shader + 2494)
++#define mc_filter_y10_p00 (ff_hevc_rpi_shader + 2624)
++#define mc_filter_y10_bxx (ff_hevc_rpi_shader + 2716)
++#define mc_filter_y10_b00 (ff_hevc_rpi_shader + 2846)
++#define mc_end (ff_hevc_rpi_shader + 2926)
++
++#endif
+diff --git a/libavcodec/rpi_hevc_shader.qasm b/libavcodec/rpi_hevc_shader.qasm
+new file mode 100644
+index 0000000000..f8572cdebe
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader.qasm
+@@ -0,0 +1,1741 @@
++
++# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
++# the warning that we are using rotation & ra/rb registers. r0..3 can be
++# rotated through all 16 elems ra regs can only be rotated through their
++# local 4. As it happens this is what is wanted here as we do not want the
++# constants from the other half of the calc.
++
++# PREREAD is the number of requests that we have sitting in the TMU request
++# queue.
++#
++# There are 8 slots available in the TMU request Q for tm0s requests, but
++# only 4 output FIFO entries and overflow is bad (corruption or crash)
++# (If threaded then only 2 out FIFO entries, but we aren't.)
++# In s/w we are effectively limited to the min vertical read which is >= 4
++# so output FIFO is the limit.
++#
++# However in the current world there seems to be no benefit (and a small
++# overhead) in setting this bigger than 2.
++
++.set PREREAD, 4
++
++# Block heights - 8 & 16 are the only numbers we currently support
++
++.set C_BLK_HEIGHT_8, 16
++.set C_BLK_HEIGHT_16, 8
++.set Y_BLK_HEIGHT_8, 16
++.set Y_BLK_HEIGHT_16, 8
++
++# QPU counts - depend on block size
++# If we have a 2-byte format & block_size > 8 then can only afford
++# 8 QPUs
++# These numbers must match the numbers in ff_hevc_rpi_shader_cmd.h
++
++.set N_QPU_8, 12
++.set N_QPU_16, 12
++
++# register allocation
++#
++
++# ra0-3
++# Used as temp and may be loop filter coeffs (split into .8s)
++# or temp in loop. Check usage on an individual basis.
++
++# ra4-7
++# C: L0 H filter out FIFO
++# otherwise -- free --
++
++# ra8-11
++# temp in some places - check usage
++# Y: (with rb8-11) horiz out FIFO
++
++# ra12-15
++# -- free --
++
++# uniform: width:height
++.set ra_width_height, ra16
++.set ra_width, ra16.16b
++.set ra_height, ra16.16a
++
++# y:y2 same layout as y_y2_next so we can update both together
++.set ra_y_y2, ra17
++.set ra_y2, ra17.16a
++.set ra_y, ra17.16b
++
++# uniform: L1 weight (U on left, V on right)
++# Only used in Y B
++.set ra_wt_off_mul_l1, ra18
++.set ra_wt_off_l1, ra18.16b
++.set ra_wt_mul_l1, ra18.16a
++
++# y_next:y2_next same layout as y_y2 so we can update both together
++.set ra_y_y2_next, ra19
++.set ra_y_next, ra19.16b
++.set ra_y2_next, ra19.16a
++
++# Setup: consts - subdivide a single register
++.set ra_kff100100, ra20
++.set ra_k256, ra20.16a
++.set ra_k0, ra20.8a
++.set ra_k1, ra20.8b
++.set ra_k16, ra20.8c
++.set ra_k255, ra20.8d
++
++# Loop: xshifts
++.set ra_xshift, ra21.16a
++.set ra_xshift_next, ra21.16b
++
++# Loop var: L0 weight (U on left, V on right)
++# _off_ is not used in loop as we want to modify it before use
++.set ra_wt_off_mul_l0, ra22
++.set ra_wt_mul_l0, ra22.16a
++.set ra_wt_off_l0, ra22.16b
++
++# Max pel value (for 8 bit we can get away with sat ops but not 9+)
++# * Could merge with rb_pmask. For 10 bit Logically pmask needs 0xff in the
++# 2nd byte but as the source should never be > 3 there 0x3ff should do
++.set ra_blk_height_pmax, ra23
++.set ra_pmax, ra23.16a
++.set ra_blk_height, ra23.8c
++# -- free -- ra23.8d
++
++# Loop: src frame base (L0)
++.set ra_base, ra24
++
++# Loop: src frame base (L1)
++.set ra_base2, ra25
++
++# Loop: next src frame base (L0)
++.set ra_base_next, ra26
++
++# -- free -- ra27
++# -- free -- ra28
++# -- free -- ra29
++
++# Use an even numbered register as a link register to avoid corrupting flags
++.set ra_link, ra30
++
++# -- free -- ra31
++
++.set rb_xshift2, rb0
++.set rb_xshift2_next, rb1
++
++# C: (elem & 1) == 0 ? elem * 2 : (elem + 4) * 2
++.set rb_elem_x, rb2
++
++# El Flags
++# After adding to self we have el even/odd on nc/c and lo/hi on nn/n
++.set rb_ef, rb3
++
++# rb4-7
++# C-B: L1 H filter out FIFO
++# Y: (with ra2.8x) Y vertical filter coeffs
++
++# rb8-11
++# C: Vertical filter coeffs
++# Y: (with ra8-11) horiz out FIFO
++
++# Loop var: offset to add before shift (round + weighting offsets)
++# Exact value varies by loop
++.set rb_wt_off, rb12
++
++# Setup: denom + 6 + 9
++.set rb_wt_den_p15, rb13
++
++# -- free -- rb14
++# -- free -- rb15
++
++# Line pitch (128 for sand128)
++.set rb_pitch, rb16
++
++# Loop count - 2 (set up TMU for next xfer)
++.set rb_i_tmu, rb17
++
++# Loop count for min(height, 16)
++# Y will reset & loop again if height > 16
++.set rb_lcount, rb18
++
++# frame_base2_next
++.set rb_base2_next, rb19
++
++# Setup: Height of Y+C in sand, (x&mask)*xpitch will give
++# offset to the slice
++.set rb_xpitch, rb20
++
++# -- free -- rb21
++
++# Setup: 0xff (8-bit) / 0xffff (9+ bit)
++.set rb_pmask, rb22
++
++# Loop: destination address
++.set rb_dest, rb23
++
++# vdw_setup_1(dst_pitch)
++.set rb_dma1_base, rb24
++
++# Setup: pic width - 1
++# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc.
++.set rb_max_x, rb25
++
++# Loop: height<<23 + width<<16 + vdw_setup_0
++.set rb_dma0, rb26
++
++# vdw_setup_0 (depends on QPU number)
++.set rb_dma0_base, rb27
++
++# Setup: vw_setup value to reset VPM write pointer
++.set rb_vpm_init, rb28
++
++# Loop: vdw_setup_1(dst_pitch-width) = stride
++.set rb_dma1, rb29
++
++# Setup: pic_height - 1
++.set rb_max_y, rb30
++
++# -- free -- rb31
++
++
++
++
++# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
++.set i_shift16, -16
++.set i_shift21, -11
++.set i_shift23, -9
++.set i_shift30, -2
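++# e.g. (-9) & 31 == 23, so shifting by i_shift23 acts as a shift by 23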
++
++# Much of the setup code is common between Y & C
++# Macros that express this - obviously these can't be overlapped
++# so are probably unsuitable for loop code
++
++.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma
++ mov r2, qpu_num
++.if v_bit_depth <= 8
++ # 8 bit version
++ asr r1, r2, 2
++ shl r1, r1, 6
++ and r0, r2, 3
++ or r0, r0, r1
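++ # e.g. qpu_num 5: r1 = (5 >> 2) << 6 = 64, r0 = (5 & 3) | 64 = 65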
++
++ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
++ add r_vpm, r0, r1 # VPM 8bit storage
++
++ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
++ shl r0, r0, 5
++
++.else
++ # 16 bit version
++ # Limited to 8 QPUs if blk height > 8
++ asr r1, r2, 1
++.if v_blk_height <= 8
++ shl r1, r1, 4
++.else
++ shl r1, r1, 5
++.endif
++ and r0, r2, 1
++ or r0, r0, r1
++
++ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR
++ add r_vpm, r0, r1
++
++ # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into
++ # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg)
++ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later
++ shl r0, r0, 6
++.endif
++ add r_dma, r0, r1 # DMA out
++.endm
++
++
++.macro m_setup_q0
++ srel -, 12
++.endm
++
++# Code start label
++::mc_start
++
++################################################################################
++# mc_setup_uv(next_kernel, x, y, ref_c_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
++
++.macro m_setup_c, v_bit_depth
++
++# Cannot use mul24 on x as x might be -ve, so must use shift
++.if v_bit_depth <= 8
++.set v_x_shift, 1
++.set v_pmask, 0xff
++.set v_blk_height, C_BLK_HEIGHT_8
++.else
++.set v_x_shift, 2
++.set v_pmask, 0xffff
++.set v_blk_height, C_BLK_HEIGHT_16
++.endif
++
++ mov tmurs, 1 # No swap TMUs
++
++# Load first request location
++ mov ra0, unif # next_x_y
++
++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++ shl rb_ef, r0, i_shift30
++
++ mov ra_base, unif # Store frame c base
++
++# Read image dimensions
++ sub r0, unif, 1 # pic c width
++ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes
++ sub rb_max_y, unif, 1 # pic c height
++
++# load constants
++ mov ra_kff100100, 0xff100100
++ mov rb_pmask, v_pmask
++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++
++# get source pitch
++ mov rb_xpitch, unif # stride2
++ mov rb_pitch, unif # stride1
++ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly
++ add rb_dma1_base, r1, rb_pitch # vdw_setup_1
++
++ and r0, 1, elem_num
++ nop ; mul24 r0, r0, 5
++.if v_bit_depth <= 8
++ add rb_elem_x, r0, elem_num
++.else
++ add r0, r0, elem_num
++ add rb_elem_x, r0, r0
++.endif
++
++# Compute base address for first and second access
++# ra_base ends up with t0s base
++# ra_base2 ends up with t1s base
++
++ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay]
++ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice
++ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y
++ min r0, r0, rb_max_x
++
++# Get shift
++# Shift will always calculate as 0 for 9+ bit
++# Ideally we can optimize the shift out of the code in these cases but for now
++# it is tidier to leave it in
++.if v_bit_depth <= 8
++ shl ra_xshift_next, r0, 3
++.else
++ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0
++.endif
++
++# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to
++
++.if v_bit_depth <= 8
++ and r0, r0, -4
++.endif
++ sub r1, ra_k0, rb_pitch
++ and r1, r0, r1
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1
++ add ra_base, ra_base, r0
++
++ add rb_wt_den_p15, 23 - v_bit_depth, unif # denominator
++
++# Compute part of VPM to use for DMA output
++# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop?
++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
++
++# And again for L1, but only worrying about frame2 stuff
++
++# Load first request location
++ mov ra0, unif # next_x_y
++
++ mov ra_base2, unif # [ra0 delay] Store frame c base
++
++# Compute base address for first and second access
++# ra_base ends up with t0s base
++# ra_base2 ends up with t1s base
++
++ shl r0, ra0.16b, v_x_shift
++ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset
++ max r0, r0, 0
++ min r0, r0, rb_max_x
++
++# Get shift (already zero if 9+ bit so ignore)
++.if v_bit_depth <= 8
++ shl rb_xshift2_next, r0, 3
++.endif
++
++# In a single 32 bit word we get 2 UV pairs so mask the bottom bits of x
++
++.if v_bit_depth <= 8
++ and r0, r0, -4
++.endif
++ sub r1, ra_k0, rb_pitch
++ and r1, r0, r1
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov r2, ra_y2
++ add ra_base2, ra_base2, r0
++
++# Do preloads
++# r0 = ra_y, r2 = ra_y2
++ mov r3, PREREAD ; mov r0, ra_y
++
++:1
++ sub.setf r3, r3, 1
++ max r1, r0, 0
++ min r1, r1, rb_max_y
++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++ add t0s, ra_base, r1 ; mov ra_y, r0
++
++ max r1, r2, 0
++ brr.anynz -, r:1b
++ min r1, r1, rb_max_y
++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++ add t1s, ra_base2, r1 ; mov ra_y2, r2
++# >>> .anynz 1b
++
++ mov ra_link, unif # link
++# touch registers to keep simulator happy
++ # ra/b4..7: B0 -> B stash registers
++ mov ra4, 0 ; mov rb4, 0
++ bra -, ra_link
++ mov ra5, 0 ; mov rb5, 0
++ mov ra6, 0 ; mov rb6, 0
++ mov ra7, 0 ; mov rb7, 0
++# >>> ra_link
++.endm
++
++::mc_setup_c_q0
++ m_setup_q0
++::mc_setup_c_qn
++ m_setup_c 8
++
++################################################################################
++
++# mc_filter_uv(next_kernel, x, y, frame_c_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst)
++
++# At this point we have already issued two pairs of texture requests for the current block
++# ra_x, ra_x16_base point to the current coordinates for this block
++
++.macro m_filter_c_p, v_tmu, v_bit_depth
++
++.if v_bit_depth <= 8
++.set v_x_shift, 1
++.set v_x_mul, 2
++.set v_v_shift, 8
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 7
++.set v_dma_wh_shift, i_shift16
++.else
++.set v_x_shift, 2
++.set v_x_mul, 4
++.set v_v_shift, i_shift16
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 8
++.set v_dma_wh_shift, 15
++.endif
++
++.if v_tmu == 0
++.set vrx_xshift, rb_xshift2 # b side more convenient
++.set vrx_xshift_next, ra_xshift_next
++.set vra_y_next, ra_y_next
++.set vrx_base_next, ra_base_next
++.set vra_y, ra_y
++.set vra_base, ra_base
++.set vr_txs, t0s
++.else
++.set vrx_xshift, ra_xshift # a side more convenient
++.set vrx_xshift_next, rb_xshift2_next
++.set vra_y_next, ra_y2_next
++.set vrx_base_next, rb_base2_next
++.set vra_y, ra_y2
++.set vra_base, ra_base2
++.set vr_txs, t1s
++.endif
++
++# per-channel shifts were calculated on the *previous* invocation
++# get base addresses and per-channel shifts for *next* invocation
++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
++
++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base
++
++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0
++ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
++ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs
++ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++
++.if v_bit_depth <= 8
++ shl vrx_xshift_next, r0, 3
++ and r0, r0, -4
++.endif
++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced!
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs
++ add vrx_base_next, r3, r0 ; mov r1, ra_height
++
++# set up VPM write
++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight
++ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
++ add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight
++
++# ; unpack filter coefficients
++
++ shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
++ add r0, r0, r2 ; mov rb9, ra3.8b # Combine width and height of destination area (r0=h<<8, r2=w*2)
++ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register
++ add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight
++
++ mov rb_dest, unif ; mov ra9, rb_max_y # dst_addr ; alias rb_max_y
++
++ shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
++
++ asr rb_wt_off, r1, 2 ; mov ra_link, unif # ; Link
++ sub ra3, rb_wt_den_p15, ra_k1
++
++# r5 = 0 (loop counter)
++# ra9 = alias for rb_max_y
++# ra_wt_mul_l0 = weight L0
++# ra3 = weight denom + 22 - bit_depth [= rb_wt_den_p15 - 1, max 19]
++# rb_wt_off = (offset * 2 + 1) << (ra3 - 1)
++
++# We want (r0r1)
++# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2U5 : ...
++# We fetch (after shift)
++# C0 : C3 : C1 : C4 : C2 : C5 : ...
++
++:1
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++
++.if v_tmu == 0
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment
++ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
++.else
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 # loop counter increment
++ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
++.endif
++
++ add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++ min r3, r3, ra9 ; mov.ifnc r0, r2
++
++ mov ra4, ra5 ; mul24 r2, r3, rb_pitch
++ add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask # mask out all but the bottom byte
++
++# apply horizontal filter
++# The filter coeffs for the two halves of this are the same (unlike in the
++# Y case) so it doesn't matter which ra0 we get them from
++# Also as the two halves are locked together we don't need to separate the 1st
++# r0 mul or the last r1 mul as they are valid for all QPUs
++
++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++ sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
++
++# V filter = -ra4 * rb8 + ra5 * rb9 + ra6 * rb10 - ra7 * rb11 (post FIFO shift)
++# Have to dup block as we need to move the brr - code is more common than it
++# looks at first glance
++.if v_bit_depth <= 8
++ brr.anyn -, r:1b
++ add r2, r2, r3 ; mov ra5, ra6
++ mov ra6, ra7 ; mul24 r1, ra7, rb10
++ sub ra7, r2, r0 ; mul24 r0, ra4, rb8
++.else
++ add r2, r2, r3 ; mov ra5, ra6
++ brr.anyn -, r:1b
++ mov ra6, ra7 ; mul24 r1, ra7, rb10
++ sub r2, r2, r0 ; mul24 r0, ra4, rb8
++ asr ra7, r2, v_bit_depth - 8
++.endif
++# >>> .anyn 1b
++
++ sub r1, r1, r0 ; mul24 r0, ra5, rb9 # [ra7 delay]
++ add r1, r1, r0 ; mul24 r0, ra7, rb11
++ sub r1, r1, r0
++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
++ asr r1, r1, 14
++ nop ; mul24 r1, r1, ra_wt_mul_l0
++ shl r1, r1, 8 ; mov r3, ra_blk_height
++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++ brr.anyn -, r:1b
++ asr r1, r1, ra3
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++# >>> .anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height (currently always 16)
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc rb_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ add rb_lcount, rb_lcount, r0
++ brr -, r:1b
++ add rb_dma0, rb_dma0, r1
++ add rb_dest, rb_dest, r2
++ mov vw_setup, rb_vpm_init # Reset our VPM write pointer
++# >>> 1b
++.endm
++
++# At 10 bits
++# Worst case +ve after 1st filter = 74 * 0x3ff >> 2 = 18925 0x49ed (15 bits)
++# Worst case -ve after 1st filter = -10 * 0x3ff >> 2 = -10230
++# after 2nd (really we can't get this) = 74 * 18925 + 10 * 10230 >> 6 = 23480 = 0x5bb8 (15 bits)
++# (P)
++# * weight (255) = 5987400 = 0x5b5c48 (23 bits)
++# + 0x3ff << (13 - bit_depth + 7) = 0x6b5848 (23 bits)
++# ... should be OK
++#
++# (B)
++# *2 (L0+L1) = 5963920 = 0x5b0090 (23 bits)
++# + (offset * 2 + 1) << (15 - bit_depth + 7) = 5963920 + (0x3ff << 12) = 5963920 + 4190208 = 10154128 = 0x9af090 (24 bits)
++# So signed overflow if we sign extend here :-(
++#
++# In practice this doesn't happen (we need a maximal offset and a very unlucky
++# filter).
++#
++# This could be fixed by offsetting the filters s.t. they are unsigned until
++# weight mul and then removing the offset with the weighting offset (I think
++# this should work) or splitting the rounding & offsetting
++
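++# A standalone C sanity check of the numbers above (illustrative only, not
++# part of the build):
++#   assert(((74 * 0x3ff) >> 2) == 18925);
++#   assert(((74 * 18925 + 10 * 10230) >> 6) == 23480);
++#   assert(23480 * 255 + (0x3ff << 10) == 0x6b5848);  /* P, bit_depth 10 */
++#   assert(23480 * 254 + (0x3ff << 12) == 0x9af090);  /* B - bit 23 set */
++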
++::mc_filter_c_p
++ m_filter_c_p 0, 8
++
++::mc_filter_c_p_l1
++ m_filter_c_p 1, 8
++
++################################################################################
++
++# mc_filter_c_b
++
++# At this point we have already issued two pairs of texture requests for the current block
++# ra_x, ra_x16_base point to the current coordinates for this block
++
++.macro m_filter_c_b, v_bit_depth
++
++.if v_bit_depth <= 8
++.set v_x_shift, 1
++.set v_v_shift, 8
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 7
++.set v_dma_wh_shift, i_shift16
++.else
++.set v_x_shift, 2
++.set v_v_shift, i_shift16
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 8
++.set v_dma_wh_shift, 15
++.endif
++.set v_x_mul, (1 << v_x_shift)
++
++# per-channel shifts were calculated on the *previous* invocation
++
++# get base addresses and per-channel shifts for *next* invocation
++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
++
++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base
++
++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0
++ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
++ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++ min r0, r0, rb_max_x ; mov ra0, unif # L0 H filter coeffs
++
++.if v_bit_depth <= 8
++ shl ra_xshift_next, r0, 3
++.endif
++
++ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs
++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs)
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height
++ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B
++
++# set up VPM write
++
++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight
++ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
++ add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif # ; V weight
++
++ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2
++ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base
++ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register
++ add rb_dma0, r0, rb_dma0_base ; mov ra1, unif # ; H filter coeffs
++
++# L1 - uniform layout could possibly be optimized
++
++ shl r0, ra3.16b, v_x_shift # r0=x*2
++ add r0, r0, rb_elem_x ; mov ra3, unif # ; V filter coeffs
++ sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif # [ra3 delay] r1=pitch2 mask ; U offset/weight
++ max r0, r0, r5 ; mov rb8, ra3.8a # ; start unpacking filter coeffs
++ min r0, r0, rb_max_x ; mov rb9, ra3.8b
++
++.if v_bit_depth <= 8
++ shl rb_xshift2_next, r0, 3
++.endif
++
++ and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif # ; V offset/weight
++ and r1, r0, r1 ; mov rb10, ra3.8c
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov rb_dest, unif # Add stripe offsets ; dst_addr
++ add rb_base2_next, r3, r0
++
++ mov ra9, rb_max_y ; mov rb11, ra3.8d
++ shl r1, ra_wt_off_l1, rb_wt_den_p15
++ asr rb_wt_off, r1, 9 ; mov ra_link, unif # link
++
++# r5 loop counter
++# ra0 H coeffs L0
++# ra1 H coeffs L1
++# ra2 V coeffs L0
++# ra3 temp
++# ra4-7 L0 H FIFO
++# rb4-7 L1 H FIFO
++# rb8-rb11 V coeffs L1
++# ra9 rb_max_y alias
++
++:1
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment
++ shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next
++ shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
++ add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
++ add ra_y, 1, ra_y ; mov r3, ra_y
++
++ max r3, r3, ra_k0 ; mov r0, r1 << 15
++ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++
++ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++ add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask # v8min masks out all but bottom byte
++
++# L0 H-filter
++# H FIFO scrolls are spread all over this loop
++ mov rb4, rb5 ; mov ra4, ra5 # ? Just moves
++
++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra0.8d, r1
++.if v_bit_depth <= 8
++ sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1
++.else
++ sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1
++ asr ra3, r2, (v_bit_depth - 8)
++.endif
++
++ shr r2, r4, rb_xshift2 ; mov ra5, ra6
++ shr r1, r2, v_v_shift ; mov r3, ra_y2
++ add ra_y2, r3, ra_k1 ; mov rb6, rb7
++
++ max r3, r3, ra_k0 ; mov r0, r1 << 15
++ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++
++ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++ add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask # v8min masks out all but bottom byte
++
++# L1 H-filter
++
++ and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0
++ nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++ sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1
++# V filters - start in branch delay slots of H
++# Final asr not needed for 8-bit but we can't (currently) save a whole instruction
++ add r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++ brr.anyn -, r:1b
++ mov ra6, ra7 ; mul24 r3, ra7, rb10
++ sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a
++ asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3
++# >>> .anyn 1b
++
++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c # [rb7 delay]
++ add r1, r1, r0 ; mul24 r0, rb7, ra2.8d
++ sub r2, r1, r0 ; mul24 r0, ra4, rb8
++ sub r1, r3, r0 ; mul24 r0, ra5, rb9
++ add r1, r1, r0 ; mul24 r0, ra7, rb11
++ sub r1, r1, r0 ; mul24 r2, r2, ra_k256
++
++ asr r2, r2, 14 ; mul24 r1, r1, ra_k256
++ asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0
++
++ add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 # rb_wt_off = (offsetL0 + offsetL1 + 1) << (rb_wt_den_p15 - 9)
++ add r1, r1, r2 ; mov r3, ra_blk_height
++
++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend
++
++ brr.anyn -, r:1b
++ asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++# >>> .anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height (currently always 16)
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc rb_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ add rb_lcount, rb_lcount, r0
++ brr -, r:1b
++ add rb_dma0, rb_dma0, r1
++ add rb_dest, rb_dest, r2
++ mov vw_setup, rb_vpm_init # Reset our VPM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_c_b
++ m_filter_c_b 8
++
++################################################################################
++# Exit code used by both Luma & Chroma so place between them to avoid I-cache
++# conflicts
++
++.macro m_exit_drain
++.if PREREAD == 2
++# Special case 2 as loop is wasteful
++ nop ; nop ; ldtmu0
++ nop ; nop ; ldtmu1
++ nop ; nop ; ldtmu0
++ mov -, vw_wait ; nop ; ldtmu1
++.else
++ mov.setf r3, PREREAD - 1
++:1
++ brr.anynz -, r:1b
++ nop ; nop ; ldtmu0
++ nop ; nop ; ldtmu1
++ sub.setf r3, r3, 1
++ # >>>
++ mov -, vw_wait
++.endif
++.endm
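++# (drains the PREREAD outstanding request pairs - in C terms:
++#    for (i = 0; i != PREREAD; ++i) { (void)tmu0_read(); (void)tmu1_read(); }
++#  then waits for the VDW; tmu0_read/tmu1_read are hypothetical shorthand for
++#  the ldtmu0/ldtmu1 signals)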
++
++# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair)
++# All qpus start at the beginning and after that (group - 1) must have finished
++# before (group) can start
++#
++# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain
++# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important -
++# we lock up otherwise)
++#
++# There is some, currently ill-defined, potential lockup if we have the VDW
++# active whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDW pipe too ??
++#
++# The code stalled when I had many waiters on a single sem so we have a
++# "ripple" of srels to restart. Unsure why (it may have been a bug), but this
++# works and we currently have both the memory & sems to support it.
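++#
++# A rough C model of one QPU's path through the chain (illustrative only;
++# sem_acquire/sem_release are stand-ins for sacq/srel):
++#
++#   void sync_qpu(int qpu, int quads) {
++#       if (qpu % 4 == 0) {                          // quad leader
++#           for (int i = 0; i != 3; ++i)
++#               sem_acquire(qpu);                    // wait for the other 3 in our quad
++#           sem_acquire(12 + qpu / 4);               // wait for the previous group
++#           sem_release(qpu + 1);                    // start the ripple
++#           sem_release(12 + (qpu / 4 + 1) % quads); // and release the next group
++#       } else {
++#           sem_release(qpu - qpu % 4);              // tell our leader we are here
++#           sem_acquire(qpu);                        // wait for the ripple
++#           if ((qpu + 1) % 4 != 0)
++#               sem_release(qpu + 1);                // pass it on within the quad
++#       }
++#   }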
++.macro m_sync_q, n_qpu, n_quads
++# Do not generate code for qpu >= quads * 4 - fns should never be called
++.if n_qpu < n_quads * 4
++ mov ra_link, unif # Can only branch to an a reg (not r0)
++ mov -, vw_wait # [ra_link delay]
++
++.set n_sem_sync, n_qpu - (n_qpu % 4)
++.set n_sem_in, n_qpu
++.set n_sem_out, n_qpu + 1
++
++.if n_qpu % 4 == 0
++
++.set n_sem_quad_in, 12 + n_qpu / 4
++.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads)
++
++ sacq -, n_sem_sync
++ sacq -, n_sem_sync
++ sacq -, n_sem_sync
++ bra -, ra_link
++ sacq -, n_sem_quad_in
++ srel -, n_sem_out
++ srel -, n_sem_quad_out
++
++.else
++ bra -, ra_link
++ srel -, n_sem_sync
++ sacq -, n_sem_in
++.if n_sem_out % 4 != 0
++ srel -, n_sem_out
++.else
++ nop
++.endif
++.endif
++.endif
++.endm
++
++.set v_quads8, N_QPU_8 / 4
++
++::mc_sync_q0
++ m_sync_q 0, v_quads8
++::mc_sync_q1
++ m_sync_q 1, v_quads8
++::mc_sync_q2
++ m_sync_q 2, v_quads8
++::mc_sync_q3
++ m_sync_q 3, v_quads8
++::mc_sync_q4
++ m_sync_q 4, v_quads8
++::mc_sync_q5
++ m_sync_q 5, v_quads8
++::mc_sync_q6
++ m_sync_q 6, v_quads8
++::mc_sync_q7
++ m_sync_q 7, v_quads8
++::mc_sync_q8
++ m_sync_q 8, v_quads8
++::mc_sync_q9
++ m_sync_q 9, v_quads8
++::mc_sync_q10
++ m_sync_q 10, v_quads8
++::mc_sync_q11
++ m_sync_q 11, v_quads8
++
++# mc_exit()
++# Chroma & Luma the same now
++
++.macro m_exit_qn
++ m_exit_drain
++ nop ; nop ; thrend
++ nop
++ nop
++# >>> thrend <<<
++.endm
++
++::mc_exit_c_qn
++::mc_exit_y_qn
++ m_exit_qn
++
++
++
++# mc_interrupt_exit12()
++
++.macro m_exit_q0
++ m_exit_drain
++ sacq -, 12
++ nop ; nop ; thrend
++ mov interrupt, 1
++ nop
++# >>> thrend <<<
++.endm
++
++::mc_exit_c_q0
++::mc_exit_y_q0
++ m_exit_q0
++
++# LUMA CODE
++
++# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
++# For P frames we make the second x,y coordinates offset by +8
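++# (hypothetically, on the CPU side that amounts to: src2.x = src1.x + 8;
++#  src2.y = src1.y; so the two TMUs then fetch the two 8-pel halves of a
++#  16-wide P block)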
++
++
++################################################################################
++# mc_setup
++#
++# typedef struct qpu_mc_pred_y_s_s {
++# qpu_mc_src_t next_src1;
++# qpu_mc_src_t next_src2;
++# uint16_t pic_h;
++# uint16_t pic_w;
++# uint32_t stride2;
++# uint32_t stride1;
++# uint32_t wdenom;
++# uint32_t next_fn;
++# } qpu_mc_pred_y_s_t;
++
++.macro m_setup_y, v_bit_depth
++
++# Cannot use mul24 on x as x might be -ve, so must use shift
++.if v_bit_depth <= 8
++.set v_x_shift, 0
++.set v_pmask, 0xff
++.set v_blk_height, Y_BLK_HEIGHT_8
++.else
++.set v_x_shift, 1
++.set v_pmask, 0xffff
++.set v_blk_height, Y_BLK_HEIGHT_16
++.endif
++
++
++ # Need to save these because we need to know the frame dimensions before computing texture coordinates
++ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y
++ mov ra9, unif # ref_y_base
++ mov ra1, unif # x2_y2
++ mov ra11, unif # ref_y2_base
++
++# load constants
++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++ shl rb_ef, r0, i_shift30
++
++
++ mov ra_kff100100, 0xff100100
++ mov rb_pmask, v_pmask
++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++
++# Compute part of VPM to use
++
++# Read image dimensions
++ mov ra3, unif # width_height
++ mov rb_xpitch, unif # stride2
++.if v_x_shift == 0
++ sub rb_max_x, ra3.16b, 1
++.else
++ sub r0, ra3.16b, 1
++ shl rb_max_x, r0, v_x_shift
++.endif
++ sub rb_max_y, ra3.16a, 1
++ mov rb_pitch, unif # stride1
++
++# get destination pitch
++ mov r1, vdw_setup_1(0)
++ or rb_dma1_base, r1, rb_pitch
++
++# Compute base address for first and second access
++ mov r3, elem_num
++ add r0, ra0.16b, r3 # Load x + elem_num
++.if v_x_shift != 0
++ shl r0, r0, v_x_shift
++.endif
++ max r0, r0, 0
++ min r0, r0, rb_max_x
++ shl ra_xshift_next, r0, 3 # Compute shifts
++
++# X is byte offset - we can only load words - mask
++
++ and r0, r0, -4 ; v8subs r2, r2, r2
++ sub r2, r2, rb_pitch
++ and r1, r0, r2
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 # Add stripe offsets
++ add ra_base, ra9, r0
++
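++# (the and/xor/mul24 sequence above is sand-format addressing - in C, assuming
++#  rb_pitch is a power of 2:
++#    off = (x & (pitch - 1)) + (x & ~(pitch - 1)) * xpitch;
++#  i.e. the offset within this stripe plus whole stripes rescaled by xpitch;
++#  the same pattern recurs for the second base below)
++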
++ # r3 still contains elem_num
++ add r0, ra1.16b, r3 # Load x
++.if v_x_shift != 0
++ shl r0, r0, v_x_shift
++.endif
++ max r0, r0, 0
++ min r0, r0, rb_max_x
++ shl rb_xshift2_next, r0, 3 # Compute shifts
++
++ # r2 still contains mask
++ and r0, r0, -4
++ and r1, r0, r2
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 # Add stripe offsets
++ add ra_base2, ra11, r0
++
++# Do preloads
++ nop ; mov r0, ra0.16a # ; r0 = y
++ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2
++
++:1
++ sub.setf r3, r3, 1
++ max r1, r0, 0
++ min r1, r1, rb_max_y
++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++ add t0s, ra_base, r1 ; mov ra_y, r0
++
++ max r1, r2, 0
++ brr.anynz -, r:1b
++ min r1, r1, rb_max_y
++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++ add t1s, ra_base2, r1 ; mov ra_y2, r2
++# >>> .anynz 1b
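++# (the preload loop in C terms:
++#    for (i = 0; i != PREREAD; ++i) {
++#        tmu0_request(ra_base  + clamp(y++,  0, max_y) * pitch);
++#        tmu1_request(ra_base2 + clamp(y2++, 0, max_y) * pitch);
++#    }
++#  tmu0_request/tmu1_request being shorthand for the t0s/t1s writes)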
++
++ add rb_wt_den_p15, unif, 23 - v_bit_depth # weight denom
++
++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
++
++ mov ra_link, unif # Next fn
++
++# touch vertical context to keep simulator happy
++ mov ra8, 0 ; mov rb8, 0
++ bra -, ra_link
++ mov ra9, 0 ; mov rb9, 0
++ mov ra10, 0 ; mov rb10, 0
++ mov ra11, 0 ; mov rb11, 0
++# >>> ra_link
++.endm
++
++::mc_setup_y_q0
++ m_setup_q0
++::mc_setup_y_qn
++ m_setup_y 8
++
++################################################################################
++#
++# Start of per-block setup code
++# P and B blocks share the same setup code to save on Icache space
++
++# luma_setup_delay3 done in delay slots of branch that got us here
++
++# get base addresses and per-channel shifts for *next* invocation
++# per-channel shifts were calculated on the *previous* invocation
++
++# 1st 3 instructions of per_block-setup in branch delay
++#
++# typedef struct qpu_mc_pred_y_p_s {
++# qpu_mc_src_t next_src1;
++# qpu_mc_src_t next_src2;
++# uint16_t h;
++# uint16_t w;
++# uint32_t mymx21;
++# uint32_t wo1;
++# uint32_t wo2;
++# uint32_t dst_addr;
++# uint32_t next_fn;
++# } qpu_mc_pred_y_p_t;
++#
++
++.macro m_luma_setup, v_bit_depth
++# Hack - QASM may well have label pasting but I have no idea how...
++.if v_bit_depth == 8
++ brr ra_link, r:per_block_setup_8
++.elif v_bit_depth == 10
++ brr ra_link, r:per_block_setup_10
++.endif
++ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack??
++ add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0
++ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++.endm
++
++.macro m_per_block_setup, v_bit_depth
++
++.if v_bit_depth <= 8
++.set v_x_shift, 0
++.set v_x_mul, 1
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 7
++.set v_dma_wh_shift, i_shift16
++.else
++.set v_x_shift, 1
++.set v_x_mul, 2
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 8
++.set v_dma_wh_shift, 15
++.endif
++
++.if v_x_shift != 0
++ shl r0, r0, v_x_shift
++.endif
++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++ min r0, r0, rb_max_x
++
++ shl ra_xshift_next, r0, 3 # Compute shifts
++ and r0, r0, -4
++ sub r2, r5, rb_pitch ; mov ra_base_next, unif # src1.base
++ and r1, r0, r2 ; mov ra_y_next, ra0.16a
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y
++ add ra_base_next, ra_base_next, r0 # [ra1 delay]
++
++ add r0, ra1.16b, r3 # Load x2
++.if v_x_shift != 0
++ shl r0, r0, v_x_shift
++.endif
++ max r0, r0, r5 ; mov ra_y2_next, ra1.16a
++ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base
++ shl rb_xshift2_next, r0, 3 # Compute shifts
++ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height
++ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes
++ add rb_base2_next, rb_base2_next, r0
++
++# get width,height of block (unif load above), r1 = width * pel_size
++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width)
++ add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height
++ add rb_lcount, r0, 7
++ shl r0, r0, v_dma_h_shift
++ add r0, r0, r1 # Combine width and height of destination area
++ shl r0, r0, v_dma_wh_shift # Shift into bits 16 upwards of the vdw_setup0 register
++ add rb_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets
++
++# get filter coefficients and discard unused B frame values
++ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight
++ shl ra8, r0, 3 ; mov r3, ra_k255
++
++# Pack the 1st 4 filter coefs for H & V tightly
++# Coeffs are all abs values here as that means mul24 works (no sign extend from .8)
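++# (each 32-bit constant below is a little-endian table of one tap's |coeff|
++#  across the 4 fractional phases: e.g. 0x11283a40 holds the main tap
++#  {64, 58, 40, 17} for frac = 0..3 counting from the LSB.  The ror by
++#  ra8.8c / ra8.8d (= 8 * frac) is then, in C terms,
++#    coeff = (table >> (8 * frac)) & 0xff;
++#  a byte select per phase - a sketch, assuming frac in 0..3)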
++
++ mov r1,0x00010100 # -ve [ra8 delay]
++ ror ra2.8a, r1, ra8.8d
++ ror ra0.8a, r1, ra8.8c
++
++ mov r1, 0x01040400
++ ror ra2.8b, r1, ra8.8d
++ ror ra0.8b, r1, ra8.8c
++
++ mov r1,0x050b0a00 # -ve
++ ror ra2.8c, r1, ra8.8d
++ ror ra0.8c, r1, ra8.8c
++
++ mov r1,0x11283a40
++ ror ra2.8d, r1, ra8.8d
++ ror ra0.8d, r1, ra8.8c
++
++# In the 2nd vertical half we use b registers due to using a-side fifo regs
++
++ mov r1,0x3a281100
++ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
++ ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3
++
++ mov r1,0x0a0b0500 # -ve
++ ror r0, r1, ra8.8d
++ ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3
++
++ mov r1,0x04040100
++ ror r0, r1, ra8.8d
++ ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3
++
++ mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif # ; Destination address
++
++ mov r1,0x01010000 # -ve
++ ror r0, r1, ra8.8d
++
++ bra -, ra_link
++ ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3
++
++ shl r0, ra_wt_off_l0, rb_wt_den_p15 # Offset calc
++ # For B, L1 & L0 offsets should be identical so it doesn't matter which we use
++ asr rb_wt_off, r0, 9 ; mov ra_link, unif # ; link - load after we've used its previous val
++# >>> branch ra_link
++
++# r5 = 0
++# ra_wt_mul_l1 = weight L1
++# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred)
++# rb_wt_off = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb_wt_den_p15 - 9)
++# rb_wt_den_p15 = weight denom + 6 + 9
++# rb_wt_mul_l0 = weight L0
++.endm
++
++:per_block_setup_8
++ m_per_block_setup 8
++
++
++
++################################################################################
++# mc_filter(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
++# In a P block, y2_x2 should be y_x+8
++# At this point we have already issued two pairs of texture requests for the current block
++
++.macro m_filter_y_pxx, v_bit_depth
++ m_luma_setup v_bit_depth
++
++ shl ra_wt_mul_l0, ra_wt_mul_l0, 1
++
++# r5 = 0 (loop count)
++
++:1
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++
++# N.B. Whilst y == y2 as far as this loop is concerned, we will start
++# the grab for the next block before we finish with this block, and that
++# might be a B block where y != y2, so we must do full processing on both y and y2
++
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++ shr r0, r4, ra_xshift ; mov r3, rb_pitch
++
++ max r2, ra_y, 0 # y
++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
++
++ max r2, ra_y2, 0
++ min r2, r2, rb_max_y ; mov ra7, ra8
++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8min masks out all but bottom byte
++
++ add.setf -, rb_ef, rb_ef ; mov ra8, ra9
++
++# apply horizontal filter
++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++
++ sub.setf -, r5, 8 ; mov ra9, ra10
++ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
++ brr.anyn -, r:1b
++ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
++ mov ra10, ra11 ; mov rb10, rb11
++ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
++ # >>> .anyn 1b
++
++ # apply vertical filter and write to VPM
++ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
++ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
++ add r1, r1, r0 ; mul24 r0, ra8, rb4
++ add r1, r1, r0 ; mul24 r0, ra9, rb5
++ sub r1, r1, r0 ; mul24 r0, ra10, rb6
++ add r1, r1, r0 ; mul24 r0, ra11, rb7
++ sub r1, r1, r0
++# At this point r1 is a 22-bit signed quantity: 8 (original sample),
++# +6, +6 (each pass), +1 (the passes can overflow slightly), +1 (sign)
++# The top 8 bits have rubbish in them as mul24 is unsigned
++# The low 6 bits need discarding before weighting
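++# (C equivalent of the mul24/asr pair below: r1 = ((int32_t)(r1 << 8)) >> 14;
++#  the *256 lifts the 22-bit value's sign bit to bit 29 so the asr both
++#  sign extends and drops the low 6 bits)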
++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish
++ asr r1, r1, 14
++ nop ; mul24 r1, r1, ra_wt_mul_l0
++ add r1, r1, rb_wt_off ; mov r3, ra_blk_height # ; r3 = block height for outside loop
++
++ shl r1, r1, 8 ; v8subs r0, ra_height, r3
++ brr.anyn -, r:1b
++ asr r1, r1, rb_wt_den_p15
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++
++# >>> branch.anyn yloop
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height (currently always 16)
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc rb_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ add rb_lcount, rb_lcount, r0
++ brr -, r:1b
++ add rb_dma0, rb_dma0, r1
++ add rb_dest, rb_dest, r2
++ mov vw_setup, rb_vpm_init # Reset our VPM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_y_pxx
++ m_filter_y_pxx 8
++
++
++################################################################################
++
++# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
++# In a P block, only the first half of the coefficients contains used information.
++# At this point we have already issued two pairs of texture requests for the current block
++# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time?
++# Or possibly by taking advantage of symmetry?
++
++.macro m_filter_y_bxx, v_bit_depth
++ m_luma_setup v_bit_depth
++
++:1
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++ shr r0, r4, ra_xshift ; mov r3, rb_pitch
++
++ max r2, ra_y, 0 # y
++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
++
++ max r2, ra_y2, 0
++ min r2, r2, rb_max_y ; mov ra7, ra8
++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8min masks out all but bottom byte
++
++ add.setf -, rb_ef, rb_ef ; mov ra8, ra9
++
++# apply horizontal filter
++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++
++ sub.setf -, r5, 8 ; mov ra9, ra10
++ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
++ brr.anyn -, r:1b
++ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
++ mov ra10, ra11 ; mov rb10, rb11
++ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
++ # >>> .anyn 1b
++
++ # apply vertical filter and write to VPM
++ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
++ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
++ add r1, r1, r0 ; mul24 r0, ra8, rb4
++ add r1, r1, r0 ; mul24 r0, ra9, rb5
++ sub r1, r1, r0 ; mul24 r0, ra10, rb6
++ add r1, r1, r0 ; mul24 r0, ra11, rb7
++ sub r1, r1, r0 ; mov r2, rb_wt_off
++# As with P-pred r1 is a 22-bit signed quantity in 32-bits
++# Top 8 bits are bad - low 6 bits should be discarded
++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
++
++ asr r1, r1, 14
++ nop ; mul24 r0, r1, ra_wt_mul_l0
++ add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0
++
++ add r1, r1, r0 ; mov r3, ra_blk_height
++ shl r1, r1, 8 ; v8subs r0, ra_height, r3
++ brr.anyn -, r:1b
++ asr r1, r1, rb_wt_den_p15
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++# >>> branch.anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height (currently always 16)
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc rb_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ add rb_lcount, rb_lcount, r0
++ brr -, r:1b
++ add rb_dma0, rb_dma0, r1
++ add rb_dest, rb_dest, r2
++ mov vw_setup, rb_vpm_init # Reset our VPM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_y_bxx
++ m_filter_y_bxx 8
++
++################################################################################
++#
++# typedef struct qpu_mc_pred_y_p00_s {
++# qpu_mc_src_t next_src1;
++# uint16_t h;
++# uint16_t w;
++# uint32_t wo1;
++# uint32_t dst_addr;
++# uint32_t next_fn;
++# } qpu_mc_pred_y_p00_t;
++
++.macro m_filter_y_p00, v_bit_depth
++
++.if v_bit_depth <= 8
++.set v_x_shift, 0
++.set v_x_mul, 1
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 7
++.set v_dma_wh_shift, i_shift16
++.else
++.set v_x_shift, 1
++.set v_x_mul, 2
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 8
++.set v_dma_wh_shift, 15
++.endif
++
++ mov ra0, unif ; mov r3, elem_num # y_x
++ mov ra_xshift, ra_xshift_next # [ra0 delay]
++ add r0, ra0.16b, r3
++.if v_x_shift != 0
++ shl r0, r0, v_x_shift
++.endif
++
++ max r0, r0, 0
++ min r0, r0, rb_max_x
++
++ shl ra_xshift_next, r0, 3 # Compute shifts
++ and r0, r0, -4 ; v8subs r2, r2, r2
++ sub r2, r2, rb_pitch ; mov ra_base_next, unif # src1.base
++ and r1, r0, r2 ; mov ra_y_next, ra0.16a
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov ra_width_height, unif # Add stripe offsets ; width_height
++ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write
++
++# get width,height of block (unif load above)
++# Compute vdw_setup1(dst_pitch-width)
++ shl r1, ra_width, v_x_shift
++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
++ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
++ add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif # Combine width and height of destination area ; weight_offset
++ shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif # Shift into bits 16 upwards of the vdw_setup0 register ; dest addr
++ add rb_dma0, r0, rb_dma0_base
++
++ shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 # Offset calc ; r5 = 0
++ # For B, L1 & L0 offsets should be identical so it doesn't matter which we use
++ asr rb_wt_off, r0, 1 ; mov ra_link, unif # ; link
++
++:1
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
++ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
++ shr r0, r4, ra_xshift ; mov r3, rb_pitch
++
++ max r2, ra_y, 0 # y
++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
++
++ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
++ shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height
++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++
++ brr.anyn -, r:1b
++ asr r1, r1, rb_wt_den_p15
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++# >>> branch.anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height (currently always 16)
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc rb_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ add rb_lcount, rb_lcount, r0
++ brr -, r:1b
++ add rb_dma0, rb_dma0, r1
++ add rb_dest, rb_dest, r2
++ mov vw_setup, rb_vpm_init # Reset our VPM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_y_p00
++ m_filter_y_p00 8
++
++################################################################################
++
++.macro m_filter_y_b00, v_bit_depth
++# luma setup does a fair bit more than we need, calculating filter coeffs
++# that we will never use, but reusing it saves I-cache (and is simpler)
++ m_luma_setup v_bit_depth
++
++# Fix up vals that were expecting a filter (somewhat icky)
++ mov r0, 7
++ sub rb_i_tmu, rb_i_tmu, r0
++ sub rb_lcount, rb_lcount, r0
++ mov r0, 8 ; mov r1, ra_wt_off_mul_l0
++ shl rb_wt_off, rb_wt_off, r0
++ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
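++
++# (net effect per pel, sketched in C:
++#    out = clip((((p0 * wt_l0 + p1 * wt_l1) << (22 - bit_depth)) + wt_off) >> wt_den_p15);
++#  plain weighted bi-pred, no filtering, as both fractions are 0 here)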
++
++:1
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++ shr r0, r4, ra_xshift ; mov r3, rb_pitch
++
++ max r2, ra_y, 0 # y
++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
++
++ max r2, ra_y2, 0
++ min r2, r2, rb_max_y
++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8min masks out all but bottom byte
++ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
++
++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
++ add r1, r0, r1
++ shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height
++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++
++ brr.anyn -, r:1b
++ asr r1, r1, rb_wt_den_p15
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++# >>> branch.anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height (currently always 16)
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc rb_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ add rb_lcount, rb_lcount, r0
++ brr -, r:1b
++ add rb_dma0, rb_dma0, r1
++ add rb_dest, rb_dest, r2
++ mov vw_setup, rb_vpm_init # Reset our VPM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_y_b00
++ m_filter_y_b00 8
++
++################################################################################
++################################################################################
++# 10 BIT
++
++::mc_setup_c10_q0
++ m_setup_q0
++::mc_setup_c10_qn
++ m_setup_c 10
++
++::mc_filter_c10_p
++ m_filter_c_p 0, 10
++
++::mc_filter_c10_p_l1
++ m_filter_c_p 1, 10
++
++
++::mc_filter_c10_b
++ m_filter_c_b 10
++
++# Even if these fns are the same as for other bit depths we want our own copy
++# to keep the code we are using in a single lump and so avoid (direct-mapped)
++# cache thrashing
++.set v_quads10, N_QPU_16 / 4
++
++::mc_sync10_q0
++ m_sync_q 0, v_quads10
++::mc_sync10_q1
++ m_sync_q 1, v_quads10
++::mc_sync10_q2
++ m_sync_q 2, v_quads10
++::mc_sync10_q3
++ m_sync_q 3, v_quads10
++::mc_sync10_q4
++ m_sync_q 4, v_quads10
++::mc_sync10_q5
++ m_sync_q 5, v_quads10
++::mc_sync10_q6
++ m_sync_q 6, v_quads10
++::mc_sync10_q7
++ m_sync_q 7, v_quads10
++::mc_sync10_q8
++ m_sync_q 8, v_quads10
++::mc_sync10_q9
++ m_sync_q 9, v_quads10
++::mc_sync10_q10
++ m_sync_q 10, v_quads10
++::mc_sync10_q11
++ m_sync_q 11, v_quads10
++
++::mc_exit_y10_q0
++::mc_exit_c10_q0
++ m_exit_q0
++
++::mc_exit_y10_qn
++::mc_exit_c10_qn
++ m_exit_qn
++
++::mc_setup_y10_q0
++ m_setup_q0
++::mc_setup_y10_qn
++ m_setup_y 10
++
++:per_block_setup_10
++ m_per_block_setup 10
++
++::mc_filter_y10_pxx
++ m_filter_y_pxx 10
++
++::mc_filter_y10_p00
++ m_filter_y_p00 10
++
++::mc_filter_y10_bxx
++ m_filter_y_bxx 10
++
++::mc_filter_y10_b00
++ m_filter_y_b00 10
++
++
++
++::mc_end
++# Do not add code here because mc_end must appear after all other code.
+diff --git a/libavcodec/rpi_hevc_shader_cmd.h b/libavcodec/rpi_hevc_shader_cmd.h
+new file mode 100644
+index 0000000000..9f8983da52
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_cmd.h
+@@ -0,0 +1,128 @@
++#ifndef RPI_SHADER_CMD_H
++#define RPI_SHADER_CMD_H
++
++#pragma pack(push, 4)
++
++#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y
++// Mixing the two just causes confusion and a lot of warnings....
++typedef const uint8_t * qpu_mc_src_addr_t;
++typedef uint8_t * qpu_mc_dst_addr_t;
++#else
++typedef uint32_t qpu_mc_src_addr_t;
++typedef uint32_t qpu_mc_dst_addr_t;
++#endif
++
++typedef struct qpu_mc_src_s
++{
++ int16_t y;
++ int16_t x;
++ qpu_mc_src_addr_t base;
++} qpu_mc_src_t;
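++// N.B. y deliberately sits in the low halfword: the QPU code reads y as .16a
++// (low) and x as .16b (high) from a single uniform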
++
++
++typedef struct qpu_mc_pred_c_p_s {
++ qpu_mc_src_t next_src;
++ uint16_t h;
++ uint16_t w;
++ uint32_t coeffs_x;
++ uint32_t coeffs_y;
++ uint32_t wo_u;
++ uint32_t wo_v;
++ qpu_mc_dst_addr_t dst_addr_c;
++ uint32_t next_fn;
++} qpu_mc_pred_c_p_t;
++
++typedef struct qpu_mc_pred_c_b_s {
++ qpu_mc_src_t next_src1;
++ uint16_t h;
++ uint16_t w;
++ uint32_t coeffs_x1;
++ uint32_t coeffs_y1;
++ uint32_t weight_u1;
++ uint32_t weight_v1;
++ qpu_mc_src_t next_src2;
++ uint32_t coeffs_x2;
++ uint32_t coeffs_y2;
++ uint32_t wo_u2;
++ uint32_t wo_v2;
++ qpu_mc_dst_addr_t dst_addr_c;
++ uint32_t next_fn;
++} qpu_mc_pred_c_b_t;
++
++typedef struct qpu_mc_pred_c_s_s {
++ qpu_mc_src_t next_src1;
++ uint32_t pic_cw; // C Width (== Y width / 2)
++ uint32_t pic_ch; // C Height (== Y Height / 2)
++ uint32_t stride2;
++ uint32_t stride1;
++ uint32_t wdenom;
++ qpu_mc_src_t next_src2;
++ uint32_t next_fn;
++} qpu_mc_pred_c_s_t;
++
++typedef struct qpu_mc_pred_c_s {
++ union {
++ qpu_mc_pred_c_p_t p;
++ qpu_mc_pred_c_b_t b;
++ qpu_mc_pred_c_s_t s;
++ };
++} qpu_mc_pred_c_t;
++
++
++typedef struct qpu_mc_pred_y_p_s {
++ qpu_mc_src_t next_src1;
++ qpu_mc_src_t next_src2;
++ uint16_t h;
++ uint16_t w;
++ uint32_t mymx21;
++ uint32_t wo1;
++ uint32_t wo2;
++ qpu_mc_dst_addr_t dst_addr;
++ uint32_t next_fn;
++} qpu_mc_pred_y_p_t;
++
++typedef struct qpu_mc_pred_y_p00_s {
++ qpu_mc_src_t next_src1;
++ uint16_t h;
++ uint16_t w;
++ uint32_t wo1;
++ qpu_mc_dst_addr_t dst_addr;
++ uint32_t next_fn;
++} qpu_mc_pred_y_p00_t;
++
++typedef struct qpu_mc_pred_y_s_s {
++ qpu_mc_src_t next_src1;
++ qpu_mc_src_t next_src2;
++ uint16_t pic_h;
++ uint16_t pic_w;
++ uint32_t stride2;
++ uint32_t stride1;
++ uint32_t wdenom;
++ uint32_t next_fn;
++} qpu_mc_pred_y_s_t;
++
++// Only useful as a structure in that it allows us to return something other than a void *
++typedef struct qpu_mc_pred_y_s {
++ union {
++ qpu_mc_pred_y_p_t p;
++ qpu_mc_pred_y_p00_t p00;
++ qpu_mc_pred_y_s_t s;
++ };
++} qpu_mc_pred_y_t;
++
++typedef union qpu_mc_pred_cmd_u {
++ qpu_mc_pred_y_t y;
++ qpu_mc_pred_c_t c;
++ uint32_t data[1];
++} qpu_mc_pred_cmd_t;
++
++#define QPU_MC_PRED_N_Y8 12
++#define QPU_MC_PRED_N_C8 12
++
++#define QPU_MC_PRED_N_Y10 12
++#define QPU_MC_PRED_N_C10 12
++
++#pragma pack(pop)
++
++#endif
++
+diff --git a/libavcodec/rpi_hevc_shader_template.c b/libavcodec/rpi_hevc_shader_template.c
+new file mode 100644
+index 0000000000..0c80cf4de0
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_template.c
+@@ -0,0 +1,62 @@
++#include "hevc.h"
++#include "rpi_hevcdec.h"
++#include "libavutil/rpi_sand_fns.h"
++#include "rpi_hevc_shader_cmd.h"
++#include "rpi_hevc_shader_template.h"
++
++typedef struct shader_track_s
++{
++ const union qpu_mc_pred_cmd_u *qpu_mc_curr;
++ const struct qpu_mc_src_s *last_l0;
++ const struct qpu_mc_src_s *last_l1;
++ uint32_t width; // pic_width * PW
++ uint32_t height;
++ uint32_t stride2;
++ uint32_t stride1;
++ uint32_t wdenom;
++} shader_track_t;
++
++static int wtoidx(const unsigned int w)
++{
++ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
++ return pel_weight[w];
++}
++
++static int fctom(const uint32_t x)
++{
++ int rv;
++ // As it happens we can take the 2nd filter term & divide it by 8
++ // (dropping fractions) to get the fractional move
++ rv = 8 - ((x >> 11) & 0xf);
++ av_assert2(rv >= 0 && rv <= 7);
++ return rv;
++}
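++// e.g. the HEVC chroma filter for fraction 3 is {-6, 46, 28, -4}: its 2nd
++// term is 46, 46 / 8 = 5 (dropping fractions), and 8 - 5 == 3 recovers the
++// fractional move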
++
++static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr)
++{
++ return (x << shl) >> shr;
++}
++
++static inline int woff_p(HEVCRpiContext *const s, int32_t x)
++{
++ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8);
++}
++
++static inline int woff_b(HEVCRpiContext *const s, int32_t x)
++{
++ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8);
++}
++
++static inline int wweight(int32_t x)
++{
++ return ext(x, 16, 16);
++}
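++// (wo layout, as the helpers above assume: the weight sits in the low
++// halfword - recovered by wweight() - and (offset * 2 + 1) in the high
++// halfword, which woff_p() / woff_b() shift back down, rescaled for bit
++// depth)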
++
++
++#define PW 1
++#include "rpi_hevc_shader_template_fn.h"
++
++#undef PW
++#define PW 2
++#include "rpi_hevc_shader_template_fn.h"
++
+diff --git a/libavcodec/rpi_hevc_shader_template.h b/libavcodec/rpi_hevc_shader_template.h
+new file mode 100644
+index 0000000000..304d73ea4a
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_template.h
+@@ -0,0 +1,22 @@
++#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H
++#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H
++
++struct HEVCRpiContext;
++struct HEVCRpiInterPredEnv;
++
++void ff_hevc_rpi_shader_c8(struct HEVCRpiContext *const s,
++ const struct HEVCRpiInterPredEnv *const ipe_y,
++ const struct HEVCRpiInterPredEnv *const ipe_c);
++
++void ff_hevc_rpi_shader_c16(struct HEVCRpiContext *const s,
++ const struct HEVCRpiInterPredEnv *const ipe_y,
++ const struct HEVCRpiInterPredEnv *const ipe_c);
++
++void rpi_sand_dump8(const char * const name,
++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
++
++void rpi_sand_dump16(const char * const name,
++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
++
++#endif
++
+diff --git a/libavcodec/rpi_hevc_shader_template_fn.h b/libavcodec/rpi_hevc_shader_template_fn.h
+new file mode 100644
+index 0000000000..b9e7c07fe3
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_template_fn.h
+@@ -0,0 +1,477 @@
++#define STRCAT(x,y) x##y
++
++#if PW == 1
++#define pixel uint8_t
++#define FUNC(f) STRCAT(f, 8)
++#elif PW == 2
++#define pixel uint16_t
++#define FUNC(f) STRCAT(f, 16)
++#else
++#error Unexpected PW
++#endif
++
++#define PATCH_STRIDE (16 * PW)
++
++static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
++{
++ for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) {
++ const pixel s = *(const pixel *)src;
++ pixel * d = (pixel *)dst;
++ for (unsigned int j = 0; j < w; j += PW) {
++ *d++ = s;
++ }
++ }
++}
++
++static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
++{
++ for (unsigned int i = 0; i != h; ++i, dst += stride) {
++ memcpy(dst, src, w);
++ }
++}
++
++static void FUNC(get_patch_y)(const shader_track_t * const st,
++ uint8_t * dst, const unsigned int dst_stride,
++ const qpu_mc_src_t *src,
++ unsigned int _w, unsigned int _h)
++{
++ int x = src->x * PW;
++ int y = src->y;
++ int w = _w * PW;
++ int h = _h;
++ int dl = 0;
++ int dr = 0;
++ int dt = 0;
++ int db = 0;
++
++ if (x < 0) {
++ if (-x >= w)
++ x = PW - w;
++ dl = -x;
++ w += x;
++ x = 0;
++ }
++ if (x + w > st->width) {
++ if (x >= st->width)
++ x = st->width - PW;
++ dr = (x + w) - st->width;
++ w = st->width - x;
++ }
++
++ // Y
++ if (y < 0) {
++ if (-y >= h)
++ y = 1 - h;
++ dt = -y;
++ h += y;
++ y = 0;
++ }
++ if (y + h > st->height) {
++ if (y >= st->height)
++ y = st->height - 1;
++ db = (y + h) - st->height;
++ h = st->height - y;
++ }
++
++ dst += dl + dt * dst_stride;
++ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
++
++ // Edge dup
++ if (dl != 0)
++ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride);
++ if (dr != 0)
++ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride);
++ w += dl + dr;
++ dst -= dl;
++
++ if (dt != 0)
++ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride);
++ if (db != 0)
++ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride);
++}
++
++
++
++static void FUNC(get_patch_c)(const shader_track_t * const st,
++ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride,
++ const qpu_mc_src_t *src,
++ unsigned int _w, unsigned int _h)
++{
++ int x = src->x * PW;
++ int y = src->y;
++ int w = _w * PW;
++ int h = _h;
++ int dl = 0;
++ int dr = 0;
++ int dt = 0;
++ int db = 0;
++ const int width = st->width;
++ const int height = st->height;
++
++ if (x < 0) {
++ if (-x >= w)
++ x = PW - w;
++ dl = -x;
++ w += x;
++ x = 0;
++ }
++ if (x + w > width) {
++ if (x >= width)
++ x = width - PW;
++ dr = (x + w) - width;
++ w = width - x;
++ }
++
++ // Y
++ if (y < 0) {
++ if (-y >= h)
++ y = 1 - h;
++ dt = -y;
++ h += y;
++ y = 0;
++ }
++ if (y + h > height) {
++ if (y >= height)
++ y = height - 1;
++ db = (y + h) - height;
++ h = height - y;
++ }
++
++ dst_u += dl + dt * dst_stride;
++ dst_v += dl + dt * dst_stride;
++ FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
++
++ // Edge dup
++ if (dl != 0)
++ {
++ FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride);
++ FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride);
++ }
++ if (dr != 0)
++ {
++ FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride);
++ FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride);
++ }
++ w += dl + dr;
++ dst_u -= dl;
++ dst_v -= dl;
++
++ if (dt != 0)
++ {
++ FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride);
++ FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride);
++ }
++ if (db != 0)
++ {
++ FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride);
++ FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride);
++ }
++}
++
++// x, y, w, h in pixels
++// stride1, stride2 in bytes
++void FUNC(rpi_sand_dump)(const char * const name,
++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c)
++{
++ const int mask = stride2 == 0 ? ~0 : stride1 - 1;
++
++ printf("%s (%d,%d) %dx%d\n", name, x, y, w, h);
++
++ if (is_c) {
++ x *= 2;
++ w *= 2;
++ }
++
++ for (int i = y; i != y + h; ++i) {
++ for (int j = x; j != x + w; ++j) {
++ const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2;
++ char sep = is_c && (j & 1) == 0 ? ':' : ' ';
++#if PW == 1
++ if (j < 0 || i < 0)
++ printf("..%c", sep);
++ else
++ printf("%02x%c", *(const pixel*)p, sep);
++#else
++ if (j < 0 || i < 0)
++ printf("...%c", sep);
++ else
++ printf("%03x%c", *(const pixel*)p, sep);
++#endif
++ }
++ printf("\n");
++ }
++}
++
++
++void FUNC(ff_hevc_rpi_shader_c)(HEVCRpiContext *const s,
++ const HEVCRpiInterPredEnv *const ipe_y,
++ const HEVCRpiInterPredEnv *const ipe_c)
++{
++ for (int c_idx = 0; c_idx < 2; ++c_idx)
++ {
++ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c;
++ shader_track_t tracka[QPU_N_MAX] = {{NULL}};
++ unsigned int exit_n = 0;
++
++ if (ipe == NULL || !ipe->used) {
++ continue;
++ }
++
++ do {
++ for (unsigned int i = 0; i != ipe->n; ++i) {
++ const HEVCRpiInterPredQ * const q = ipe->q + i;
++ shader_track_t * const st = tracka + i;
++ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr;
++
++ for (;;) {
++ const uint32_t link = (cmd == q->qpu_mc_base) ? q->code_setup : ((uint32_t *)cmd)[-1];
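++                // Every command ends with a next_fn word, so the word just
++                // before the current command names the shader fn that will
++                // consume it; the first command has no predecessor, hence
++                // the code_setup special case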
++
++ if (link == q->code_setup) {
++ if (c_idx == 0) {
++ // Luma
++ const qpu_mc_pred_y_s_t *const c = &cmd->y.s;
++
++ st->height = c->pic_h;
++ st->width = c->pic_w * PW;
++ st->stride1 = c->stride1;
++ st->stride2 = c->stride2;
++ st->wdenom = c->wdenom;
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else {
++ // Chroma
++ const qpu_mc_pred_c_s_t *const c = &cmd->c.s;
++
++ st->height = c->pic_ch;
++ st->width = c->pic_cw * PW;
++ st->stride1 = c->stride1;
++ st->stride2 = c->stride2;
++ st->wdenom = c->wdenom;
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ }
++ else if (link == s->qpu.y_pxx) {
++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
++ const int w1 = FFMIN(c->w, 8);
++ const int w2 = c->w - w1;
++
++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++
++ FUNC(get_patch_y)(st,
++ patch_y1, PATCH_STRIDE,
++ st->last_l0,
++ 16, c->h + 7);
++ if (w2 > 0) {
++ FUNC(get_patch_y)(st,
++ patch_y2, PATCH_STRIDE,
++ st->last_l1,
++ 16, c->h + 7);
++ }
++
++ // wo[offset] = offset*2+1
++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
++ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
++ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1);
++ if (w2 > 0) {
++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
++ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
++ c->h, st->wdenom, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2);
++ }
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.y_bxx) {
++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
++
++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
++
++ FUNC(get_patch_y)(st,
++ patch_y1, PATCH_STRIDE,
++ st->last_l0,
++ 16, c->h + 7);
++ FUNC(get_patch_y)(st,
++ patch_y2, PATCH_STRIDE,
++ st->last_l1,
++ 16, c->h + 7);
++
++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
++ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
++ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w);
++
++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
++ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3,
++ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2),
++ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w);
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.y_p00) {
++ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00;
++
++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++
++ FUNC(get_patch_y)(st,
++ patch_y1, PATCH_STRIDE,
++ st->last_l0,
++ 16, c->h + 7);
++
++ // wo[offset] = offset*2+1
++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0](
++ (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE,
++ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w);
++
++ st->last_l0 = &c->next_src1;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.y_b00) {
++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
++
++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
++
++ av_assert0(c->w <= 16 && c->h <= 64);
++
++ FUNC(get_patch_y)(st,
++ patch_y1, PATCH_STRIDE,
++ st->last_l0,
++ 16, c->h);
++ FUNC(get_patch_y)(st,
++ patch_y2, PATCH_STRIDE,
++ st->last_l1,
++ 16, c->h);
++
++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0](
++ patch_y3, patch_y1, PATCH_STRIDE,
++ c->h, 0, 0, c->w);
++
++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0](
++ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3,
++ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2),
++ 0, woff_b(s, c->wo2), 0, 0, c->w);
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.c_pxx) {
++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
++ const int mx = fctom(c->coeffs_x);
++ const int my = fctom(c->coeffs_y);
++
++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_u3[8 * 16 * PW];
++ uint8_t patch_v3[8 * 16 * PW];
++
++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
++
++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
++
++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++
++ st->last_l0 = &c->next_src;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.c_pxx_l1) {
++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
++ const int mx = fctom(c->coeffs_x);
++ const int my = fctom(c->coeffs_y);
++
++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++ uint8_t patch_u3[8 * 16 * PW];
++ uint8_t patch_v3[8 * 16 * PW];
++
++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
++
++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
++
++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++
++ st->last_l1 = &c->next_src;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == s->qpu.c_bxx) {
++ const qpu_mc_pred_c_b_t *const c = &cmd->c.b;
++ const int mx1 = fctom(c->coeffs_x1);
++ const int my1 = fctom(c->coeffs_y1);
++ const int mx2 = fctom(c->coeffs_x2);
++ const int my2 = fctom(c->coeffs_y2);
++
++ uint8_t patch_u1[PATCH_STRIDE * 72];
++ uint8_t patch_v1[PATCH_STRIDE * 72];
++ uint8_t patch_u2[PATCH_STRIDE * 72];
++ uint8_t patch_v2[PATCH_STRIDE * 72];
++ uint8_t patch_u3[8 * 16 * PW];
++ uint8_t patch_v3[8 * 16 * PW];
++ uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE];
++ uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE];
++
++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
++ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
++
++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
++ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, mx1, my1, c->w);
++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
++ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++ c->h, mx1, my1, c->w);
++
++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
++ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4,
++ c->h, st->wdenom, c->weight_u1, wweight(c->wo_u2),
++ 0, woff_b(s, c->wo_u2), mx2, my2, c->w);
++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
++ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4,
++ c->h, st->wdenom, c->weight_v1, wweight(c->wo_v2),
++ 0, woff_b(s, c->wo_v2), mx2, my2, c->w);
++
++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++
++ st->last_l0 = &c->next_src1;
++ st->last_l1 = &c->next_src2;
++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++ }
++ else if (link == q->code_sync) {
++ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1);
++ break;
++ }
++ else if (link == q->code_exit) {
++ // We expect exit to occur without other sync
++ av_assert0(i == exit_n);
++ ++exit_n;
++ break;
++ }
++ else {
++ av_assert0(0);
++ }
++ }
++
++ st->qpu_mc_curr = cmd;
++ }
++ } while (exit_n == 0);
++ }
++}
++
++#undef FUNC
++#undef pixel
++
+diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
+new file mode 100644
+index 0000000000..a08a1d6bef
+--- /dev/null
++++ b/libavcodec/rpi_hevc_transform.s
+@@ -0,0 +1,927 @@
++# ******************************************************************************
++# Argon Design Ltd.
++# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
++#
++# Module : HEVC
++# Author : Peter de Rivaz
++# ******************************************************************************
++
++# HEVC VPU Transform
++# Transform matrix can be thought of as
++# output row vector = input row vector * transMatrix2
++#
++# The even rows of the matrix are symmetric
++# The odd rows of the matrix are antisymmetric
++#
++# So only need to compute the first half of the results, then can compute the remainder with a butterfly
++#
++# EXAMPLE
++# (a b c d) (1 2 2 1)
++# (3 4 -4 -3)
++# (5 6 6 5)
++# (7 8 -8 -7)
++#
++# x=(a c)(1 2) = 1a+5c 2a+6c
++# (5 6)
++#
++# y=(b d)(3 4) = 3b+7d 4b+8d
++# (7 8)
++#
++# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
++# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
++#
++# Final results are (u , v[::-1])
++#
++#
++# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
++# Apply the even matrix first and stop before rounding
++# Then apply the odd matrix in a full manner:
++#
++# First step is to compute partial products with the first input (16 cycles)
++# (16x1 input coefficients produce a 16x16 grid of partial products)
++# 1a 3b 5c 7d
++# 2a 4b 6c 8d
++# 2a -4b 6c -8d
++# 1a -3b 5c -7d
++#
++# Second step is to sum partial products into final position (8 cycles)
++# 1a+3b+5c+7d
++# 2a+4b+6c+8d
++# 2a-4b+6c-8d
++# 1a-3b+5c-7d
++#
++# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
++#
++# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
++#
++# For 8x8 we could compute two in parallel.
++#
++#
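++# As a rough C sketch of the butterfly (mirroring the 4-point EXAMPLE above;
++# illustrative only, not part of the build):
++#
++#   int x0 = 1*a + 5*c, x1 = 2*a + 6*c;   /* even-row products */
++#   int y0 = 3*b + 7*d, y1 = 4*b + 8*d;   /* odd-row products  */
++#   int out[4] = { x0 + y0, x1 + y1,      /* u          */
++#                  x1 - y1, x0 - y0 };    /* v reversed */
++#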
++
++# Columns are transformed first
++#
++# Store top left half of transMatrix2 in HX(32,0)
++# Store bottom left half of transMatrix2 in HX(32,32)
++#
++# For 16x16
++# HX(0:15,0) contains input data before transform
++# HY(0:15,0) contains 32bit output data after transform
++# HX(32,0) contains even rows of left half of transMatrix2
++# HX(32,32) contains odd rows of left half of transMatrix2
++# HY(48,0) contains partial products ready for summing
++#
++
++
++# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
++# num: number of 16x16 transforms to be done
++# coeffs32: address of the 32x32 transform coefficients
++# num32: number of 32x32 transforms
++# command (r5): 0 for transform, 1 for memclear16(int16_t *dst,num16),
++#               2/3/4 for the deblock entry points, 5 to run a command list
++#
++
++.equ TRANS_SHIFT, 20 - BIT_DEPTH
++.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1)
++.equ TRANS_ASL2, 16 - TRANS_SHIFT
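++
++# As a worked example (illustrative, assuming BIT_DEPTH = 8): these come to
++# TRANS_SHIFT = 12, TRANS_RND2 = 2048 and TRANS_ASL2 = 4, so the second pass
++# adds 2048 then shifts left by 4 into the top 16 bits - a net >> 12.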
++
++
++hevc_trans_16x16:
++ cmp r5,1
++ beq memclear16
++ cmp r5,2
++ beq hevc_deblock_16x16
++ cmp r5,3
++ beq hevc_uv_deblock_16x16
++ cmp r5,4
++ beq hevc_uv_deblock_16x16_with_clear
++ cmp r5,5
++ beq hevc_run_command_list
++
++ push r6-r15, lr # TODO cut down number of used registers
++ mov r14,r3 # coeffs32
++ mov r15,r4 # num32
++ mov r3, 16*2 # Stride of transMatrix2 in bytes
++ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
++
++ add r0, 16*16*2 # For 32x32 transforms we also need this matrix
++ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
++
++ # Now use r0 to describe which VRF buffer we are working in.
++ # Allows us to prefetch the next block of coefficients for efficiency.
++ mov r0,0 # This describes the location where we read our coefficients from
++ mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
++ mov r7,16*16*2 # Total block size
++ mov r8,64*16 # Value used to swap from current to next VRF location
++ vldh HX(0++,0)+r0,(r1 += r3) REP 16
++ mov r4,64 # Constant used for rounding first pass
++ mov r5,TRANS_RND2 # Constant used for rounding second pass
++
++ # At start of block r0,r1 point to the current block (that has already been loaded)
++block_loop:
++ eor r0,r8
++ add r1,r7
++ # Prefetch the next block
++ vldh HX(0++,0)+r0,(r1 += r3) REP 16
++ eor r0,r8
++ sub r1,r7
++
++ # Transform the current block
++ bl col_trans_16
++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate
++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
++ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble?
++ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position
++
++ bl col_trans_16
++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by TRANS_SHIFT, and saturate
++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
++ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag)
++
++ # Save results - note there has been a transposition during the processing so we save columns
++ vsth VX(0,32++)+r0, (r1 += r3) REP 16
++
++ # Move onto next block
++ eor r0,r8
++ add r1,r7
++
++ addcmpbgt r2,-1,0,block_loop
++
++ # Now go and do any 32x32 transforms
++ b hevc_trans_32x32
++
++ pop r6-r15, pc
++
++# r1,r2,r3 r7,r8 should be preserved
++# HX(0++,0)+r0 is the block to be transformed
++# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients
++# Use HY(48,0) for intermediate results
++# r0 can be used, but should be returned to its original value at the end
++col_trans_16:
++ add r6,r0,16 # Final value for this loop
++col_trans_16_loop:
++ # First compute partial products for a single column
++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
++ # Then sum up the results and place back
++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
++ addcmpblt r0,1,r6,col_trans_16_loop
++ sub r0,16 # put r0 back to its original value
++ b lr
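++# (In C terms each loop iteration above transforms one 16-coefficient column:
++#  out[j] = sum over k of in[k] * M[k][j] - the row-vector * matrix form
++#  described at the top of this file. A sketch, not the exact register flow.)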
++
++col_trans_odd_16:
++ add r6,r0,16 # Final value for this loop
++col_trans_odd_16_loop:
++ # First compute partial products for a single column
++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
++ # Then sum up the results and place back
++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
++ addcmpblt r0,1,r6,col_trans_odd_16_loop
++ sub r0,16 # put r0 back to its original value
++ b lr
++
++# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
++# num: number of 32x32 transforms to be done
++#
++hevc_trans_32x32:
++ mov r1,r14 # coeffs
++ mov r2,r15 # num
++
++ # Fetch odd transform matrix
++ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
++ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
++ #add r0, 16*16*2
++ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
++
++ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
++ mov r7, 16*16*2 # Total block size
++ sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned)
++ # set r8 to 32byte aligned stack pointer
++ add r8,sp,31
++ lsr r8,5
++ lsl r8,5
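++ # (in C terms: r8 = (sp + 31) & ~31)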
++ mov r9,r8 # Backup of the temporary storage
++ mov r10,r1 # Backup of the coefficient buffer
++block_loop32:
++
++ # COLUMN TRANSFORM
++ mov r4, 64 # Constant used for rounding first pass
++ mov r5, 9 # left shift used for rounding first pass
++
++ # Transform the first 16 columns
++ mov r1,r10 # Input Coefficient buffer
++ mov r8,r9 # Output temporary storage
++ bl trans32
++ # Transform the second 16 columns
++ add r8,32*16*2
++ add r1,32
++ bl trans32
++
++ # ROW TRANSFORM
++ mov r4, TRANS_RND2 # Constant used for rounding second pass
++ mov r5, TRANS_ASL2 # left shift used for rounding second pass
++
++ mov r1,r9 # Input temporary storage
++ mov r8,r10 # Output Coefficient buffer
++ bl trans32
++ # Transform the second 16 columns
++ add r8,32*16*2
++ add r1,32
++ bl trans32
++
++ add r10, 32*32*2 # move onto next block of coefficients
++ addcmpbgt r2,-1,0,block_loop32
++
++ add sp,sp,32*32*2+32 # Restore stack
++
++ pop r6-r15, pc
++
++trans32:
++ push lr
++ # We can no longer afford the VRF space to do prefetching when doing 32x32
++ # Fetch the even rows
++ vldh HX(0++,0),(r1 += r3) REP 16
++ # Fetch the odd rows
++ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
++
++ # Transform the even rows using even matrix
++ mov r0, 0 # Even rows
++ bl col_trans_16
++
++ # Now transform the odd rows using odd matrix
++ mov r0, 64*16 # Odd rows
++ bl col_trans_odd_16
++
++ # Now apply butterfly to compute the first 16 results
++ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding,
++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down (r5 is set per pass) and saturate
++ # 16bit results now in HX(48,32)
++ mov r0,r8
++ mov r6,32*2
++ vsth VX(48,32++),(r0+=r6) REP 16
++
++ # Now apply butterfly to compute the second 16 results (in reverse order)
++ vsub HY(63,0),HY(0 ,0),HY(16,0)
++ vsub HY(62,0),HY(1 ,0),HY(17,0)
++ vsub HY(61,0),HY(2 ,0),HY(18,0)
++ vsub HY(60,0),HY(3 ,0),HY(19,0)
++ vsub HY(59,0),HY(4 ,0),HY(20,0)
++ vsub HY(58,0),HY(5 ,0),HY(21,0)
++ vsub HY(57,0),HY(6 ,0),HY(22,0)
++ vsub HY(56,0),HY(7 ,0),HY(23,0)
++ vsub HY(55,0),HY(8 ,0),HY(24,0)
++ vsub HY(54,0),HY(9 ,0),HY(25,0)
++ vsub HY(53,0),HY(10,0),HY(26,0)
++ vsub HY(52,0),HY(11,0),HY(27,0)
++ vsub HY(51,0),HY(12,0),HY(28,0)
++ vsub HY(50,0),HY(13,0),HY(29,0)
++ vsub HY(49,0),HY(14,0),HY(30,0)
++ vsub HY(48,0),HY(15,0),HY(31,0)
++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding,
++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down (r5 is set per pass) and saturate
++ add r0,r8,32
++ vsth VX(48,32++),(r0+=r6) REP 16
++ pop pc
++
++memclear16:
++ # r0 is address
++ # r1 is number of 16bits values to set to 0 (may overrun past end and clear more than specified)
++ vmov HX(0++,0),0 REP 16
++ mov r2,32
++loop:
++ vsth HX(0++,0),(r0+=r2) REP 16
++ add r0,16*16*2
++ sub r1,16*16
++ cmp r1,0
++ bgt loop
++ b lr
++
++
++################################################################################
++# HEVC VPU Deblock
++#
++# Vertical edges before horizontal
++# Decision can change every 4 pixels, but only 8 pixel boundaries are deblocked
++#
++# ARM is responsible for storing beta and tc for each 4 pixels horiz and vert edge.
++# The VPU code works in units of 16x16 blocks.
++# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time).
++# One final horizontal filter is required at the end.
++# PCM is not allowed in this code.
++#
++#
++# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering)
++# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering)
++
++.set P0,63
++.set P1,62
++.set P2,61
++.set P3,60
++.set Q0,59
++.set Q1,58
++.set Q2,57
++.set Q3,56
++
++.set dp,32
++.set dq,33
++.set d,34
++.set decision,35
++.set beta,36
++.set beta2,37
++.set beta3,38
++.set ptest,39
++.set qtest,40
++.set pqtest,41
++.set thresh,42
++.set deltatest, 44
++.set deltap1, 45
++.set tc25, 46
++.set setup,47
++.set tc,48
++.set tc25,49 # NB: overrides the earlier tc25 definition above
++.set tc2, 50
++.set do_filter, 51
++.set delta, 52
++.set tc10, 53
++.set delta0, 54
++.set delta1, 55
++.set zeros, 0
++.set setup_input, 1
++.set deltaq1, 2
++
++
++
++# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image.
++# Row has num16 16x16 blocks across
++# Beta goes from 0 to 64
++# tc goes from 0 to 24
++# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number]
++# has 8 bytes per edge
++# has 16 bytes per direction
++# has 32 bytes per 16x16 block
++# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4))
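++#
++# A hedged C view of that layout (struct names invented for illustration):
++#   typedef struct { uint8_t beta[4]; uint8_t tc[4]; } edge_setup_t; /* 8 bytes per edge */
++#   typedef edge_setup_t block_setup_t[2][2]; /* [vert|horz][1st|2nd edge] = 32 bytes */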
++hevc_deblock_16x16:
++ push r6-r15, lr
++ mov r9,r4
++ mov r4,r3
++ mov r13,r2
++ mov r2,r0
++ mov r10,r0
++ subscale4 r0,r1
++ mov r8,63
++ mov r6,-3
++ vmov H(zeros,0),0
++# r7 is number of blocks still to load
++# r0 is location of current block - 4 * stride
++# r1 is stride
++# r2 is location of current block
++# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical)
++# r4 is setup
++# r5 is for temporary calculations
++# r8 holds 63
++# r6 holds -3
++# r9 holds the number of 16 high rows to process
++# r10 holds the original img base
++# r11 returns 0 if no filtering was done on the edge
++# r12 saves a copy of this
++# r13 is copy of width
++
++process_row:
++ # First iteration does not do horizontal filtering on previous
++ mov r7, r13
++ mov r3,0
++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block
++ vldb H(16++,16)+r3,(r2 += r1) REP 16
++ vldb H(setup_input,0), (r4) # We may wish to prefetch these
++ vstb H(zeros,0),(r4)
++ bl vert_filter
++ add r3,8
++ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
++ bl vert_filter
++ sub r3,8
++ b start_deblock_loop
++deblock_loop:
++ # Middle iterations do vertical on current block and horizontal on preceding
++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block
++ vldb H(16++,16)+r3,(r2 += r1) REP 16
++ vldb H(setup_input,0), (r4)
++ vstb H(zeros,0),(r4)
++ bl vert_filter
++ add r3,8
++ vadd H(setup_input,0),H(setup_input,8),0
++ bl vert_filter
++ sub r3,8
++ vldb H(setup_input,0), -16(r4)
++ vstb H(zeros,0),-16(r4)
++ bl horz_filter
++ mov r12,r11
++ add r3,8*64
++ vadd H(setup_input,0),H(setup_input,8),0
++ bl horz_filter
++ sub r3,8*64
++ addcmpbeq r12,0,0,skip_save_top
++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
++skip_save_top:
++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
++start_deblock_loop:
++ # move onto next 16x16 (could do this with circular buffer support instead)
++ add r3,16
++ and r3,r8
++ add r4,32
++ # Perform loop counter operations (may work with an addcmpbgt as well?)
++ add r0,16
++ add r2,16
++ sub r7,1
++ cmp r7,0 # Are there still more blocks to load
++ bgt deblock_loop
++
++ # Final iteration needs to just do horizontal filtering
++ vldb H(setup_input,0), -16(r4)
++ vstb H(zeros,0),-16(r4)
++ bl horz_filter
++ mov r12,r11
++ add r3,8*64
++ vadd H(setup_input,0),H(setup_input,8),0
++ bl horz_filter
++ sub r3,64*8
++ addcmpbeq r12,0,0,skip_save_top2
++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
++skip_save_top2:
++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
++
++# Now look to see if we should do another row
++ sub r9,1
++ cmp r9,0
++ bgt start_again
++ pop r6-r15, pc
++start_again:
++ # Need to sort out r0,r2 to point to next row down
++ addscale16 r10,r1
++ mov r2,r10
++ subscale4 r0,r2,r1
++ b process_row
++
++
++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
++# So that we can reuse the code, we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step when placing them back into the correct locations
++
++vert_filter:
++ push lr
++
++ vmov HX(P3,0), V(16,12)+r3
++ vmov HX(P2,0), V(16,13)+r3
++ vmov HX(P1,0), V(16,14)+r3
++ vmov HX(P0,0), V(16,15)+r3
++ vmov HX(Q0,0), V(16,16)+r3
++ vmov HX(Q1,0), V(16,17)+r3
++ vmov HX(Q2,0), V(16,18)+r3
++ vmov HX(Q3,0), V(16,19)+r3
++
++ bl do_luma_filter
++
++ vadds V(16,13)+r3, HX(P2,0), 0
++ vadds V(16,14)+r3, HX(P1,0), 0
++ vadds V(16,15)+r3, HX(P0,0), 0
++ # P3 and Q3 never change so don't bother saving back
++ vadds V(16,16)+r3, HX(Q0,0), 0
++ vadds V(16,17)+r3, HX(Q1,0), 0
++ vadds V(16,18)+r3, HX(Q2,0), 0
++
++ pop pc
++
++# Filter edge at H(16,0)+r3
++horz_filter:
++ push lr
++
++ vmov HX(P3,0), H(12,0)+r3
++ vmov HX(P2,0), H(13,0)+r3
++ vmov HX(P1,0), H(14,0)+r3
++ vmov HX(P0,0), H(15,0)+r3
++ vmov HX(Q0,0), H(16,0)+r3
++ vmov HX(Q1,0), H(17,0)+r3
++ vmov HX(Q2,0), H(18,0)+r3
++ vmov HX(Q3,0), H(19,0)+r3
++
++ bl do_luma_filter
++
++ vadds H(13,0)+r3, HX(P2,0), 0
++ vadds H(14,0)+r3, HX(P1,0), 0
++ vadds H(15,0)+r3, HX(P0,0), 0
++ # P3 and Q3 never change so don't bother saving back
++ vadds H(16,0)+r3, HX(Q0,0), 0
++ vadds H(17,0)+r3, HX(Q1,0), 0
++ vadds H(18,0)+r3, HX(Q2,0), 0
++
++ pop pc
++
++# r4 points to array of beta/tc for each 4 length edge
++do_luma_filter:
++ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8
++ valtl HX(beta,0),H(setup,0),H(setup,0)
++ valtu HX(tc,0),H(setup,0),H(setup,0)
++ vmul HX(tc25,0), HX(tc,0), 5
++ vadd HX(tc25,0),HX(tc25,0), 1
++ vasr HX(tc25,0), HX(tc25,0), 1
++
++ # Compute decision
++ vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1
++ vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1
++ vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0
++ vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0
++
++ vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1
++ vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1
++ vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0
++ vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0
++
++ vadd HX(d,0), HX(dp,0), HX(dq,0)
++ vasr HX(beta2,0),HX(beta,0),2
++ vasr HX(beta3,0),HX(beta,0),3
++
++ # Compute flags that are negative if all conditions pass
++ vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC
++ vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC
++ vsub HX(decision,0), HX(decision,0), HX(beta3,0) SETF
++
++ vdist HX(decision,0), HX(P0,0), HX(Q0,0) IFN
++ vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF
++ vadd HX(decision,0), HX(d,0), HX(d,0) IFN
++ vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF
++ vmov HX(decision,0), 1 IFNN
++ vadd H(decision,0),H(decision,3),0 IFN
++ vadd H(decision,16),H(decision,19),0 IFN
++ vmov -,HX(decision,0) SETF # N marks strong filter
++ vmov HX(decision,0), 1 IFNN # NN marks normal filter
++
++ vadd HX(do_filter,0), HX(d,3), HX(d,0)
++ vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter
++ vmov HX(decision,0),0 IFNN # Z marks no filter
++
++ # Expand out decision (currently valid once every 4 pixels) 0...1...2...3
++ # First extract out even terms
++ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0.1.2.3
++ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0123
++ # Now expand back
++ valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233
++ valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333
++
++ # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering
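++ #
++ # Roughly, in C (a sketch of the standard HEVC luma decision the flags above
++ # implement; the pairing of 4-pixel segments is simplified here):
++ #   dp = abs(P2 - 2*P1 + P0);  dq = abs(Q2 - 2*Q1 + Q0);  d = dp + dq;
++ #   if (d0 + d3 >= beta)                          decision = 0;  /* none */
++ #   else if (abs(P3-P0) + abs(Q3-Q0) < (beta>>3) &&
++ #            abs(P0-Q0) < (5*tc + 1)/2 &&
++ #            2*d < (beta>>2))                      decision = -1; /* strong */
++ #   else                                           decision = 1;  /* normal */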
++
++ # Do a quick check to see if there is anything to do
++ mov r11, 0 # Signal no filtering
++ vmov -,1 IFNZ SUMS r5
++ cmp r5,0
++ beq filtering_done
++ mov r11, 1 # Signal some filtering
++ # And whether there is any strong filtering
++ vmov -,1 IFN SUMS r5
++ cmp r5,0
++ beq normal_filtering
++
++ ##############################################################################
++ # Strong filtering - could maybe add a fast case if all flags have the same sign? (especially if all are disabled!)
++ vshl HX(tc2,0), HX(tc,0), 1 # Note that in normal filtering tc2 is tc/2, while here it is tc*2
++
++ # Take a copy of the original pixels for use in decision calculation
++ vmov HX(P0,32),HX(P0,0)
++ vmov HX(Q0,32),HX(Q0,0)
++ vmov HX(P1,32),HX(P1,0)
++ vmov HX(Q1,32),HX(Q1,0)
++ vmov HX(P2,32),HX(P2,0)
++ vmov HX(Q2,32),HX(Q2,0)
++
++ vadd -,HX(P2,32),4 CLRA SACC
++ vshl -,HX(P1,32),1 SACC
++ vshl -,HX(P0,32),1 SACC
++ vshl -,HX(Q0,32),1 SACC
++ vshl HX(delta,0),HX(Q1,32),0 SACC
++ vasr HX(delta,0),HX(delta,0), 3
++ vsub HX(delta,0),HX(delta,0),HX(P0,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN
++
++ vadd -,HX(P2,32),2 CLRA SACC
++ vadd -,HX(P1,32),HX(P0,32) SACC
++ vshl HX(delta,0),HX(Q0,32),0 SACC
++ vasr HX(delta,0),HX(delta,0), 2
++ vsub HX(delta,0),HX(delta,0),HX(P1,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN
++
++ vadd -,HX(Q0,32),4 CLRA SACC
++ vadd -,HX(P1,32),HX(P0,32) SACC
++ vmul -,HX(P2,32),3 SACC
++ vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct
++ vasr HX(delta,0),HX(delta,0), 3
++ vsub HX(delta,0),HX(delta,0),HX(P2,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN
++ #vmov HX(P2,0),3 IFN
++
++ # Now reverse all P/Qs
++
++ vadd -,HX(Q2,32),4 CLRA SACC
++ vshl -,HX(Q1,32),1 SACC
++ vshl -,HX(Q0,32),1 SACC
++ vshl -,HX(P0,32),1 SACC
++ vshl HX(delta,0),HX(P1,32),0 SACC
++ vasr HX(delta,0),HX(delta,0), 3
++ vsub HX(delta,0),HX(delta,0),HX(Q0,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN
++
++ vadd -,HX(Q2,32),2 CLRA SACC
++ vadd -,HX(Q1,32),HX(Q0,32) SACC
++ vshl HX(delta,0),HX(P0,32),0 SACC
++ vasr HX(delta,0),HX(delta,0), 2
++ vsub HX(delta,0),HX(delta,0),HX(Q1,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN
++
++ vadd -,HX(P0,32),4 CLRA SACC
++ vadd -,HX(Q1,32),HX(Q0,32) SACC
++ vmul -,HX(Q2,32),3 SACC
++ vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct
++ vasr HX(delta,0),HX(delta,0), 3
++ vsub HX(delta,0),HX(delta,0),HX(Q2,32)
++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0)
++ vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN
++
++ ##############################################################################
++ # Normal filtering
++normal_filtering:
++ # Invert the decision flags
++ # the instruction is made more complicated than needed as the assembler has a bug and otherwise loses the SETF
++ vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering
++ vmov -, HX(tc10,0) SETF # IFN means normal filtering
++
++ vmov -,1 IFN SUMS r5
++ cmp r5,0
++ beq filtering_done
++
++ vasr HX(tc2,0), HX(tc,0), 1
++ vmul HX(tc10,0), HX(tc,0), 10
++
++ vasr HX(thresh,0), HX(beta,0), 1
++ vadd HX(thresh,0), HX(thresh,0), HX(beta,0)
++ vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC
++
++ vadd HX(ptest,0),HX(dp,3),HX(dp,0)
++ vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel
++ vadd HX(qtest,0),HX(dq,3),HX(dq,0)
++ vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel
++ # Expand ptest and qtest together
++ vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0) # p.p.p.p.q.q.q.q
++ vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........
++ valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq
++ valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0)
++ valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0)
++
++ vsub HX(delta0,0), HX(Q0,0), HX(P0,0)
++ vsub HX(delta1,0), HX(Q1,0), HX(P1,0)
++ vmov -,8 CLRA SACC
++ vmul -,HX(delta0,0), 9 SACC
++ vmul HX(delta0,0),HX(delta1,0), r6 SACC
++ vasr HX(delta0,0), HX(delta0,0), 4
++ vdist HX(deltatest,0), HX(delta0,0), 0
++ vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something
++ vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later
++
++ vclamps HX(delta0,0), HX(delta0,0), HX(tc,0)
++
++ vadd HX(deltap1,0), HX(P2,0), HX(P0,0)
++ vadd HX(deltap1,0), HX(deltap1,0), 1
++ vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC
++ vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC
++ vasr HX(deltap1,0), HX(deltap1,0), 1
++ vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0)
++
++ vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0)
++ vadd HX(deltaq1,0), HX(deltaq1,0), 1
++ vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC
++ vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0)
++ vrsub -, HX(delta0,0), 0 SACC
++ vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC
++ vasr HX(deltaq1,0), HX(deltaq1,0), 1
++ vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0)
++
++ vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN
++ vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN
++
++ vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1
++ vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN
++
++ vmov -,HX(deltatest,0) SETF
++ vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1
++ vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN
++
++ #vmov HX(P2,0),1 IFN
++
++filtering_done:
++ b lr
++
++
++hevc_uv_deblock_16x16:
++ push r6-r15, lr
++ mov r14,0
++ b hevc_uv_start
++hevc_uv_deblock_16x16_with_clear:
++ push r6-r15, lr
++ mov r14,1
++ b hevc_uv_start
++
++hevc_uv_start:
++ mov r9,r4
++ mov r4,r3
++ mov r13,r2
++ mov r2,r0
++ mov r10,r0
++ subscale4 r0,r1
++ mov r8,63
++ mov r6,-3
++ vmov H(zeros,0),0
++# r7 is number of blocks still to load
++# r0 is location of current block - 4 * stride
++# r1 is stride
++# r2 is location of current block
++# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical)
++# r4 is setup
++# r5 is for temporary calculations
++# r8 holds 63
++# r6 holds -3
++# r9 holds the number of 16 high rows to process
++# r10 holds the original img base
++# r11 returns 0 if no filtering was done on the edge
++# r12 saves a copy of this
++# r13 is copy of width
++# r14 is 1 if we should clear the old contents, or 0 if not
++
++uv_process_row:
++ # First iteration does not do horizontal filtering on previous
++ mov r7, r13
++ mov r3,0
++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block
++ vldb H(16++,16)+r3,(r2 += r1) REP 16
++ vldb H(setup_input,0), (r4) # We may wish to prefetch these
++ cmp r14,1
++ bne uv_skip0
++ vadd H(setup_input,0),H(setup_input,4),0 # Rotate by 4 to access V strengths
++ vstb H(zeros,0),(r4)
++uv_skip0:
++ bl uv_vert_filter
++ add r3,8
++ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8
++ bl uv_vert_filter
++ sub r3,8
++ b uv_start_deblock_loop
++uv_deblock_loop:
++ # Middle iterations do vertical on current block and horizontal on preceding
++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block
++ vldb H(16++,16)+r3,(r2 += r1) REP 16
++ vldb H(setup_input,0), (r4)
++ cmp r14,1
++ bne uv_skip1
++ vadd H(setup_input,0),H(setup_input,4),0 # Rotate by 4 to access V strengths
++ vstb H(zeros,0),(r4)
++uv_skip1:
++ bl uv_vert_filter
++ add r3,8
++ vadd H(setup_input,0),H(setup_input,8),0
++ bl uv_vert_filter
++ sub r3,8
++ vldb H(setup_input,0), -16(r4)
++ cmp r14,1
++ bne uv_skip3
++ vadd H(setup_input,0),H(setup_input,4),0 # Rotate by 4 to access V strengths
++ vstb H(zeros,0),-16(r4)
++uv_skip3:
++ bl uv_horz_filter
++ mov r12,r11
++ add r3,8*64
++ vadd H(setup_input,0),H(setup_input,8),0
++ bl uv_horz_filter
++ sub r3,8*64
++ addcmpbeq r12,0,0,uv_skip_save_top
++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
++uv_skip_save_top:
++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
++uv_start_deblock_loop:
++ # move onto next 16x16 (could do this with circular buffer support instead)
++ add r3,16
++ and r3,r8
++ add r4,32
++ # Perform loop counter operations (may work with an addcmpbgt as well?)
++ add r0,16
++ add r2,16
++ sub r7,1
++ cmp r7,0 # Are there still more blocks to load
++ bgt uv_deblock_loop
++
++ # Final iteration needs to just do horizontal filtering
++ vldb H(setup_input,0), -16(r4)
++ cmp r14,1
++ bne uv_skip2
++ vadd H(setup_input,0),H(setup_input,4),0 # Rotate by 4 to access V strengths
++ vstb H(zeros,0),-16(r4)
++uv_skip2:
++ bl uv_horz_filter
++ mov r12,r11
++ add r3,8*64
++ vadd H(setup_input,0),H(setup_input,8),0
++ bl uv_horz_filter
++ sub r3,64*8
++ addcmpbeq r12,0,0,uv_skip_save_top2
++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block
++uv_skip_save_top2:
++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16
++
++# Now look to see if we should do another row
++ sub r9,1
++ cmp r9,0
++ bgt uv_start_again
++ pop r6-r15, pc
++uv_start_again:
++ # Need to sort out r0,r2 to point to next row down
++ addscale16 r10,r1
++ mov r2,r10
++ subscale4 r0,r2,r1
++ b uv_process_row
++
++
++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered
++# So that we can reuse the code, we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step when placing them back into the correct locations
++
++uv_vert_filter:
++ push lr
++
++ vmov HX(P1,0), V(16,14)+r3
++ vmov HX(P0,0), V(16,15)+r3
++ vmov HX(Q0,0), V(16,16)+r3
++ vmov HX(Q1,0), V(16,17)+r3
++
++ bl do_chroma_filter
++
++ vadds V(16,15)+r3, HX(P0,0), 0
++ vadds V(16,16)+r3, HX(Q0,0), 0
++
++ pop pc
++
++# Filter edge at H(16,0)+r3
++uv_horz_filter:
++ push lr
++
++ vmov HX(P1,0), H(14,0)+r3
++ vmov HX(P0,0), H(15,0)+r3
++ vmov HX(Q0,0), H(16,0)+r3
++ vmov HX(Q1,0), H(17,0)+r3
++
++ bl do_chroma_filter
++
++ vadds H(15,0)+r3, HX(P0,0), 0
++ # P3 and Q3 never change so don't bother saving back
++ vadds H(16,0)+r3, HX(Q0,0), 0
++
++ pop pc
++
++# r4 points to array of beta/tc for each 4 length edge
++do_chroma_filter:
++ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8
++ valtl HX(tc,0),H(setup,0),H(setup,0)
++
++ vsub HX(delta,0),HX(Q0,0),HX(P0,0)
++ vshl HX(delta,0),HX(delta,0),2 CLRA SACC
++ vsub -,HX(P1,0),HX(Q1,0) SACC
++ vmov HX(delta,0),4 SACC
++ vasr HX(delta,0),HX(delta,0),3
++ vclamps HX(delta,0), HX(delta,0), HX(tc,0)
++ vadd HX(P0,0),HX(P0,0),HX(delta,0)
++ vsub HX(Q0,0),HX(Q0,0),HX(delta,0)
++ b lr
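++# (In C this is the standard HEVC chroma filter, as a sketch:
++#    delta = clip(((Q0 - P0) * 4 + P1 - Q1 + 4) >> 3, -tc, tc);
++#    P0 += delta;  Q0 -= delta;  )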
++
++# r0 = list
++# r1 = number
++hevc_run_command_list:
++ push r6-r7, lr
++ mov r6, r0
++ mov r7, r1
++loop_cmds:
++ ld r0,(r6) # How to encode r6++?
++ add r6,4
++ ld r1,(r6)
++ add r6,4
++ ld r2,(r6)
++ add r6,4
++ ld r3,(r6)
++ add r6,4
++ ld r4,(r6)
++ add r6,4
++ ld r5,(r6)
++ add r6,4
++ bl hevc_trans_16x16
++ sub r7,1
++ cmp r7,0
++ bgt loop_cmds
++
++ pop r6-r7, pc
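++
++# A hedged C-side view of the list consumed above (names invented):
++#   struct vpu_cmd { uint32_t arg[6]; };  /* loaded into r0..r5 per command */
++# Each record is dispatched through hevc_trans_16x16, so arg[5] (r5) selects
++# the operation: 0 transform, 1 memclear16, 2-4 deblock variants, 5 nested list.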
+diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h
+new file mode 100644
+index 0000000000..ee4e357f38
+--- /dev/null
++++ b/libavcodec/rpi_hevc_transform10.h
+@@ -0,0 +1,3110 @@
++static const unsigned char rpi_hevc_transform10 [] = {
++21,
++106,
++0,
++144,
++47,
++1,
++37,
++106,
++0,
++144,
++66,
++1,
++53,
++106,
++0,
++144,
++192,
++4,
++69,
++106,
++0,
++144,
++192,
++4,
++85,
++106,
++0,
++144,
++240,
++5,
++169,
++3,
++62,
++64,
++79,
++64,
++3,
++232,
++32,
++0,
++0,
++0,
++12,
++248,
++0,
++136,
++0,
++0,
++192,
++248,
++0,
++0,
++64,
++232,
++0,
++2,
++0,
++0,
++12,
++248,
++0,
++168,
++0,
++0,
++192,
++248,
++0,
++0,
++0,
++96,
++3,
++232,
++32,
++0,
++0,
++0,
++7,
++232,
++0,
++2,
++0,
++0,
++8,
++232,
++0,
++4,
++0,
++0,
++12,
++248,
++0,
++128,
++0,
++0,
++192,
++8,
++4,
++0,
++4,
++232,
++64,
++0,
++0,
++0,
++5,
++232,
++0,
++2,
++0,
++0,
++128,
++69,
++113,
++66,
++12,
++248,
++0,
++128,
++0,
++0,
++192,
++8,
++4,
++0,
++128,
++69,
++113,
++70,
++128,
++144,
++40,
++0,
++4,
++255,
++48,
++192,
++128,
++3,
++32,
++8,
++16,
++0,
++76,
++254,
++48,
++192,
++9,
++4,
++32,
++8,
++0,
++0,
++4,
++254,
++0,
++144,
++128,
++2,
++0,
++8,
++2,
++0,
++128,
++144,
++23,
++0,
++4,
++255,
++48,
++192,
++128,
++3,
++32,
++8,
++20,
++0,
++76,
++254,
++48,
++192,
++6,
++4,
++32,
++8,
++0,
++0,
++140,
++248,
++44,
++0,
++0,
++0,
++32,
++48,
++4,
++0,
++128,
++69,
++113,
++66,
++242,
++140,
++211,
++192,
++34,
++31,
++41,
++3,
++70,
++192,
++80,
++7,
++164,
++255,
++36,
++204,
++96,
++2,
++0,
++248,
++62,
++0,
++3,
++255,
++55,
++208,
++120,
++3,
++224,
++3,
++190,
++11,
++16,
++139,
++246,
++91,
++0,
++103,
++90,
++0,
++70,
++192,
++80,
++7,
++164,
++255,
++36,
++204,
++224,
++2,
++0,
++248,
++62,
++0,
++3,
++255,
++55,
++208,
++120,
++3,
++224,
++3,
++190,
++11,
++16,
++139,
++246,
++91,
++0,
++103,
++90,
++0,
++225,
++64,
++242,
++64,
++3,
++232,
++128,
++0,
++0,
++0,
++7,
++232,
++0,
++2,
++0,
++0,
++57,
++239,
++224,
++247,
++255,
++255,
++72,
++192,
++95,
++207,
++88,
++122,
++88,
++124,
++137,
++64,
++26,
++64,
++4,
++232,
++64,
++0,
++0,
++0,
++149,
++96,
++161,
++64,
++152,
++64,
++128,
++144,
++35,
++0,
++72,
++232,
++0,
++4,
++0,
++0,
++65,
++232,
++32,
++0,
++0,
++0,
++128,
++144,
++27,
++0,
++4,
++232,
++0,
++2,
++0,
++0,
++101,
++96,
++145,
++64,
++168,
++64,
++128,
++144,
++19,
++0,
++72,
++232,
++0,
++4,
++0,
++0,
++65,
++232,
++32,
++0,
++0,
++0,
++128,
++144,
++11,
++0,
++74,
++232,
++0,
++8,
++0,
++0,
++242,
++140,
++221,
++192,
++57,
++239,
++32,
++8,
++0,
++0,
++41,
++3,
++239,
++3,
++12,
++248,
++0,
++128,
++0,
++0,
++192,
++248,
++4,
++0,
++12,
++248,
++0,
++132,
++64,
++0,
++192,
++248,
++4,
++0,
++0,
++96,
++255,
++159,
++154,
++255,
++0,
++232,
++0,
++4,
++0,
++0,
++255,
++159,
++165,
++255,
++4,
++255,
++48,
++204,
++16,
++3,
++224,
++251,
++62,
++0,
++4,
++255,
++51,
++204,
++128,
++3,
++224,
++251,
++16,
++0,
++76,
++254,
++51,
++204,
++128,
++3,
++224,
++251,
++20,
++0,
++128,
++64,
++6,
++232,
++64,
++0,
++0,
++0,
++140,
++248,
++47,
++0,
++0,
++0,
++224,
++99,
++0,
++0,
++32,
++247,
++240,
++207,
++16,
++3,
++32,
++247,
++176,
++207,
++17,
++19,
++32,
++247,
++112,
++207,
++18,
++35,
++32,
++247,
++48,
++207,
++19,
++51,
++32,
++247,
++240,
++206,
++20,
++67,
++32,
++247,
++176,
++206,
++21,
++83,
++32,
++247,
++112,
++206,
++22,
++99,
++32,
++247,
++48,
++206,
++23,
++115,
++32,
++247,
++240,
++205,
++24,
++131,
++32,
++247,
++176,
++205,
++25,
++147,
++32,
++247,
++112,
++205,
++26,
++163,
++32,
++247,
++48,
++205,
++27,
++179,
++32,
++247,
++240,
++204,
++28,
++195,
++32,
++247,
++176,
++204,
++29,
++211,
++32,
++247,
++112,
++204,
++30,
++227,
++32,
++247,
++48,
++204,
++31,
++243,
++4,
++255,
++51,
++204,
++128,
++3,
++224,
++251,
++16,
++0,
++76,
++254,
++51,
++204,
++128,
++3,
++224,
++251,
++20,
++0,
++0,
++237,
++32,
++0,
++0,
++0,
++140,
++248,
++47,
++0,
++0,
++0,
++224,
++99,
++0,
++0,
++111,
++3,
++4,
++254,
++0,
++128,
++0,
++4,
++0,
++248,
++0,
++0,
++2,
++232,
++32,
++0,
++0,
++0,
++140,
++248,
++32,
++0,
++0,
++0,
++224,
++35,
++0,
++0,
++64,
++232,
++0,
++2,
++0,
++0,
++193,
++232,
++0,
++1,
++0,
++0,
++1,
++106,
++116,
++30,
++90,
++0,
++169,
++3,
++73,
++64,
++52,
++64,
++45,
++64,
++2,
++64,
++10,
++64,
++64,
++198,
++1,
++7,
++8,
++232,
++63,
++0,
++0,
++0,
++6,
++232,
++253,
++255,
++255,
++255,
++0,
++246,
++0,
++0,
++0,
++4,
++215,
++64,
++3,
++96,
++2,
++248,
++0,
++35,
++0,
++0,
++64,
++56,
++0,
++0,
++4,
++248,
++0,
++36,
++0,
++0,
++64,
++56,
++8,
++0,
++0,
++240,
++64,
++0,
++132,
++3,
++128,
++240,
++0,
++0,
++132,
++3,
++128,
++144,
++137,
++0,
++131,
++98,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++129,
++0,
++131,
++102,
++0,
++158,
++67,
++0,
++2,
++248,
++0,
++35,
++0,
++0,
++64,
++56,
++0,
++0,
++4,
++248,
++0,
++36,
++0,
++0,
++64,
++56,
++8,
++0,
++0,
++240,
++64,
++0,
++132,
++3,
++128,
++240,
++0,
++0,
++132,
++3,
++128,
++144,
++108,
++0,
++131,
++98,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++100,
++0,
++131,
++102,
++0,
++248,
++64,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++248,
++0,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++144,
++161,
++0,
++188,
++64,
++67,
++232,
++0,
++2,
++0,
++0,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++150,
++0,
++195,
++232,
++0,
++2,
++0,
++0,
++12,
++128,
++7,
++192,
++130,
++248,
++0,
++0,
++112,
++192,
++224,
++16,
++195,
++31,
++132,
++248,
++1,
++0,
++112,
++0,
++224,
++16,
++203,
++31,
++3,
++99,
++131,
++71,
++68,
++232,
++32,
++0,
++0,
++0,
++0,
++99,
++2,
++99,
++23,
++102,
++7,
++106,
++127,
++156,
++182,
++255,
++0,
++248,
++64,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++248,
++0,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++144,
++112,
++0,
++188,
++64,
++67,
++232,
++0,
++2,
++0,
++0,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++101,
++0,
++195,
++232,
++0,
++2,
++0,
++0,
++12,
++128,
++7,
++192,
++130,
++248,
++0,
++0,
++112,
++192,
++224,
++16,
++195,
++31,
++132,
++248,
++1,
++0,
++112,
++0,
++224,
++16,
++203,
++31,
++25,
++102,
++9,
++106,
++2,
++30,
++41,
++3,
++26,
++87,
++162,
++64,
++64,
++198,
++1,
++23,
++127,
++158,
++103,
++255,
++239,
++3,
++0,
++254,
++0,
++143,
++92,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++64,
++143,
++93,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++143,
++94,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++143,
++95,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++142,
++208,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++142,
++209,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++64,
++142,
++210,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++0,
++142,
++211,
++0,
++0,
++240,
++12,
++0,
++128,
++144,
++107,
++0,
++8,
++255,
++99,
++23,
++0,
++212,
++192,
++51,
++0,
++0,
++8,
++255,
++163,
++23,
++0,
++228,
++192,
++51,
++0,
++0,
++8,
++255,
++227,
++23,
++0,
++244,
++192,
++51,
++0,
++0,
++8,
++255,
++35,
++52,
++0,
++180,
++192,
++51,
++0,
++0,
++8,
++255,
++99,
++52,
++0,
++164,
++192,
++51,
++0,
++0,
++8,
++255,
++163,
++52,
++0,
++148,
++192,
++51,
++0,
++0,
++111,
++3,
++239,
++3,
++0,
++254,
++0,
++143,
++12,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++64,
++143,
++13,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++143,
++14,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++143,
++15,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++142,
++16,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++142,
++17,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++64,
++142,
++18,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++0,
++142,
++19,
++0,
++0,
++240,
++12,
++0,
++128,
++144,
++33,
++0,
++8,
++255,
++99,
++3,
++0,
++212,
++192,
++51,
++0,
++0,
++8,
++255,
++163,
++3,
++0,
++228,
++192,
++51,
++0,
++0,
++8,
++255,
++227,
++3,
++0,
++244,
++192,
++51,
++0,
++0,
++8,
++255,
++35,
++4,
++0,
++180,
++192,
++51,
++0,
++0,
++8,
++255,
++99,
++4,
++0,
++164,
++192,
++51,
++0,
++0,
++8,
++255,
++163,
++4,
++0,
++148,
++192,
++51,
++0,
++0,
++111,
++3,
++32,
++246,
++192,
++11,
++1,
++16,
++32,
++246,
++2,
++137,
++47,
++240,
++40,
++246,
++2,
++140,
++47,
++240,
++128,
++245,
++99,
++140,
++5,
++4,
++0,
++247,
++99,
++140,
++1,
++20,
++88,
++246,
++99,
++140,
++1,
++20,
++0,
++247,
++35,
++136,
++62,
++226,
++32,
++247,
++35,
++136,
++32,
++210,
++0,
++247,
++34,
++136,
++63,
++2,
++208,
++246,
++34,
++136,
++0,
++4,
++0,
++247,
++99,
++136,
++58,
++162,
++32,
++247,
++99,
++136,
++33,
++146,
++0,
++247,
++98,
++136,
++59,
++18,
++208,
++246,
++98,
++136,
++0,
++20,
++0,
++247,
++162,
++136,
++33,
++2,
++88,
++246,
++98,
++137,
++2,
++68,
++88,
++246,
++162,
++137,
++3,
++68,
++208,
++254,
++227,
++136,
++60,
++242,
++192,
++243,
++188,
++11,
++208,
++254,
++227,
++136,
++56,
++178,
++192,
++243,
++188,
++10,
++32,
++255,
++226,
++136,
++38,
++58,
++192,
++243,
++60,
++0,
++208,
++254,
++227,
++136,
++59,
++242,
++192,
++243,
++60,
++128,
++32,
++255,
++226,
++136,
++49,
++58,
++192,
++243,
++60,
++128,
++0,
++255,
++226,
++136,
++34,
++34,
++192,
++243,
++60,
++128,
++32,
++255,
++226,
++136,
++37,
++58,
++192,
++243,
++60,
++128,
++0,
++254,
++192,
++136,
++1,
++4,
++0,
++240,
++0,
++160,
++0,
++255,
++194,
++8,
++0,
++52,
++195,
++243,
++0,
++128,
++0,
++255,
++202,
++40,
++0,
++52,
++195,
++243,
++0,
++128,
++0,
++254,
++0,
++240,
++35,
++10,
++0,
++240,
++60,
++0,
++0,
++254,
++192,
++136,
++1,
++4,
++0,
++240,
++0,
++160,
++0,
++255,
++226,
++140,
++34,
++34,
++195,
++243,
++60,
++0,
++32,
++255,
++227,
++140,
++36,
++58,
++192,
++243,
++60,
++0,
++0,
++254,
++192,
++136,
++0,
++4,
++0,
++240,
++0,
++160,
++16,
++246,
++226,
++136,
++35,
++50,
++16,
++246,
++226,
++136,
++35,
++50,
++32,
++246,
++226,
++136,
++35,
++50,
++32,
++254,
++226,
++136,
++35,
++58,
++192,
++243,
++60,
++0,
++11,
++96,
++0,
++254,
++0,
++240,
++1,
++4,
++0,
++240,
++64,
++115,
++5,
++106,
++0,
++144,
++173,
++1,
++27,
++96,
++0,
++254,
++0,
++240,
++1,
++4,
++0,
++240,
++64,
++147,
++5,
++106,
++0,
++144,
++227,
++0,
++64,
++246,
++163,
++140,
++1,
++4,
++0,
++246,
++192,
++175,
++63,
++2,
++0,
++246,
++192,
++174,
++59,
++2,
++0,
++246,
++128,
++175,
++62,
++2,
++0,
++246,
++128,
++174,
++58,
++2,
++0,
++246,
++64,
++175,
++61,
++2,
++0,
++246,
++64,
++174,
++57,
++2,
++0,
++255,
++43,
++240,
++4,
++212,
++192,
++243,
++128,
++11,
++64,
++254,
++43,
++240,
++1,
++228,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++240,
++1,
++244,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++240,
++1,
++180,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++141,
++0,
++164,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++3,
++68,
++32,
++247,
++35,
++141,
++191,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++235,
++143,
++52,
++242,
++192,
++243,
++60,
++128,
++0,
++255,
++43,
++240,
++2,
++212,
++192,
++243,
++128,
++11,
++0,
++255,
++43,
++240,
++191,
++226,
++192,
++243,
++188,
++10,
++64,
++254,
++43,
++141,
++0,
++180,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++2,
++68,
++32,
++247,
++35,
++141,
++190,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++171,
++143,
++52,
++226,
++192,
++243,
++60,
++128,
++0,
++255,
++43,
++240,
++4,
++180,
++192,
++243,
++128,
++11,
++0,
++255,
++43,
++240,
++191,
++226,
++192,
++243,
++188,
++10,
++128,
++253,
++43,
++240,
++3,
++212,
++192,
++243,
++128,
++10,
++64,
++254,
++35,
++141,
++1,
++196,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++3,
++68,
++32,
++247,
++35,
++141,
++189,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++107,
++143,
++52,
++210,
++192,
++243,
++60,
++128,
++0,
++255,
++43,
++240,
++4,
++148,
++192,
++243,
++128,
++11,
++64,
++254,
++43,
++240,
++1,
++164,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++240,
++1,
++180,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++240,
++1,
++244,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++141,
++0,
++228,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++3,
++68,
++32,
++247,
++35,
++141,
++187,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++235,
++142,
++52,
++178,
++192,
++243,
++60,
++128,
++0,
++255,
++43,
++240,
++2,
++148,
++192,
++243,
++128,
++11,
++0,
++255,
++43,
++240,
++187,
++162,
++192,
++243,
++188,
++10,
++64,
++254,
++43,
++141,
++0,
++244,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++2,
++68,
++32,
++247,
++35,
++141,
++186,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++171,
++142,
++52,
++162,
++192,
++243,
++60,
++128,
++0,
++255,
++43,
++240,
++4,
++244,
++192,
++243,
++128,
++11,
++0,
++255,
++43,
++240,
++187,
++162,
++192,
++243,
++188,
++10,
++128,
++253,
++43,
++240,
++3,
++148,
++192,
++243,
++128,
++10,
++64,
++254,
++35,
++141,
++1,
++132,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++3,
++68,
++32,
++247,
++35,
++141,
++185,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++107,
++142,
++52,
++146,
++192,
++243,
++60,
++128,
++64,
++255,
++98,
++141,
++0,
++52,
++192,
++243,
++0,
++0,
++0,
++254,
++0,
++240,
++53,
++10,
++0,
++240,
++60,
++0,
++0,
++254,
++0,
++240,
++1,
++4,
++0,
++240,
++64,
++147,
++5,
++106,
++0,
++144,
++177,
++0,
++88,
++246,
++163,
++140,
++1,
++4,
++128,
++245,
++99,
++141,
++10,
++4,
++88,
++246,
++162,
++138,
++1,
++68,
++0,
++247,
++162,
++138,
++36,
++162,
++88,
++254,
++162,
++138,
++3,
++164,
++192,
++243,
++128,
++11,
++0,
++255,
++226,
++137,
++32,
++2,
++195,
++243,
++60,
++0,
++32,
++247,
++226,
++137,
++42,
++114,
++0,
++255,
++34,
++138,
++33,
++18,
++195,
++243,
++60,
++0,
++32,
++247,
++34,
++138,
++42,
++130,
++16,
++246,
++98,
++138,
++40,
++114,
++16,
++246,
++98,
++138,
++41,
++146,
++32,
++246,
++98,
++138,
++41,
++146,
++32,
++246,
++226,
++137,
++41,
++146,
++40,
++246,
++34,
++138,
++41,
++146,
++32,
++247,
++163,
++141,
++63,
++178,
++32,
++247,
++227,
++141,
++62,
++162,
++0,
++254,
++0,
++240,
++8,
++4,
++0,
++240,
++128,
++11,
++128,
++253,
++35,
++240,
++9,
++100,
++192,
++243,
++128,
++10,
++128,
++253,
++163,
++141,
++128,
++115,
++192,
++243,
++152,
++10,
++88,
++246,
++163,
++141,
++4,
++100,
++208,
++246,
++35,
++139,
++0,
++100,
++32,
++255,
++34,
++139,
++53,
++202,
++192,
++243,
++60,
++128,
++0,
++254,
++0,
++139,
++0,
++4,
++0,
++240,
++0,
++160,
++240,
++246,
++163,
++141,
++48,
++98,
++0,
++247,
++99,
++139,
++63,
++210,
++0,
++247,
++98,
++139,
++1,
++212,
++88,
++254,
++98,
++139,
++1,
++212,
++192,
++243,
++128,
++11,
++32,
++255,
++99,
++139,
++62,
++98,
++192,
++243,
++188,
++10,
++88,
++246,
++98,
++139,
++1,
++212,
++240,
++246,
++98,
++139,
++50,
++210,
++0,
++247,
++163,
++128,
++59,
++146,
++0,
++247,
++160,
++128,
++1,
++36,
++88,
++254,
++160,
++128,
++1,
++36,
++192,
++243,
++128,
++11,
++0,
++247,
++163,
++128,
++58,
++98,
++64,
++255,
++35,
++240,
++0,
++100,
++192,
++243,
++128,
++10,
++64,
++255,
++163,
++128,
++0,
++164,
++192,
++243,
++128,
++10,
++88,
++246,
++160,
++128,
++1,
++36,
++240,
++246,
++160,
++128,
++50,
++34,
++8,
++255,
++227,
++143,
++54,
++242,
++192,
++243,
++60,
++128,
++40,
++255,
++227,
++142,
++54,
++178,
++192,
++243,
++60,
++128,
++0,
++254,
++0,
++240,
++39,
++10,
++0,
++240,
++60,
++128,
++8,
++255,
++163,
++143,
++45,
++226,
++192,
++243,
++60,
++128,
++0,
++254,
++0,
++240,
++44,
++10,
++0,
++240,
++60,
++0,
++0,
++254,
++0,
++240,
++40,
++10,
++0,
++240,
++60,
++128,
++8,
++255,
++163,
++142,
++2,
++162,
++192,
++243,
++60,
++128,
++90,
++0,
++169,
++3,
++14,
++96,
++4,
++31,
++169,
++3,
++30,
++96,
++1,
++31,
++73,
++64,
++52,
++64,
++45,
++64,
++2,
++64,
++10,
++64,
++64,
++198,
++1,
++7,
++8,
++232,
++63,
++0,
++0,
++0,
++6,
++232,
++253,
++255,
++255,
++255,
++0,
++246,
++0,
++0,
++0,
++4,
++215,
++64,
++3,
++96,
++2,
++248,
++0,
++35,
++0,
++0,
++64,
++56,
++0,
++0,
++4,
++248,
++0,
++36,
++0,
++0,
++64,
++56,
++8,
++0,
++0,
++240,
++64,
++0,
++132,
++3,
++30,
++106,
++137,
++24,
++0,
++255,
++64,
++0,
++0,
++20,
++196,
++243,
++0,
++0,
++128,
++240,
++0,
++0,
++132,
++3,
++128,
++144,
++158,
++0,
++131,
++98,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++150,
++0,
++131,
++102,
++0,
++158,
++81,
++0,
++2,
++248,
++0,
++35,
++0,
++0,
++64,
++56,
++0,
++0,
++4,
++248,
++0,
++36,
++0,
++0,
++64,
++56,
++8,
++0,
++0,
++240,
++64,
++0,
++132,
++3,
++30,
++106,
++137,
++24,
++0,
++255,
++64,
++0,
++0,
++20,
++196,
++243,
++0,
++0,
++128,
++240,
++0,
++0,
++132,
++3,
++128,
++144,
++122,
++0,
++131,
++98,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++114,
++0,
++131,
++102,
++0,
++248,
++64,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++30,
++106,
++139,
++24,
++0,
++255,
++64,
++0,
++0,
++20,
++196,
++243,
++0,
++0,
++128,
++248,
++0,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++144,
++128,
++0,
++188,
++64,
++67,
++232,
++0,
++2,
++0,
++0,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++117,
++0,
++195,
++232,
++0,
++2,
++0,
++0,
++12,
++128,
++7,
++192,
++130,
++248,
++0,
++0,
++112,
++192,
++224,
++16,
++195,
++31,
++132,
++248,
++1,
++0,
++112,
++0,
++224,
++16,
++203,
++31,
++3,
++99,
++131,
++71,
++68,
++232,
++32,
++0,
++0,
++0,
++0,
++99,
++2,
++99,
++23,
++102,
++7,
++106,
++127,
++156,
++168,
++255,
++0,
++248,
++64,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++30,
++106,
++139,
++24,
++0,
++255,
++64,
++0,
++0,
++20,
++196,
++243,
++0,
++0,
++128,
++248,
++0,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++144,
++72,
++0,
++188,
++64,
++67,
++232,
++0,
++2,
++0,
++0,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++61,
++0,
++195,
++232,
++0,
++2,
++0,
++0,
++12,
++128,
++7,
++192,
++130,
++248,
++0,
++0,
++112,
++192,
++224,
++16,
++195,
++31,
++132,
++248,
++1,
++0,
++112,
++0,
++224,
++16,
++203,
++31,
++25,
++102,
++9,
++106,
++2,
++30,
++41,
++3,
++26,
++87,
++162,
++64,
++64,
++198,
++1,
++23,
++127,
++158,
++75,
++255,
++239,
++3,
++0,
++254,
++128,
++143,
++94,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++143,
++95,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++142,
++208,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++142,
++209,
++0,
++0,
++240,
++12,
++0,
++128,
++144,
++47,
++0,
++8,
++255,
++227,
++23,
++0,
++244,
++192,
++51,
++0,
++0,
++8,
++255,
++35,
++52,
++0,
++180,
++192,
++51,
++0,
++0,
++111,
++3,
++239,
++3,
++0,
++254,
++128,
++143,
++14,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++143,
++15,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++142,
++16,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++142,
++17,
++0,
++0,
++240,
++12,
++0,
++128,
++144,
++13,
++0,
++8,
++255,
++227,
++3,
++0,
++244,
++192,
++51,
++0,
++0,
++8,
++255,
++35,
++4,
++0,
++180,
++192,
++51,
++0,
++0,
++111,
++3,
++32,
++246,
++192,
++11,
++1,
++16,
++32,
++246,
++2,
++140,
++47,
++240,
++32,
++247,
++35,
++141,
++63,
++178,
++64,
++254,
++35,
++141,
++2,
++68,
++192,
++243,
++128,
++11,
++32,
++255,
++35,
++240,
++58,
++226,
++192,
++243,
++188,
++10,
++0,
++254,
++0,
++141,
++4,
++4,
++0,
++240,
++128,
++10,
++88,
++246,
++35,
++141,
++3,
++68,
++240,
++246,
++35,
++141,
++48,
++66,
++0,
++247,
++227,
++143,
++52,
++242,
++32,
++247,
++227,
++142,
++52,
++178,
++90,
++0,
++161,
++3,
++6,
++64,
++23,
++64,
++96,
++8,
++70,
++98,
++97,
++8,
++70,
++98,
++98,
++8,
++70,
++98,
++99,
++8,
++70,
++98,
++100,
++8,
++70,
++98,
++101,
++8,
++70,
++98,
++255,
++159,
++244,
++249,
++23,
++102,
++7,
++106,
++112,
++30,
++33,
++3,
++};
+diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h
+new file mode 100644
+index 0000000000..56d5206827
+--- /dev/null
++++ b/libavcodec/rpi_hevc_transform8.h
+@@ -0,0 +1,3110 @@
++static const unsigned char rpi_hevc_transform8 [] = {
++21,
++106,
++0,
++144,
++47,
++1,
++37,
++106,
++0,
++144,
++66,
++1,
++53,
++106,
++0,
++144,
++192,
++4,
++69,
++106,
++0,
++144,
++192,
++4,
++85,
++106,
++0,
++144,
++240,
++5,
++169,
++3,
++62,
++64,
++79,
++64,
++3,
++232,
++32,
++0,
++0,
++0,
++12,
++248,
++0,
++136,
++0,
++0,
++192,
++248,
++0,
++0,
++64,
++232,
++0,
++2,
++0,
++0,
++12,
++248,
++0,
++168,
++0,
++0,
++192,
++248,
++0,
++0,
++0,
++96,
++3,
++232,
++32,
++0,
++0,
++0,
++7,
++232,
++0,
++2,
++0,
++0,
++8,
++232,
++0,
++4,
++0,
++0,
++12,
++248,
++0,
++128,
++0,
++0,
++192,
++8,
++4,
++0,
++4,
++232,
++64,
++0,
++0,
++0,
++5,
++232,
++0,
++8,
++0,
++0,
++128,
++69,
++113,
++66,
++12,
++248,
++0,
++128,
++0,
++0,
++192,
++8,
++4,
++0,
++128,
++69,
++113,
++70,
++128,
++144,
++40,
++0,
++4,
++255,
++48,
++192,
++128,
++3,
++32,
++8,
++16,
++0,
++76,
++254,
++48,
++192,
++9,
++4,
++32,
++8,
++0,
++0,
++4,
++254,
++0,
++144,
++128,
++2,
++0,
++8,
++2,
++0,
++128,
++144,
++23,
++0,
++4,
++255,
++48,
++192,
++128,
++3,
++32,
++8,
++20,
++0,
++76,
++254,
++48,
++192,
++4,
++4,
++32,
++8,
++0,
++0,
++140,
++248,
++44,
++0,
++0,
++0,
++32,
++48,
++4,
++0,
++128,
++69,
++113,
++66,
++242,
++140,
++211,
++192,
++34,
++31,
++41,
++3,
++70,
++192,
++80,
++7,
++164,
++255,
++36,
++204,
++96,
++2,
++0,
++248,
++62,
++0,
++3,
++255,
++55,
++208,
++120,
++3,
++224,
++3,
++190,
++11,
++16,
++139,
++246,
++91,
++0,
++103,
++90,
++0,
++70,
++192,
++80,
++7,
++164,
++255,
++36,
++204,
++224,
++2,
++0,
++248,
++62,
++0,
++3,
++255,
++55,
++208,
++120,
++3,
++224,
++3,
++190,
++11,
++16,
++139,
++246,
++91,
++0,
++103,
++90,
++0,
++225,
++64,
++242,
++64,
++3,
++232,
++128,
++0,
++0,
++0,
++7,
++232,
++0,
++2,
++0,
++0,
++57,
++239,
++224,
++247,
++255,
++255,
++72,
++192,
++95,
++207,
++88,
++122,
++88,
++124,
++137,
++64,
++26,
++64,
++4,
++232,
++64,
++0,
++0,
++0,
++149,
++96,
++161,
++64,
++152,
++64,
++128,
++144,
++35,
++0,
++72,
++232,
++0,
++4,
++0,
++0,
++65,
++232,
++32,
++0,
++0,
++0,
++128,
++144,
++27,
++0,
++4,
++232,
++0,
++8,
++0,
++0,
++69,
++96,
++145,
++64,
++168,
++64,
++128,
++144,
++19,
++0,
++72,
++232,
++0,
++4,
++0,
++0,
++65,
++232,
++32,
++0,
++0,
++0,
++128,
++144,
++11,
++0,
++74,
++232,
++0,
++8,
++0,
++0,
++242,
++140,
++221,
++192,
++57,
++239,
++32,
++8,
++0,
++0,
++41,
++3,
++239,
++3,
++12,
++248,
++0,
++128,
++0,
++0,
++192,
++248,
++4,
++0,
++12,
++248,
++0,
++132,
++64,
++0,
++192,
++248,
++4,
++0,
++0,
++96,
++255,
++159,
++154,
++255,
++0,
++232,
++0,
++4,
++0,
++0,
++255,
++159,
++165,
++255,
++4,
++255,
++48,
++204,
++16,
++3,
++224,
++251,
++62,
++0,
++4,
++255,
++51,
++204,
++128,
++3,
++224,
++251,
++16,
++0,
++76,
++254,
++51,
++204,
++128,
++3,
++224,
++251,
++20,
++0,
++128,
++64,
++6,
++232,
++64,
++0,
++0,
++0,
++140,
++248,
++47,
++0,
++0,
++0,
++224,
++99,
++0,
++0,
++32,
++247,
++240,
++207,
++16,
++3,
++32,
++247,
++176,
++207,
++17,
++19,
++32,
++247,
++112,
++207,
++18,
++35,
++32,
++247,
++48,
++207,
++19,
++51,
++32,
++247,
++240,
++206,
++20,
++67,
++32,
++247,
++176,
++206,
++21,
++83,
++32,
++247,
++112,
++206,
++22,
++99,
++32,
++247,
++48,
++206,
++23,
++115,
++32,
++247,
++240,
++205,
++24,
++131,
++32,
++247,
++176,
++205,
++25,
++147,
++32,
++247,
++112,
++205,
++26,
++163,
++32,
++247,
++48,
++205,
++27,
++179,
++32,
++247,
++240,
++204,
++28,
++195,
++32,
++247,
++176,
++204,
++29,
++211,
++32,
++247,
++112,
++204,
++30,
++227,
++32,
++247,
++48,
++204,
++31,
++243,
++4,
++255,
++51,
++204,
++128,
++3,
++224,
++251,
++16,
++0,
++76,
++254,
++51,
++204,
++128,
++3,
++224,
++251,
++20,
++0,
++0,
++237,
++32,
++0,
++0,
++0,
++140,
++248,
++47,
++0,
++0,
++0,
++224,
++99,
++0,
++0,
++111,
++3,
++4,
++254,
++0,
++128,
++0,
++4,
++0,
++248,
++0,
++0,
++2,
++232,
++32,
++0,
++0,
++0,
++140,
++248,
++32,
++0,
++0,
++0,
++224,
++35,
++0,
++0,
++64,
++232,
++0,
++2,
++0,
++0,
++193,
++232,
++0,
++1,
++0,
++0,
++1,
++106,
++116,
++30,
++90,
++0,
++169,
++3,
++73,
++64,
++52,
++64,
++45,
++64,
++2,
++64,
++10,
++64,
++64,
++198,
++1,
++7,
++8,
++232,
++63,
++0,
++0,
++0,
++6,
++232,
++253,
++255,
++255,
++255,
++0,
++246,
++0,
++0,
++0,
++4,
++215,
++64,
++3,
++96,
++2,
++248,
++0,
++35,
++0,
++0,
++64,
++56,
++0,
++0,
++4,
++248,
++0,
++36,
++0,
++0,
++64,
++56,
++8,
++0,
++0,
++240,
++64,
++0,
++132,
++3,
++128,
++240,
++0,
++0,
++132,
++3,
++128,
++144,
++137,
++0,
++131,
++98,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++129,
++0,
++131,
++102,
++0,
++158,
++67,
++0,
++2,
++248,
++0,
++35,
++0,
++0,
++64,
++56,
++0,
++0,
++4,
++248,
++0,
++36,
++0,
++0,
++64,
++56,
++8,
++0,
++0,
++240,
++64,
++0,
++132,
++3,
++128,
++240,
++0,
++0,
++132,
++3,
++128,
++144,
++108,
++0,
++131,
++98,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++100,
++0,
++131,
++102,
++0,
++248,
++64,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++248,
++0,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++144,
++161,
++0,
++188,
++64,
++67,
++232,
++0,
++2,
++0,
++0,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++150,
++0,
++195,
++232,
++0,
++2,
++0,
++0,
++12,
++128,
++7,
++192,
++130,
++248,
++0,
++0,
++112,
++192,
++224,
++16,
++195,
++31,
++132,
++248,
++1,
++0,
++112,
++0,
++224,
++16,
++203,
++31,
++3,
++99,
++131,
++71,
++68,
++232,
++32,
++0,
++0,
++0,
++0,
++99,
++2,
++99,
++23,
++102,
++7,
++106,
++127,
++156,
++182,
++255,
++0,
++248,
++64,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++248,
++0,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++144,
++112,
++0,
++188,
++64,
++67,
++232,
++0,
++2,
++0,
++0,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++101,
++0,
++195,
++232,
++0,
++2,
++0,
++0,
++12,
++128,
++7,
++192,
++130,
++248,
++0,
++0,
++112,
++192,
++224,
++16,
++195,
++31,
++132,
++248,
++1,
++0,
++112,
++0,
++224,
++16,
++203,
++31,
++25,
++102,
++9,
++106,
++2,
++30,
++41,
++3,
++26,
++87,
++162,
++64,
++64,
++198,
++1,
++23,
++127,
++158,
++103,
++255,
++239,
++3,
++0,
++254,
++0,
++143,
++92,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++64,
++143,
++93,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++143,
++94,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++143,
++95,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++142,
++208,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++142,
++209,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++64,
++142,
++210,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++0,
++142,
++211,
++0,
++0,
++240,
++12,
++0,
++128,
++144,
++107,
++0,
++8,
++255,
++99,
++23,
++0,
++212,
++192,
++51,
++0,
++0,
++8,
++255,
++163,
++23,
++0,
++228,
++192,
++51,
++0,
++0,
++8,
++255,
++227,
++23,
++0,
++244,
++192,
++51,
++0,
++0,
++8,
++255,
++35,
++52,
++0,
++180,
++192,
++51,
++0,
++0,
++8,
++255,
++99,
++52,
++0,
++164,
++192,
++51,
++0,
++0,
++8,
++255,
++163,
++52,
++0,
++148,
++192,
++51,
++0,
++0,
++111,
++3,
++239,
++3,
++0,
++254,
++0,
++143,
++12,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++64,
++143,
++13,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++143,
++14,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++143,
++15,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++142,
++16,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++142,
++17,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++64,
++142,
++18,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++0,
++142,
++19,
++0,
++0,
++240,
++12,
++0,
++128,
++144,
++33,
++0,
++8,
++255,
++99,
++3,
++0,
++212,
++192,
++51,
++0,
++0,
++8,
++255,
++163,
++3,
++0,
++228,
++192,
++51,
++0,
++0,
++8,
++255,
++227,
++3,
++0,
++244,
++192,
++51,
++0,
++0,
++8,
++255,
++35,
++4,
++0,
++180,
++192,
++51,
++0,
++0,
++8,
++255,
++99,
++4,
++0,
++164,
++192,
++51,
++0,
++0,
++8,
++255,
++163,
++4,
++0,
++148,
++192,
++51,
++0,
++0,
++111,
++3,
++32,
++246,
++192,
++11,
++1,
++16,
++32,
++246,
++2,
++137,
++47,
++240,
++40,
++246,
++2,
++140,
++47,
++240,
++128,
++245,
++99,
++140,
++5,
++4,
++0,
++247,
++99,
++140,
++1,
++20,
++88,
++246,
++99,
++140,
++1,
++20,
++0,
++247,
++35,
++136,
++62,
++226,
++32,
++247,
++35,
++136,
++32,
++210,
++0,
++247,
++34,
++136,
++63,
++2,
++208,
++246,
++34,
++136,
++0,
++4,
++0,
++247,
++99,
++136,
++58,
++162,
++32,
++247,
++99,
++136,
++33,
++146,
++0,
++247,
++98,
++136,
++59,
++18,
++208,
++246,
++98,
++136,
++0,
++20,
++0,
++247,
++162,
++136,
++33,
++2,
++88,
++246,
++98,
++137,
++2,
++68,
++88,
++246,
++162,
++137,
++3,
++68,
++208,
++254,
++227,
++136,
++60,
++242,
++192,
++243,
++188,
++11,
++208,
++254,
++227,
++136,
++56,
++178,
++192,
++243,
++188,
++10,
++32,
++255,
++226,
++136,
++38,
++58,
++192,
++243,
++60,
++0,
++208,
++254,
++227,
++136,
++59,
++242,
++192,
++243,
++60,
++128,
++32,
++255,
++226,
++136,
++49,
++58,
++192,
++243,
++60,
++128,
++0,
++255,
++226,
++136,
++34,
++34,
++192,
++243,
++60,
++128,
++32,
++255,
++226,
++136,
++37,
++58,
++192,
++243,
++60,
++128,
++0,
++254,
++192,
++136,
++1,
++4,
++0,
++240,
++0,
++160,
++0,
++255,
++194,
++8,
++0,
++52,
++195,
++243,
++0,
++128,
++0,
++255,
++202,
++40,
++0,
++52,
++195,
++243,
++0,
++128,
++0,
++254,
++0,
++240,
++35,
++10,
++0,
++240,
++60,
++0,
++0,
++254,
++192,
++136,
++1,
++4,
++0,
++240,
++0,
++160,
++0,
++255,
++226,
++140,
++34,
++34,
++195,
++243,
++60,
++0,
++32,
++255,
++227,
++140,
++36,
++58,
++192,
++243,
++60,
++0,
++0,
++254,
++192,
++136,
++0,
++4,
++0,
++240,
++0,
++160,
++16,
++246,
++226,
++136,
++35,
++50,
++16,
++246,
++226,
++136,
++35,
++50,
++32,
++246,
++226,
++136,
++35,
++50,
++32,
++254,
++226,
++136,
++35,
++58,
++192,
++243,
++60,
++0,
++11,
++96,
++0,
++254,
++0,
++240,
++1,
++4,
++0,
++240,
++64,
++115,
++5,
++106,
++0,
++144,
++173,
++1,
++27,
++96,
++0,
++254,
++0,
++240,
++1,
++4,
++0,
++240,
++64,
++147,
++5,
++106,
++0,
++144,
++227,
++0,
++64,
++246,
++163,
++140,
++1,
++4,
++0,
++246,
++192,
++175,
++63,
++2,
++0,
++246,
++192,
++174,
++59,
++2,
++0,
++246,
++128,
++175,
++62,
++2,
++0,
++246,
++128,
++174,
++58,
++2,
++0,
++246,
++64,
++175,
++61,
++2,
++0,
++246,
++64,
++174,
++57,
++2,
++0,
++255,
++43,
++240,
++4,
++212,
++192,
++243,
++128,
++11,
++64,
++254,
++43,
++240,
++1,
++228,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++240,
++1,
++244,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++240,
++1,
++180,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++141,
++0,
++164,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++3,
++68,
++32,
++247,
++35,
++141,
++191,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++235,
++143,
++52,
++242,
++192,
++243,
++60,
++128,
++0,
++255,
++43,
++240,
++2,
++212,
++192,
++243,
++128,
++11,
++0,
++255,
++43,
++240,
++191,
++226,
++192,
++243,
++188,
++10,
++64,
++254,
++43,
++141,
++0,
++180,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++2,
++68,
++32,
++247,
++35,
++141,
++190,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++171,
++143,
++52,
++226,
++192,
++243,
++60,
++128,
++0,
++255,
++43,
++240,
++4,
++180,
++192,
++243,
++128,
++11,
++0,
++255,
++43,
++240,
++191,
++226,
++192,
++243,
++188,
++10,
++128,
++253,
++43,
++240,
++3,
++212,
++192,
++243,
++128,
++10,
++64,
++254,
++35,
++141,
++1,
++196,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++3,
++68,
++32,
++247,
++35,
++141,
++189,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++107,
++143,
++52,
++210,
++192,
++243,
++60,
++128,
++0,
++255,
++43,
++240,
++4,
++148,
++192,
++243,
++128,
++11,
++64,
++254,
++43,
++240,
++1,
++164,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++240,
++1,
++180,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++240,
++1,
++244,
++192,
++243,
++128,
++10,
++64,
++254,
++43,
++141,
++0,
++228,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++3,
++68,
++32,
++247,
++35,
++141,
++187,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++235,
++142,
++52,
++178,
++192,
++243,
++60,
++128,
++0,
++255,
++43,
++240,
++2,
++148,
++192,
++243,
++128,
++11,
++0,
++255,
++43,
++240,
++187,
++162,
++192,
++243,
++188,
++10,
++64,
++254,
++43,
++141,
++0,
++244,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++2,
++68,
++32,
++247,
++35,
++141,
++186,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++171,
++142,
++52,
++162,
++192,
++243,
++60,
++128,
++0,
++255,
++43,
++240,
++4,
++244,
++192,
++243,
++128,
++11,
++0,
++255,
++43,
++240,
++187,
++162,
++192,
++243,
++188,
++10,
++128,
++253,
++43,
++240,
++3,
++148,
++192,
++243,
++128,
++10,
++64,
++254,
++35,
++141,
++1,
++132,
++192,
++243,
++128,
++10,
++88,
++246,
++35,
++141,
++3,
++68,
++32,
++247,
++35,
++141,
++185,
++66,
++240,
++246,
++35,
++141,
++50,
++66,
++0,
++255,
++107,
++142,
++52,
++146,
++192,
++243,
++60,
++128,
++64,
++255,
++98,
++141,
++0,
++52,
++192,
++243,
++0,
++0,
++0,
++254,
++0,
++240,
++53,
++10,
++0,
++240,
++60,
++0,
++0,
++254,
++0,
++240,
++1,
++4,
++0,
++240,
++64,
++147,
++5,
++106,
++0,
++144,
++177,
++0,
++88,
++246,
++163,
++140,
++1,
++4,
++128,
++245,
++99,
++141,
++10,
++4,
++88,
++246,
++162,
++138,
++1,
++68,
++0,
++247,
++162,
++138,
++36,
++162,
++88,
++254,
++162,
++138,
++3,
++164,
++192,
++243,
++128,
++11,
++0,
++255,
++226,
++137,
++32,
++2,
++195,
++243,
++60,
++0,
++32,
++247,
++226,
++137,
++42,
++114,
++0,
++255,
++34,
++138,
++33,
++18,
++195,
++243,
++60,
++0,
++32,
++247,
++34,
++138,
++42,
++130,
++16,
++246,
++98,
++138,
++40,
++114,
++16,
++246,
++98,
++138,
++41,
++146,
++32,
++246,
++98,
++138,
++41,
++146,
++32,
++246,
++226,
++137,
++41,
++146,
++40,
++246,
++34,
++138,
++41,
++146,
++32,
++247,
++163,
++141,
++63,
++178,
++32,
++247,
++227,
++141,
++62,
++162,
++0,
++254,
++0,
++240,
++8,
++4,
++0,
++240,
++128,
++11,
++128,
++253,
++35,
++240,
++9,
++100,
++192,
++243,
++128,
++10,
++128,
++253,
++163,
++141,
++128,
++115,
++192,
++243,
++152,
++10,
++88,
++246,
++163,
++141,
++4,
++100,
++208,
++246,
++35,
++139,
++0,
++100,
++32,
++255,
++34,
++139,
++53,
++202,
++192,
++243,
++60,
++128,
++0,
++254,
++0,
++139,
++0,
++4,
++0,
++240,
++0,
++160,
++240,
++246,
++163,
++141,
++48,
++98,
++0,
++247,
++99,
++139,
++63,
++210,
++0,
++247,
++98,
++139,
++1,
++212,
++88,
++254,
++98,
++139,
++1,
++212,
++192,
++243,
++128,
++11,
++32,
++255,
++99,
++139,
++62,
++98,
++192,
++243,
++188,
++10,
++88,
++246,
++98,
++139,
++1,
++212,
++240,
++246,
++98,
++139,
++50,
++210,
++0,
++247,
++163,
++128,
++59,
++146,
++0,
++247,
++160,
++128,
++1,
++36,
++88,
++254,
++160,
++128,
++1,
++36,
++192,
++243,
++128,
++11,
++0,
++247,
++163,
++128,
++58,
++98,
++64,
++255,
++35,
++240,
++0,
++100,
++192,
++243,
++128,
++10,
++64,
++255,
++163,
++128,
++0,
++164,
++192,
++243,
++128,
++10,
++88,
++246,
++160,
++128,
++1,
++36,
++240,
++246,
++160,
++128,
++50,
++34,
++8,
++255,
++227,
++143,
++54,
++242,
++192,
++243,
++60,
++128,
++40,
++255,
++227,
++142,
++54,
++178,
++192,
++243,
++60,
++128,
++0,
++254,
++0,
++240,
++39,
++10,
++0,
++240,
++60,
++128,
++8,
++255,
++163,
++143,
++45,
++226,
++192,
++243,
++60,
++128,
++0,
++254,
++0,
++240,
++44,
++10,
++0,
++240,
++60,
++0,
++0,
++254,
++0,
++240,
++40,
++10,
++0,
++240,
++60,
++128,
++8,
++255,
++163,
++142,
++2,
++162,
++192,
++243,
++60,
++128,
++90,
++0,
++169,
++3,
++14,
++96,
++4,
++31,
++169,
++3,
++30,
++96,
++1,
++31,
++73,
++64,
++52,
++64,
++45,
++64,
++2,
++64,
++10,
++64,
++64,
++198,
++1,
++7,
++8,
++232,
++63,
++0,
++0,
++0,
++6,
++232,
++253,
++255,
++255,
++255,
++0,
++246,
++0,
++0,
++0,
++4,
++215,
++64,
++3,
++96,
++2,
++248,
++0,
++35,
++0,
++0,
++64,
++56,
++0,
++0,
++4,
++248,
++0,
++36,
++0,
++0,
++64,
++56,
++8,
++0,
++0,
++240,
++64,
++0,
++132,
++3,
++30,
++106,
++137,
++24,
++0,
++255,
++64,
++0,
++0,
++20,
++196,
++243,
++0,
++0,
++128,
++240,
++0,
++0,
++132,
++3,
++128,
++144,
++158,
++0,
++131,
++98,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++150,
++0,
++131,
++102,
++0,
++158,
++81,
++0,
++2,
++248,
++0,
++35,
++0,
++0,
++64,
++56,
++0,
++0,
++4,
++248,
++0,
++36,
++0,
++0,
++64,
++56,
++8,
++0,
++0,
++240,
++64,
++0,
++132,
++3,
++30,
++106,
++137,
++24,
++0,
++255,
++64,
++0,
++0,
++20,
++196,
++243,
++0,
++0,
++128,
++240,
++0,
++0,
++132,
++3,
++128,
++144,
++122,
++0,
++131,
++98,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++114,
++0,
++131,
++102,
++0,
++248,
++64,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++30,
++106,
++139,
++24,
++0,
++255,
++64,
++0,
++0,
++20,
++196,
++243,
++0,
++0,
++128,
++248,
++0,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++144,
++128,
++0,
++188,
++64,
++67,
++232,
++0,
++2,
++0,
++0,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++117,
++0,
++195,
++232,
++0,
++2,
++0,
++0,
++12,
++128,
++7,
++192,
++130,
++248,
++0,
++0,
++112,
++192,
++224,
++16,
++195,
++31,
++132,
++248,
++1,
++0,
++112,
++0,
++224,
++16,
++203,
++31,
++3,
++99,
++131,
++71,
++68,
++232,
++32,
++0,
++0,
++0,
++0,
++99,
++2,
++99,
++23,
++102,
++7,
++106,
++127,
++156,
++168,
++255,
++0,
++248,
++64,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++30,
++106,
++139,
++24,
++0,
++255,
++64,
++0,
++0,
++20,
++196,
++243,
++0,
++0,
++128,
++248,
++0,
++0,
++112,
++0,
++192,
++243,
++211,
++31,
++128,
++144,
++72,
++0,
++188,
++64,
++67,
++232,
++0,
++2,
++0,
++0,
++0,
++255,
++64,
++0,
++0,
++20,
++200,
++243,
++0,
++0,
++128,
++144,
++61,
++0,
++195,
++232,
++0,
++2,
++0,
++0,
++12,
++128,
++7,
++192,
++130,
++248,
++0,
++0,
++112,
++192,
++224,
++16,
++195,
++31,
++132,
++248,
++1,
++0,
++112,
++0,
++224,
++16,
++203,
++31,
++25,
++102,
++9,
++106,
++2,
++30,
++41,
++3,
++26,
++87,
++162,
++64,
++64,
++198,
++1,
++23,
++127,
++158,
++75,
++255,
++239,
++3,
++0,
++254,
++128,
++143,
++94,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++143,
++95,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++142,
++208,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++142,
++209,
++0,
++0,
++240,
++12,
++0,
++128,
++144,
++47,
++0,
++8,
++255,
++227,
++23,
++0,
++244,
++192,
++51,
++0,
++0,
++8,
++255,
++35,
++52,
++0,
++180,
++192,
++51,
++0,
++0,
++111,
++3,
++239,
++3,
++0,
++254,
++128,
++143,
++14,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++143,
++15,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++192,
++142,
++16,
++0,
++0,
++240,
++12,
++0,
++0,
++254,
++128,
++142,
++17,
++0,
++0,
++240,
++12,
++0,
++128,
++144,
++13,
++0,
++8,
++255,
++227,
++3,
++0,
++244,
++192,
++51,
++0,
++0,
++8,
++255,
++35,
++4,
++0,
++180,
++192,
++51,
++0,
++0,
++111,
++3,
++32,
++246,
++192,
++11,
++1,
++16,
++32,
++246,
++2,
++140,
++47,
++240,
++32,
++247,
++35,
++141,
++63,
++178,
++64,
++254,
++35,
++141,
++2,
++68,
++192,
++243,
++128,
++11,
++32,
++255,
++35,
++240,
++58,
++226,
++192,
++243,
++188,
++10,
++0,
++254,
++0,
++141,
++4,
++4,
++0,
++240,
++128,
++10,
++88,
++246,
++35,
++141,
++3,
++68,
++240,
++246,
++35,
++141,
++48,
++66,
++0,
++247,
++227,
++143,
++52,
++242,
++32,
++247,
++227,
++142,
++52,
++178,
++90,
++0,
++161,
++3,
++6,
++64,
++23,
++64,
++96,
++8,
++70,
++98,
++97,
++8,
++70,
++98,
++98,
++8,
++70,
++98,
++99,
++8,
++70,
++98,
++100,
++8,
++70,
++98,
++101,
++8,
++70,
++98,
++255,
++159,
++244,
++249,
++23,
++102,
++7,
++106,
++112,
++30,
++33,
++3,
++};
+diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c
+new file mode 100644
+index 0000000000..00bd911a86
+--- /dev/null
++++ b/libavcodec/rpi_hevcdec.c
+@@ -0,0 +1,5630 @@
++/*
++ * HEVC video Decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2012 - 2013 Mickael Raulet
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2012 - 2013 Wassim Hamidouche
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/attributes.h"
++#include "libavutil/common.h"
++#include "libavutil/display.h"
++#include "libavutil/internal.h"
++#include "libavutil/mastering_display_metadata.h"
++#include "libavutil/md5.h"
++#include "libavutil/opt.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/stereo3d.h"
++
++#include "bswapdsp.h"
++#include "bytestream.h"
++#include "cabac_functions.h"
++#include "golomb.h"
++#include "hevc.h"
++#include "rpi_hevc_data.h"
++#include "rpi_hevc_parse.h"
++#include "rpi_hevcdec.h"
++#include "profiles.h"
++
++#include "rpi_qpu.h"
++#include "rpi_hevc_shader.h"
++#include "rpi_hevc_shader_cmd.h"
++#include "rpi_hevc_shader_template.h"
++#include "rpi_zc.h"
++#include "libavutil/rpi_sand_fns.h"
++
++#include "pthread.h"
++#include "libavutil/atomic.h"
++
++#define DEBUG_DECODE_N 0 // 0 = do all, n = decode only n frames from the IDR onwards
++
++#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff))
++
++#ifndef av_mod_uintp2
++static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)
++{
++ return a & ((1 << p) - 1);
++}
++# define av_mod_uintp2 av_mod_uintp2_c
++#endif
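++// (av_mod_uintp2(a, p) == a mod 2^p, e.g. av_mod_uintp2(0x1234, 8) == 0x34)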
++
++const uint8_t ff_hevc_rpi_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
++static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first);
++
++#define MC_DUMMY_X (-32)
++#define MC_DUMMY_Y (-32)
++
++// UV & Y both have min 4x4 pred (no 2x2 chroma)
++// Allow for even spread +1 for setup, +1 for rounding
++// As we have load sharing this can (in theory) be exceeded so we have to
++// check after each CTU, but it is a good base size
++
++// Worst case (all 4x4) commands per CTU
++#define QPU_Y_CMD_PER_CTU_MAX (16 * 16)
++#define QPU_C_CMD_PER_CTU_MAX (8 * 8)
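++// (i.e. a 64x64 CTU split entirely into 4x4 blocks: (64/4)^2 = 256 for luma,
++// and (32/4)^2 = 64 for the half-sized 4:2:0 chroma plane)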
++
++#define QPU_C_COMMANDS (((HEVC_RPI_MAX_WIDTH * 64) / (4 * 4)) / 4 + 2 * QPU_N_MAX)
++#define QPU_Y_COMMANDS (((HEVC_RPI_MAX_WIDTH * 64) / (4 * 4)) + 2 * QPU_N_MAX)
++
++// The QPU code for UV blocks only works up to a block width of 8
++#define RPI_CHROMA_BLOCK_WIDTH 8
++
++#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
++
++
++// Actual filter goes -ve, +ve, +ve, -ve using these values
++static const uint32_t rpi_filter_coefs[8] = {
++ ENCODE_COEFFS( 0, 64, 0, 0),
++ ENCODE_COEFFS( 2, 58, 10, 2),
++ ENCODE_COEFFS( 4, 54, 16, 2),
++ ENCODE_COEFFS( 6, 46, 28, 4),
++ ENCODE_COEFFS( 4, 36, 36, 4),
++ ENCODE_COEFFS( 4, 28, 46, 6),
++ ENCODE_COEFFS( 2, 16, 54, 4),
++ ENCODE_COEFFS( 2, 10, 58, 2)
++};
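++
++// Editor's sketch (not used by the decoder): one plausible way the packed
++// words above could be unpacked and applied. The -ve,+ve,+ve,-ve sign pattern
++// makes every tap set sum to 64, so a >>6 renormalises; the +32 rounding term
++// and the helper itself are illustrative assumptions only.
++#if 0
++static inline int rpi_filter_apply(const uint32_t c, const uint8_t p[4])
++{
++    const int c0 = (c >>  0) & 0xff, c1 = (c >>  8) & 0xff;
++    const int c2 = (c >> 16) & 0xff, c3 = (c >> 24) & 0xff;
++    // e.g. ENCODE_COEFFS(2, 58, 10, 2): -2 + 58 + 10 - 2 == 64
++    return (-c0 * p[0] + c1 * p[1] + c2 * p[2] - c3 * p[3] + 32) >> 6;
++}
++#endif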
++
++// Function arrays by QPU
++
++static const int * const inter_pred_setup_c_qpu[12] = {
++ mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn,
++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn,
++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn
++};
++
++static const int * const inter_pred_setup_c10_qpu[12] = {
++ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn,
++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn,
++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn
++};
++
++static const int * const inter_pred_setup_y_qpu[12] = {
++ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn,
++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn,
++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn
++};
++
++static const int * const inter_pred_setup_y10_qpu[12] = {
++ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn,
++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn,
++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn
++};
++
++static const int * const inter_pred_sync_qpu[12] = {
++ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3,
++ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7,
++ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11
++};
++
++static const int * const inter_pred_sync10_qpu[12] = {
++ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3,
++ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7,
++ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11
++};
++
++static const int * const inter_pred_exit_c_qpu[12] = {
++ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn,
++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn,
++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn
++};
++
++static const int * const inter_pred_exit_c10_qpu[12] = {
++ mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn,
++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn,
++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn
++};
++
++static const int * const inter_pred_exit_y_qpu[12] = {
++ mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn,
++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn,
++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn
++};
++
++static const int * const inter_pred_exit_y10_qpu[12] = {
++ mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn,
++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn,
++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn
++};
++
++typedef struct ipe_chan_info_s
++{
++ const uint8_t bit_depth;
++ const uint8_t n;
++ const int * const * setup_fns;
++ const int * const * sync_fns;
++ const int * const * exit_fns;
++} ipe_chan_info_t;
++
++typedef struct ipe_init_info_s
++{
++ ipe_chan_info_t luma;
++ ipe_chan_info_t chroma;
++} ipe_init_info_t;
++
++static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16
++ { // 8
++ .luma = {8, QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu},
++ .chroma = {8, QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu}
++ },
++ { // 9
++ .luma = {0},
++ .chroma = {0}
++ },
++ { // 10
++ .luma = {10, QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu},
++ .chroma = {10, QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu}
++ }
++
++};
++
++static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici)
++{
++ const unsigned int n = ici->n;
++ const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3; // Round down to word
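++    // e.g. 12 QPUs sharing a 96kB gptr gives q1_size = 8192 bytes per queue;
++    // max_fill then leaves min_gap bytes of headroom at the end of each queue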
++
++ ipe->n = n;
++ ipe->max_fill = q1_size - ipe->min_gap;
++ for(unsigned int i = 0; i < n; i++) {
++ HEVCRpiInterPredQ * const q = ipe->q + i;
++ q->qpu_mc_curr = q->qpu_mc_base =
++ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size);
++ q->code_setup = qpu_fn(ici->setup_fns[i]);
++ q->code_sync = qpu_fn(ici->sync_fns[i]);
++ q->code_exit = qpu_fn(ici->exit_fns[i]);
++ }
++}
++
++static void rpi_hevc_qpu_set_fns(HEVCRpiContext * const s, const unsigned int bit_depth)
++{
++ av_assert0(bit_depth >= 8 && bit_depth <= 16);
++
++ rpi_hevc_qpu_init_fn(&s->qpu, bit_depth);
++}
++
++// Unsigned Trivial MOD
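++// (only valid for x < 2*n, which holds for the single-step increments used here)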
++static inline unsigned int utmod(const unsigned int x, const unsigned int n)
++{
++ return x >= n ? x - n : x;
++}
++
++// returns pq->job_n++
++static inline unsigned int pass_queue_inc_job_n(HEVCRpiPassQueue * const pq)
++{
++ unsigned int const x2 = pq->job_n;
++ pq->job_n = utmod(x2 + 1, RPI_MAX_JOBS);
++ return x2;
++}
++
++static void pass_queue_init(HEVCRpiPassQueue * const pq, HEVCRpiContext * const s, HEVCRpiWorkerFn * const worker, sem_t * const psem_out, const int n)
++{
++ pq->terminate = 0;
++ pq->job_n = 0;
++ pq->context = s;
++ pq->worker = worker;
++ pq->psem_out = psem_out;
++ pq->pass_n = n;
++ pq->started = 0;
++ sem_init(&pq->sem_in, 0, 0);
++}
++
++static void pass_queue_kill(HEVCRpiPassQueue * const pq)
++{
++ sem_destroy(&pq->sem_in);
++}
++
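++// Wait on a sem, restarting if interrupted by a signal (sem_wait can fail
++// with EINTR); any other failure is treated as fatal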
++static inline void rpi_sem_wait(sem_t * const sem)
++{
++ while (sem_wait(sem) != 0) {
++ av_assert0(errno == EINTR);
++ }
++}
++
++static void pass_queue_submit_job(HEVCRpiPassQueue * const pq)
++{
++ sem_post(&pq->sem_in);
++}
++
++static inline void pass_queue_do_all(HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++ // Do the various passes - common with the worker code
++ for (unsigned int i = 0; i != RPI_PASSES; ++i) {
++ s->passq[i].worker(s, jb);
++ }
++}
++
++
++#if 0
++static void dump_jbc(const HEVCRpiJobCtl *const jbc, const char * const func)
++{
++ int x;
++ sem_getvalue((sem_t *)&jbc->sem_out, &x);
++ printf("%s: jbc: in=%d, out=%d, sum=%d\n", func, jbc->offload_in, jbc->offload_out, x);
++}
++#endif
++
++
++static HEVCRpiJob * job_alloc(HEVCRpiJobCtl * const jbc, HEVCRpiLocalContext * const lc)
++{
++ HEVCRpiJob * jb;
++ HEVCRpiJobGlobal * const jbg = jbc->jbg;
++
++ pthread_mutex_lock(&jbg->lock);
++ // Check local 1st
++ if ((jb = jbc->jb1) != NULL)
++ {
++ // Only 1 - very easy :-)
++ jbc->jb1 = NULL;
++ }
++ else
++ {
++ // Now look for global free chain
++ if ((jb = jbg->free1) != NULL)
++ {
++ // Found one - unlink it
++ jbg->free1 = jb->next;
++ jb->next = NULL;
++ }
++ else
++ {
++ // Out of places to look - wait for one to become free - add to Qs
++
++ // Global
++ // If "good" lc then add after the last "good" el in the chain
++ // otherwise add to the tail
++ if (jbg->wait_tail == NULL || jbg->wait_tail->last_progress_good || !lc->last_progress_good)
++ {
++ // Add to end as we had to wait last time or wait Q empty
++ if ((lc->jw_prev = jbg->wait_tail) == NULL)
++ jbg->wait_head = lc;
++ else
++ lc->jw_prev->jw_next = lc;
++ lc->jw_next = NULL;
++ jbg->wait_tail = lc;
++ }
++ else
++ {
++ // This is a "good" lc that we need to poke into the middle
++ // of the Q
++ // We know that the Q isn't empty and there is at least one
++            // !last_progress_good el in it from the previous test
++
++ HEVCRpiLocalContext * const p = jbg->wait_good; // Insert after
++
++ if (p == NULL)
++ {
++ // No current good els - add to head
++ lc->jw_next = jbg->wait_head;
++ jbg->wait_head = lc;
++ }
++ else
++ {
++ lc->jw_next = p->jw_next;
++ p->jw_next = lc;
++ }
++
++ lc->jw_next->jw_prev = lc;
++ lc->jw_prev = p;
++ }
++
++ // If "good" then we are now the last good waiting el
++ if (lc->last_progress_good)
++ jbg->wait_good = lc;
++
++ // Local
++ if ((lc->ljw_prev = jbc->lcw_tail) == NULL)
++ jbc->lcw_head = lc;
++ else
++ lc->ljw_prev->ljw_next = lc;
++ lc->ljw_next = NULL;
++ jbc->lcw_tail = lc;
++ }
++ }
++
++ pthread_mutex_unlock(&jbg->lock);
++
++ if (jb == NULL) // Need to wait
++ {
++ rpi_sem_wait(&lc->jw_sem);
++ jb = lc->jw_job; // Set by free code
++ }
++
++ return jb;
++}
++
++
++static void job_free(HEVCRpiJobCtl * const jbc0, HEVCRpiJob * const jb)
++{
++ HEVCRpiJobGlobal * const jbg = jbc0->jbg; // This jbc only used to find jbg so we can get the lock
++ HEVCRpiJobCtl * jbc = jb->jbc_local;
++ HEVCRpiLocalContext * lc = NULL;
++
++ pthread_mutex_lock(&jbg->lock);
++
++ if (jbc != NULL)
++ {
++ av_assert1(jbc->jb1 == NULL);
++
++ // Release to Local if nothing waiting there
++ if ((lc = jbc->lcw_head) == NULL)
++ jbc->jb1 = jb;
++ }
++ else
++ {
++ // Release to global if nothing waiting there
++ if ((lc = jbg->wait_head) == NULL)
++ {
++ jb->next = jbg->free1;
++ jbg->free1 = jb;
++ }
++ else
++ {
++            // ? seems somehow mildly ugly...
++ jbc = lc->context->jbc;
++ }
++ }
++
++ if (lc != NULL)
++ {
++ // Something was waiting
++
++ // Unlink
++ // Global
++ if (lc->jw_next == NULL)
++ jbg->wait_tail = lc->jw_prev;
++ else
++ lc->jw_next->jw_prev = lc->jw_prev;
++
++ if (lc->jw_prev == NULL)
++ jbg->wait_head = lc->jw_next;
++ else
++ lc->jw_prev->jw_next = lc->jw_next;
++
++ // Local
++ if (lc->ljw_next == NULL)
++ jbc->lcw_tail = lc->ljw_prev;
++ else
++ lc->ljw_next->ljw_prev = lc->ljw_prev;
++
++ if (lc->ljw_prev == NULL)
++ jbc->lcw_head = lc->ljw_next;
++ else
++ lc->ljw_prev->ljw_next = lc->ljw_next;
++
++ // Update good if required
++ if (jbg->wait_good == lc)
++ jbg->wait_good = lc->jw_prev;
++
++ // Prod
++ lc->jw_job = jb;
++ sem_post(&lc->jw_sem);
++ }
++
++ pthread_mutex_unlock(&jbg->lock);
++}
++
++static void job_lc_kill(HEVCRpiLocalContext * const lc)
++{
++ sem_destroy(&lc->jw_sem);
++}
++
++static void job_lc_init(HEVCRpiLocalContext * const lc)
++{
++ lc->jw_next = NULL;
++ lc->jw_prev = NULL;
++ lc->ljw_next = NULL;
++ lc->ljw_prev = NULL;
++ lc->jw_job = NULL;
++ sem_init(&lc->jw_sem, 0, 0);
++}
++
++// Returns:
++// 0 if we have waited for MV or expect to wait for recon
++// 1 if we haven't waited for MV & do not need to wait for recon
++static int progress_good(const HEVCRpiContext *const s, const HEVCRpiJob * const jb)
++{
++ if (jb->waited) // reset by rpi_begin
++ return 0;
++ for (unsigned int i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i)
++ {
++ if (jb->progress_req[i] >= 0 && s->DPB[i].tf.progress != NULL &&
++ ((volatile int *)(s->DPB[i].tf.progress->data))[0] < jb->progress_req[i])
++ return 0;
++ }
++ return 1;
++}
++
++// Submit job if it is full (indicated by having ctu_ts_last set >= 0)
++static inline void worker_submit_job(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc)
++{
++ HEVCRpiJobCtl *const jbc = s->jbc;
++ HEVCRpiJob * const jb = lc->jb0;
++
++ av_assert1(jb != NULL);
++
++ if (jb->ctu_ts_last < 0) {
++ return;
++ }
++
++ lc->last_progress_good = progress_good(s, jb);
++ jb->waited = !lc->last_progress_good;
++ lc->jb0 = NULL;
++
++ if (s->offload_recon)
++ {
++ pthread_mutex_lock(&jbc->in_lock);
++ jbc->offloadq[jbc->offload_in] = jb;
++ jbc->offload_in = utmod(jbc->offload_in + 1, RPI_MAX_JOBS);
++ pthread_mutex_unlock(&jbc->in_lock);
++
++ pass_queue_submit_job(s->passq + 0); // Consumes job eventually
++ }
++ else
++ {
++ pass_queue_do_all(s, jb); // Consumes job before return
++ }
++}
++
++
++// Call worker_pass0_ready to wait until the s->pass0_job slot becomes
++// available to receive the next job.
++//
++// Now safe against multiple callers - needed for tiles
++// "normal" and WPP will only call here one at a time
++static inline void worker_pass0_ready(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++ HEVCRpiJobCtl * const jbc = s->jbc;
++
++ // It is legit for us to already have a job allocated - do nothing in this case
++ if (lc->jb0 != NULL)
++ return;
++
++ if (s->offload_recon)
++ rpi_sem_wait(&jbc->sem_out); // This sem will stop this frame grabbing too much
++
++ lc->jb0 = job_alloc(jbc, lc);
++
++ rpi_begin(s, lc->jb0, lc->ts);
++}
++
++// Free up a job without submission
++static void worker_free(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++ HEVCRpiJobCtl * const jbc = s->jbc;
++ HEVCRpiJob * const jb = lc->jb0;
++
++ if (jb == NULL) {
++ return;
++ }
++
++ lc->jb0 = NULL;
++
++ job_free(jbc, jb);
++
++ // If offload then poke sem_out too
++ if (s->offload_recon) {
++ sem_post(&jbc->sem_out);
++ }
++}
++
++
++// Call this to wait for all jobs to have completed at the end of a frame
++// Slightly icky as there is no clean way to wait for a sem to count up
++// Not reentrant - call on main thread only
++static void worker_wait(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++ HEVCRpiJobCtl * const jbc = s->jbc;
++ int i = 0;
++
++ // We shouldn't reach here with an unsubmitted job
++ av_assert1(lc->jb0 == NULL);
++
++ // If no offload then there can't be anything to wait for
++ if (!s->offload_recon) {
++ return;
++ }
++
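++    // sem_out holds one count per free job slot. If it is below RPI_MAX_JOBS
++    // then jobs are still in flight: claim every slot (blocking until the
++    // outstanding jobs complete and repost), then hand them all back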
++ if (sem_getvalue(&jbc->sem_out, &i) == 0 && i < RPI_MAX_JOBS)
++ {
++ for (i = 0; i != RPI_MAX_JOBS; ++i) {
++ rpi_sem_wait(&jbc->sem_out);
++ }
++ for (i = 0; i != RPI_MAX_JOBS; ++i) {
++ sem_post(&jbc->sem_out);
++ }
++ }
++}
++
++static void * pass_worker(void *arg)
++{
++ HEVCRpiPassQueue *const pq = (HEVCRpiPassQueue *)arg;
++ HEVCRpiContext *const s = pq->context;
++
++ for (;;)
++ {
++ rpi_sem_wait(&pq->sem_in);
++
++ if (pq->terminate)
++ break;
++
++ pq->worker(s, s->jbc->offloadq[pass_queue_inc_job_n(pq)]);
++ // * should really set jb->passes_done here
++
++ sem_post(pq->psem_out);
++ }
++ return NULL;
++}
++
++static void pass_queues_start_all(HEVCRpiContext *const s)
++{
++ unsigned int i;
++ HEVCRpiPassQueue * const pqs = s->passq;
++
++ for (i = 0; i != RPI_PASSES; ++i)
++ {
++ av_assert0(pthread_create(&pqs[i].thread, NULL, pass_worker, pqs + i) == 0);
++ pqs[i].started = 1;
++ }
++}
++
++static void pass_queues_term_all(HEVCRpiContext *const s)
++{
++ unsigned int i;
++ HEVCRpiPassQueue * const pqs = s->passq;
++
++ for (i = 0; i != RPI_PASSES; ++i)
++ pqs[i].terminate = 1;
++ for (i = 0; i != RPI_PASSES; ++i)
++ {
++ if (pqs[i].started)
++ sem_post(&pqs[i].sem_in);
++ }
++ for (i = 0; i != RPI_PASSES; ++i)
++ {
++ if (pqs[i].started) {
++ pthread_join(pqs[i].thread, NULL);
++ pqs[i].started = 0;
++ }
++ }
++}
++
++static void pass_queues_kill_all(HEVCRpiContext *const s)
++{
++ unsigned int i;
++ HEVCRpiPassQueue * const pqs = s->passq;
++
++ for (i = 0; i != RPI_PASSES; ++i)
++ pass_queue_kill(pqs + i);
++}
++
++
++static void worker_pic_free_one(HEVCRpiJob * const jb)
++{
++ // Free coeff stuff - allocation not the same for all buffers
++ HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
++
++ if (cf->s[0].buf != NULL)
++ av_freep(&cf->mptr);
++ if (cf->s[2].buf != NULL)
++ gpu_free(&cf->gptr);
++ memset(cf, 0, sizeof(*cf));
++}
++
++static int worker_pic_alloc_one(HEVCRpiJob * const jb, const unsigned int coeff_count)
++{
++ HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
++
++ if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0)
++ goto fail;
++ cf->s[2].buf = (int16_t *)cf->gptr.arm;
++ cf->s[3].buf = cf->s[2].buf + coeff_count;
++
++ // Must be 64 byte aligned for our zero zapping code so over-allocate &
++ // round
++ if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL)
++ goto fail;
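++    // Align up: adding 63 then clearing the low 6 bits rounds the pointer
++    // to the next 64-byte boundary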
++ cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63);
++ return 0;
++
++fail:
++ av_log(NULL, AV_LOG_ERROR, "%s: Allocation failed\n", __func__);
++ worker_pic_free_one(jb);
++ return -1;
++}
++
++static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf)
++{
++ unsigned int i;
++ for (i = 0; i != 4; ++i) {
++ cf->s[i].n = 0;
++ }
++}
++
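++// Claim n coefficients from buffer buf_no. Buffers 0-2 fill upwards from
++// their base; buffer 3 fills downwards from its base, which sits at the top
++// of the shared GPU allocation (see worker_pic_alloc_one)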
++int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n)
++{
++ HEVCRpiCoeffEnv *const cfe = jb->coeffs.s + buf_no;
++ int16_t * const coeffs = (buf_no != 3) ? cfe->buf + cfe->n : cfe->buf - (cfe->n + n);
++ cfe->n += n;
++ return coeffs;
++}
++
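++// Frame-progress protocol: a waiter that finds the target value not yet
++// reached queues a HEVCRpiFrameProgressWait (holding its request and a sem)
++// on the owning frame's state; ff_hevc_rpi_progress_signal_field() walks that
++// queue and posts the sem of every waiter whose request is now satisfied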
++void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++ const HEVCFrame * const ref, const int val, const int field)
++{
++ if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) {
++ HEVCRpiContext *const fs = ref->tf.owner[field]->priv_data;
++ HEVCRpiFrameProgressState * const pstate = fs->progress_states + field;
++ sem_t * sem = NULL;
++
++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
++ if (((volatile int *)ref->tf.progress->data)[field] < val) {
++ HEVCRpiFrameProgressWait * const pwait = &jb->progress_wait;
++
++ av_assert1(pwait->req == -1 && pwait->next == NULL);
++ jb->waited = 1; // Remember that we had to wait for later scheduling
++
++ pwait->req = val;
++ pwait->next = NULL;
++ if (pstate->first == NULL)
++ pstate->first = pwait;
++ else
++ pstate->last->next = pwait;
++ pstate->last = pwait;
++ sem = &pwait->sem;
++ }
++ pthread_mutex_unlock(&pstate->lock);
++
++ if (sem != NULL) {
++ rpi_sem_wait(sem);
++ }
++ }
++}
++
++void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field)
++{
++ HEVCRpiFrameProgressState *const pstate = s->progress_states + field;
++
++ ((int *)s->ref->tf.progress->data)[field] = val;
++
++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
++ {
++ HEVCRpiFrameProgressWait ** ppwait = &pstate->first;
++ HEVCRpiFrameProgressWait * pwait;
++
++ while ((pwait = *ppwait) != NULL) {
++ if (pwait->req > val)
++ {
++ ppwait = &pwait->next;
++ pstate->last = pwait;
++ }
++ else
++ {
++ *ppwait = pwait->next;
++ pwait->req = -1;
++ pwait->next = NULL;
++ sem_post(&pwait->sem);
++ }
++ }
++ }
++ pthread_mutex_unlock(&pstate->lock);
++}
++
++static void ff_hevc_rpi_progress_init_state(HEVCRpiFrameProgressState * const pstate)
++{
++ pstate->first = NULL;
++ pstate->last = NULL;
++ pthread_mutex_init(&pstate->lock, NULL);
++}
++
++static void ff_hevc_rpi_progress_init_wait(HEVCRpiFrameProgressWait * const pwait)
++{
++ pwait->req = -1;
++ pwait->next = NULL;
++ sem_init(&pwait->sem, 0, 0);
++}
++
++static void ff_hevc_rpi_progress_kill_state(HEVCRpiFrameProgressState * const pstate)
++{
++ av_assert1(pstate->first == NULL);
++ pthread_mutex_destroy(&pstate->lock);
++}
++
++static void ff_hevc_rpi_progress_kill_wait(HEVCRpiFrameProgressWait * const pwait)
++{
++ sem_destroy(&pwait->sem);
++}
++
++
++/**
++ * NOTE: Each function hls_foo corresponds to the function foo in the
++ * specification (HLS stands for High Level Syntax).
++ */
++
++/**
++ * Section 5.7
++ */
++
++/* free everything allocated by pic_arrays_init() */
++static void pic_arrays_free(HEVCRpiContext *s)
++{
++#ifdef RPI_DEBLOCK_VPU
++ {
++ int i;
++ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) {
++ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
++
++ if (dvq->vpu_cmds_arm) {
++ gpu_free(&dvq->deblock_vpu_gmem);
++ dvq->vpu_cmds_arm = 0;
++ }
++ }
++ }
++#endif
++ av_freep(&s->sao);
++ av_freep(&s->deblock);
++
++ av_freep(&s->skip_flag);
++ av_freep(&s->tab_ct_depth);
++
++ av_freep(&s->tab_ipm);
++ av_freep(&s->cbf_luma);
++ av_freep(&s->is_pcm);
++
++ av_freep(&s->qp_y_tab);
++ av_freep(&s->tab_slice_address);
++ av_freep(&s->filter_slice_edges);
++
++ av_freep(&s->horizontal_bs);
++ av_freep(&s->vertical_bs);
++
++ av_freep(&s->sh.entry_point_offset);
++ av_freep(&s->sh.size);
++ av_freep(&s->sh.offset);
++
++ av_buffer_pool_uninit(&s->tab_mvf_pool);
++ av_buffer_pool_uninit(&s->rpl_tab_pool);
++}
++
++/* allocate arrays that depend on frame dimensions */
++static int pic_arrays_init(HEVCRpiContext *s, const HEVCRpiSPS *sps)
++{
++ int log2_min_cb_size = sps->log2_min_cb_size;
++ int width = sps->width;
++ int height = sps->height;
++ int pic_size_in_ctb = ((width >> log2_min_cb_size) + 1) *
++ ((height >> log2_min_cb_size) + 1);
++ int ctb_count = sps->ctb_width * sps->ctb_height;
++ int min_pu_size = sps->min_pu_width * sps->min_pu_height;
++
++#ifdef RPI_DEBLOCK_VPU
++ {
++ int i;
++ s->enable_rpi_deblock = !sps->sao_enabled;
++ s->setup_width = (sps->width+15) / 16;
++ s->setup_height = (sps->height+15) / 16;
++ s->uv_setup_width = ( (sps->width >> ctx_hshift(s, 1)) + 15) / 16;
++ s->uv_setup_height = ( (sps->height >> ctx_vshift(s, 1)) + 15) / 16;
++
++ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i)
++ {
++ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
++ const unsigned int cmd_size = (sizeof(*dvq->vpu_cmds_arm) * 3 + 15) & ~15;
++ const unsigned int y_size = (sizeof(*dvq->y_setup_arm) * s->setup_width * s->setup_height + 15) & ~15;
++ const unsigned int uv_size = (sizeof(*dvq->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height + 15) & ~15;
++            const unsigned int total_size = cmd_size + y_size + uv_size;
++ int p_vc;
++ uint8_t * p_arm;
++#if RPI_VPU_DEBLOCK_CACHED
++ gpu_malloc_cached(total_size, &dvq->deblock_vpu_gmem);
++#else
++ gpu_malloc_uncached(total_size, &dvq->deblock_vpu_gmem);
++#endif
++ p_vc = dvq->deblock_vpu_gmem.vc;
++ p_arm = dvq->deblock_vpu_gmem.arm;
++
++ // Zap all
++ memset(p_arm, 0, dvq->deblock_vpu_gmem.numbytes);
++
++ // Subdivide
++ dvq->vpu_cmds_arm = (void*)p_arm;
++ dvq->vpu_cmds_vc = p_vc;
++
++ p_arm += cmd_size;
++ p_vc += cmd_size;
++
++ dvq->y_setup_arm = (void*)p_arm;
++ dvq->y_setup_vc = (void*)p_vc;
++
++ p_arm += y_size;
++ p_vc += y_size;
++
++ dvq->uv_setup_arm = (void*)p_arm;
++ dvq->uv_setup_vc = (void*)p_vc;
++ }
++
++ s->dvq_n = 0;
++ s->dvq = s->dvq_ents + s->dvq_n;
++ }
++#endif
++
++ s->bs_width = (width >> 2) + 1;
++ s->bs_height = (height >> 2) + 1;
++
++ s->sao = av_mallocz_array(ctb_count, sizeof(*s->sao));
++ s->deblock = av_mallocz_array(ctb_count, sizeof(*s->deblock));
++ if (!s->sao || !s->deblock)
++ goto fail;
++
++ s->skip_flag = av_malloc_array(sps->min_cb_height, sps->min_cb_width);
++ s->tab_ct_depth = av_malloc_array(sps->min_cb_height, sps->min_cb_width);
++ if (!s->skip_flag || !s->tab_ct_depth)
++ goto fail;
++
++ s->cbf_luma = av_malloc_array(sps->min_tb_width, sps->min_tb_height);
++ s->tab_ipm = av_mallocz(min_pu_size);
++ s->is_pcm = av_malloc_array(sps->min_pu_width + 1, sps->min_pu_height + 1);
++ if (!s->tab_ipm || !s->cbf_luma || !s->is_pcm)
++ goto fail;
++
++ s->filter_slice_edges = av_mallocz(ctb_count);
++ s->tab_slice_address = av_malloc_array(pic_size_in_ctb,
++ sizeof(*s->tab_slice_address));
++ s->qp_y_tab = av_malloc_array(pic_size_in_ctb,
++ sizeof(*s->qp_y_tab));
++ if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address)
++ goto fail;
++
++ s->horizontal_bs = av_mallocz_array(s->bs_width, s->bs_height);
++ s->vertical_bs = av_mallocz_array(s->bs_width, s->bs_height);
++ if (!s->horizontal_bs || !s->vertical_bs)
++ goto fail;
++
++ s->tab_mvf_pool = av_buffer_pool_init(min_pu_size * sizeof(MvField),
++ av_buffer_allocz);
++ s->rpl_tab_pool = av_buffer_pool_init(ctb_count * sizeof(RefPicListTab),
++ av_buffer_allocz);
++ if (!s->tab_mvf_pool || !s->rpl_tab_pool)
++ goto fail;
++
++ return 0;
++
++fail:
++ pic_arrays_free(s);
++ return AVERROR(ENOMEM);
++}
++
++static void default_pred_weight_table(HEVCRpiContext * const s)
++{
++ unsigned int i;
++ s->sh.luma_log2_weight_denom = 0;
++ s->sh.chroma_log2_weight_denom = 0;
++ for (i = 0; i < s->sh.nb_refs[L0]; i++) {
++ s->sh.luma_weight_l0[i] = 1;
++ s->sh.luma_offset_l0[i] = 0;
++ s->sh.chroma_weight_l0[i][0] = 1;
++ s->sh.chroma_offset_l0[i][0] = 0;
++ s->sh.chroma_weight_l0[i][1] = 1;
++ s->sh.chroma_offset_l0[i][1] = 0;
++ }
++ for (i = 0; i < s->sh.nb_refs[L1]; i++) {
++ s->sh.luma_weight_l1[i] = 1;
++ s->sh.luma_offset_l1[i] = 0;
++ s->sh.chroma_weight_l1[i][0] = 1;
++ s->sh.chroma_offset_l1[i][0] = 0;
++ s->sh.chroma_weight_l1[i][1] = 1;
++ s->sh.chroma_offset_l1[i][1] = 0;
++ }
++}
++
++static int pred_weight_table(HEVCRpiContext *s, GetBitContext *gb)
++{
++ int i = 0;
++ int j = 0;
++ uint8_t luma_weight_l0_flag[16];
++ uint8_t chroma_weight_l0_flag[16];
++ uint8_t luma_weight_l1_flag[16];
++ uint8_t chroma_weight_l1_flag[16];
++ int luma_log2_weight_denom;
++
++ luma_log2_weight_denom = get_ue_golomb_long(gb);
++ if (luma_log2_weight_denom < 0 || luma_log2_weight_denom > 7)
++ av_log(s->avctx, AV_LOG_ERROR, "luma_log2_weight_denom %d is invalid\n", luma_log2_weight_denom);
++ s->sh.luma_log2_weight_denom = av_clip_uintp2(luma_log2_weight_denom, 3);
++ if (ctx_cfmt(s) != 0) {
++ int delta = get_se_golomb(gb);
++ s->sh.chroma_log2_weight_denom = av_clip_uintp2(s->sh.luma_log2_weight_denom + delta, 3);
++ }
++
++ for (i = 0; i < s->sh.nb_refs[L0]; i++) {
++ luma_weight_l0_flag[i] = get_bits1(gb);
++ if (!luma_weight_l0_flag[i]) {
++ s->sh.luma_weight_l0[i] = 1 << s->sh.luma_log2_weight_denom;
++ s->sh.luma_offset_l0[i] = 0;
++ }
++ }
++ if (ctx_cfmt(s) != 0) {
++ for (i = 0; i < s->sh.nb_refs[L0]; i++)
++ chroma_weight_l0_flag[i] = get_bits1(gb);
++ } else {
++ for (i = 0; i < s->sh.nb_refs[L0]; i++)
++ chroma_weight_l0_flag[i] = 0;
++ }
++ for (i = 0; i < s->sh.nb_refs[L0]; i++) {
++ if (luma_weight_l0_flag[i]) {
++ int delta_luma_weight_l0 = get_se_golomb(gb);
++ s->sh.luma_weight_l0[i] = (1 << s->sh.luma_log2_weight_denom) + delta_luma_weight_l0;
++ s->sh.luma_offset_l0[i] = get_se_golomb(gb);
++ }
++ if (chroma_weight_l0_flag[i]) {
++ for (j = 0; j < 2; j++) {
++ int delta_chroma_weight_l0 = get_se_golomb(gb);
++ int delta_chroma_offset_l0 = get_se_golomb(gb);
++
++ if ( (int8_t)delta_chroma_weight_l0 != delta_chroma_weight_l0
++ || delta_chroma_offset_l0 < -(1<<17) || delta_chroma_offset_l0 > (1<<17)) {
++ return AVERROR_INVALIDDATA;
++ }
++
++ s->sh.chroma_weight_l0[i][j] = (1 << s->sh.chroma_log2_weight_denom) + delta_chroma_weight_l0;
++ s->sh.chroma_offset_l0[i][j] = av_clip((delta_chroma_offset_l0 - ((128 * s->sh.chroma_weight_l0[i][j])
++ >> s->sh.chroma_log2_weight_denom) + 128), -128, 127);
++ }
++ } else {
++ s->sh.chroma_weight_l0[i][0] = 1 << s->sh.chroma_log2_weight_denom;
++ s->sh.chroma_offset_l0[i][0] = 0;
++ s->sh.chroma_weight_l0[i][1] = 1 << s->sh.chroma_log2_weight_denom;
++ s->sh.chroma_offset_l0[i][1] = 0;
++ }
++ }
++ if (s->sh.slice_type == HEVC_SLICE_B) {
++ for (i = 0; i < s->sh.nb_refs[L1]; i++) {
++ luma_weight_l1_flag[i] = get_bits1(gb);
++ if (!luma_weight_l1_flag[i]) {
++ s->sh.luma_weight_l1[i] = 1 << s->sh.luma_log2_weight_denom;
++ s->sh.luma_offset_l1[i] = 0;
++ }
++ }
++ if (ctx_cfmt(s) != 0) {
++ for (i = 0; i < s->sh.nb_refs[L1]; i++)
++ chroma_weight_l1_flag[i] = get_bits1(gb);
++ } else {
++ for (i = 0; i < s->sh.nb_refs[L1]; i++)
++ chroma_weight_l1_flag[i] = 0;
++ }
++ for (i = 0; i < s->sh.nb_refs[L1]; i++) {
++ if (luma_weight_l1_flag[i]) {
++ int delta_luma_weight_l1 = get_se_golomb(gb);
++ s->sh.luma_weight_l1[i] = (1 << s->sh.luma_log2_weight_denom) + delta_luma_weight_l1;
++ s->sh.luma_offset_l1[i] = get_se_golomb(gb);
++ }
++ if (chroma_weight_l1_flag[i]) {
++ for (j = 0; j < 2; j++) {
++ int delta_chroma_weight_l1 = get_se_golomb(gb);
++ int delta_chroma_offset_l1 = get_se_golomb(gb);
++
++ if ( (int8_t)delta_chroma_weight_l1 != delta_chroma_weight_l1
++ || delta_chroma_offset_l1 < -(1<<17) || delta_chroma_offset_l1 > (1<<17)) {
++ return AVERROR_INVALIDDATA;
++ }
++
++ s->sh.chroma_weight_l1[i][j] = (1 << s->sh.chroma_log2_weight_denom) + delta_chroma_weight_l1;
++ s->sh.chroma_offset_l1[i][j] = av_clip((delta_chroma_offset_l1 - ((128 * s->sh.chroma_weight_l1[i][j])
++ >> s->sh.chroma_log2_weight_denom) + 128), -128, 127);
++ }
++ } else {
++ s->sh.chroma_weight_l1[i][0] = 1 << s->sh.chroma_log2_weight_denom;
++ s->sh.chroma_offset_l1[i][0] = 0;
++ s->sh.chroma_weight_l1[i][1] = 1 << s->sh.chroma_log2_weight_denom;
++ s->sh.chroma_offset_l1[i][1] = 0;
++ }
++ }
++ }
++ return 0;
++}
++
++static int decode_lt_rps(HEVCRpiContext *s, LongTermRPS *rps, GetBitContext *gb)
++{
++ const HEVCRpiSPS *sps = s->ps.sps;
++ int max_poc_lsb = 1 << sps->log2_max_poc_lsb;
++ int prev_delta_msb = 0;
++ unsigned int nb_sps = 0, nb_sh;
++ int i;
++
++ rps->nb_refs = 0;
++ if (!sps->long_term_ref_pics_present_flag)
++ return 0;
++
++ if (sps->num_long_term_ref_pics_sps > 0)
++ nb_sps = get_ue_golomb_long(gb);
++ nb_sh = get_ue_golomb_long(gb);
++
++ if (nb_sps > sps->num_long_term_ref_pics_sps)
++ return AVERROR_INVALIDDATA;
++ if (nb_sh + (uint64_t)nb_sps > FF_ARRAY_ELEMS(rps->poc))
++ return AVERROR_INVALIDDATA;
++
++ rps->nb_refs = nb_sh + nb_sps;
++
++ for (i = 0; i < rps->nb_refs; i++) {
++ uint8_t delta_poc_msb_present;
++
++ if (i < nb_sps) {
++ uint8_t lt_idx_sps = 0;
++
++ if (sps->num_long_term_ref_pics_sps > 1)
++ lt_idx_sps = get_bits(gb, av_ceil_log2(sps->num_long_term_ref_pics_sps));
++
++ rps->poc[i] = sps->lt_ref_pic_poc_lsb_sps[lt_idx_sps];
++ rps->used[i] = sps->used_by_curr_pic_lt_sps_flag[lt_idx_sps];
++ } else {
++ rps->poc[i] = get_bits(gb, sps->log2_max_poc_lsb);
++ rps->used[i] = get_bits1(gb);
++ }
++
++ delta_poc_msb_present = get_bits1(gb);
++ if (delta_poc_msb_present) {
++ int64_t delta = get_ue_golomb_long(gb);
++ int64_t poc;
++
++ if (i && i != nb_sps)
++ delta += prev_delta_msb;
++
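++            // Reconstruct the full POC from its LSB: the current picture's
++            // MSB part is (s->poc - pic_order_cnt_lsb), from which delta
++            // whole LSB cycles (delta * max_poc_lsb) are stepped back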
++ poc = rps->poc[i] + s->poc - delta * max_poc_lsb - s->sh.pic_order_cnt_lsb;
++ if (poc != (int32_t)poc)
++ return AVERROR_INVALIDDATA;
++ rps->poc[i] = poc;
++ prev_delta_msb = delta;
++ }
++ }
++
++ return 0;
++}
++
++static void export_stream_params(AVCodecContext *avctx, const HEVCRpiParamSets *ps,
++ const HEVCRpiSPS *sps)
++{
++ const HEVCRpiVPS *vps = (const HEVCRpiVPS*)ps->vps_list[sps->vps_id]->data;
++ const HEVCWindow *ow = &sps->output_window;
++ unsigned int num = 0, den = 0;
++
++ avctx->pix_fmt = sps->pix_fmt;
++ avctx->coded_width = sps->width;
++ avctx->coded_height = sps->height;
++ avctx->width = sps->width - ow->left_offset - ow->right_offset;
++ avctx->height = sps->height - ow->top_offset - ow->bottom_offset;
++ avctx->has_b_frames = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics;
++ avctx->profile = sps->ptl.general_ptl.profile_idc;
++ avctx->level = sps->ptl.general_ptl.level_idc;
++
++ ff_set_sar(avctx, sps->vui.sar);
++
++ if (sps->vui.video_signal_type_present_flag)
++ avctx->color_range = sps->vui.video_full_range_flag ? AVCOL_RANGE_JPEG
++ : AVCOL_RANGE_MPEG;
++ else
++ avctx->color_range = AVCOL_RANGE_MPEG;
++
++ if (sps->vui.colour_description_present_flag) {
++ avctx->color_primaries = sps->vui.colour_primaries;
++ avctx->color_trc = sps->vui.transfer_characteristic;
++ avctx->colorspace = sps->vui.matrix_coeffs;
++ } else {
++ avctx->color_primaries = AVCOL_PRI_UNSPECIFIED;
++ avctx->color_trc = AVCOL_TRC_UNSPECIFIED;
++ avctx->colorspace = AVCOL_SPC_UNSPECIFIED;
++ }
++
++ if (vps->vps_timing_info_present_flag) {
++ num = vps->vps_num_units_in_tick;
++ den = vps->vps_time_scale;
++ } else if (sps->vui.vui_timing_info_present_flag) {
++ num = sps->vui.vui_num_units_in_tick;
++ den = sps->vui.vui_time_scale;
++ }
++
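++    // e.g. num_units_in_tick = 1001, time_scale = 60000 gives a frame
++    // duration of 1001/60000 s, i.e. a framerate of 60000/1001 (~59.94 fps)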
++ if (num != 0 && den != 0)
++ av_reduce(&avctx->framerate.den, &avctx->framerate.num,
++ num, den, 1 << 30);
++}
++
++static enum AVPixelFormat get_format(HEVCRpiContext *s, const HEVCRpiSPS *sps)
++{
++ enum AVPixelFormat pix_fmts[4], *fmt = pix_fmts;
++
++ // Admit to no h/w formats
++
++ *fmt++ = sps->pix_fmt;
++ *fmt = AV_PIX_FMT_NONE;
++
++    return pix_fmts[0] == AV_PIX_FMT_NONE ? AV_PIX_FMT_NONE : ff_thread_get_format(s->avctx, pix_fmts);
++}
++
++static int is_sps_supported(const HEVCRpiSPS * const sps)
++{
++ return av_rpi_is_sand_format(sps->pix_fmt) &&
++ sps->width <= HEVC_RPI_MAX_WIDTH &&
++ sps->height <= HEVC_RPI_MAX_HEIGHT;
++}
++
++static int set_sps(HEVCRpiContext * const s, const HEVCRpiSPS * const sps,
++ const enum AVPixelFormat pix_fmt)
++{
++ int ret;
++
++ pic_arrays_free(s);
++ s->ps.sps = NULL;
++ s->ps.vps = NULL;
++
++ if (sps == NULL)
++ return 0;
++
++ if (!is_sps_supported(sps))
++ return AVERROR_DECODER_NOT_FOUND;
++
++ ret = pic_arrays_init(s, sps);
++ if (ret < 0)
++ goto fail;
++
++ export_stream_params(s->avctx, &s->ps, sps);
++
++ s->avctx->pix_fmt = pix_fmt;
++
++ ff_hevc_rpi_pred_init(&s->hpc, sps->bit_depth);
++ ff_hevc_rpi_dsp_init (&s->hevcdsp, sps->bit_depth);
++ ff_videodsp_init (&s->vdsp, sps->bit_depth);
++
++ // * We don't support cross_component_prediction_enabled_flag but as that
++ // must be 0 unless we have 4:4:4 there is no point testing for it as we
++ // only deal with sand which is never 4:4:4
++ // [support wouldn't be hard]
++
++ rpi_hevc_qpu_set_fns(s, sps->bit_depth);
++
++ av_freep(&s->sao_pixel_buffer_h[0]);
++ av_freep(&s->sao_pixel_buffer_v[0]);
++
++ if (sps->sao_enabled)
++ {
++ const unsigned int c_count = (ctx_cfmt(s) != 0) ? 3 : 1;
++ unsigned int c_idx;
++ size_t vsize[3] = {0};
++ size_t hsize[3] = {0};
++
++ for(c_idx = 0; c_idx < c_count; c_idx++) {
++ int w = sps->width >> ctx_hshift(s, c_idx);
++ int h = sps->height >> ctx_vshift(s, c_idx);
++            // ctb height & width are a min of 8 so this must be a multiple of 16
++ // so no point rounding up!
++ hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift;
++ vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift;
++ }
++
++ // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2]
++ // when we have plaited chroma
++ s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]);
++ s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]);
++ s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0];
++ s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1];
++ s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0];
++ s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1];
++ }
++
++ s->ps.sps = sps;
++ s->ps.vps = (HEVCRpiVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data;
++
++ return 0;
++
++fail:
++ pic_arrays_free(s);
++ s->ps.sps = NULL;
++ return ret;
++}
++
++static int hls_slice_header(HEVCRpiContext *s)
++{
++ GetBitContext *gb = &s->HEVClc->gb;
++ SliceHeader *sh = &s->sh;
++ int i, ret;
++
++ // Coded parameters
++ sh->first_slice_in_pic_flag = get_bits1(gb);
++ if ((IS_IDR(s) || IS_BLA(s)) && sh->first_slice_in_pic_flag) {
++ s->seq_decode = (s->seq_decode + 1) & 0xff;
++ s->max_ra = INT_MAX;
++ if (IS_IDR(s))
++ ff_hevc_rpi_clear_refs(s);
++ }
++ sh->no_output_of_prior_pics_flag = 0;
++ if (IS_IRAP(s))
++ sh->no_output_of_prior_pics_flag = get_bits1(gb);
++
++ sh->pps_id = get_ue_golomb_long(gb);
++ if (sh->pps_id >= HEVC_MAX_PPS_COUNT || !s->ps.pps_list[sh->pps_id]) {
++ av_log(s->avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", sh->pps_id);
++ return AVERROR_INVALIDDATA;
++ }
++ if (!sh->first_slice_in_pic_flag &&
++ s->ps.pps != (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data) {
++ av_log(s->avctx, AV_LOG_ERROR, "PPS changed between slices.\n");
++ return AVERROR_INVALIDDATA;
++ }
++ s->ps.pps = (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data;
++ if (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos == 1)
++ sh->no_output_of_prior_pics_flag = 1;
++
++ if (s->ps.sps != (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) {
++ const HEVCRpiSPS *sps = (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data;
++ const HEVCRpiSPS *last_sps = s->ps.sps;
++
++ if (last_sps && IS_IRAP(s) && s->nal_unit_type != HEVC_NAL_CRA_NUT) {
++ if (sps->width != last_sps->width || sps->height != last_sps->height ||
++ sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering !=
++ last_sps->temporal_layer[last_sps->max_sub_layers - 1].max_dec_pic_buffering)
++ sh->no_output_of_prior_pics_flag = 0;
++ }
++ ff_hevc_rpi_clear_refs(s);
++
++ ret = set_sps(s, sps, get_format(s, sps));
++ if (ret < 0)
++ return ret;
++
++ s->seq_decode = (s->seq_decode + 1) & 0xff;
++ s->max_ra = INT_MAX;
++ }
++
++ sh->dependent_slice_segment_flag = 0;
++ if (!sh->first_slice_in_pic_flag) {
++ int slice_address_length;
++
++ if (s->ps.pps->dependent_slice_segments_enabled_flag)
++ sh->dependent_slice_segment_flag = get_bits1(gb);
++
++ slice_address_length = av_ceil_log2(s->ps.sps->ctb_width *
++ s->ps.sps->ctb_height);
++ sh->slice_segment_addr = get_bitsz(gb, slice_address_length);
++ if (sh->slice_segment_addr >= s->ps.sps->ctb_width * s->ps.sps->ctb_height) {
++ av_log(s->avctx, AV_LOG_ERROR,
++ "Invalid slice segment address: %u.\n",
++ sh->slice_segment_addr);
++ return AVERROR_INVALIDDATA;
++ }
++
++ if (!sh->dependent_slice_segment_flag) {
++ sh->slice_addr = sh->slice_segment_addr;
++ s->slice_idx++;
++ }
++ } else {
++ sh->slice_segment_addr = sh->slice_addr = 0;
++ s->slice_idx = 0;
++ s->slice_initialized = 0;
++ }
++
++ if (!sh->dependent_slice_segment_flag) {
++ s->slice_initialized = 0;
++
++ for (i = 0; i < s->ps.pps->num_extra_slice_header_bits; i++)
++ skip_bits(gb, 1); // slice_reserved_undetermined_flag[]
++
++ sh->slice_type = get_ue_golomb_long(gb);
++ if (!(sh->slice_type == HEVC_SLICE_I ||
++ sh->slice_type == HEVC_SLICE_P ||
++ sh->slice_type == HEVC_SLICE_B)) {
++ av_log(s->avctx, AV_LOG_ERROR, "Unknown slice type: %d.\n",
++ sh->slice_type);
++ return AVERROR_INVALIDDATA;
++ }
++ if (IS_IRAP(s) && sh->slice_type != HEVC_SLICE_I) {
++ av_log(s->avctx, AV_LOG_ERROR, "Inter slices in an IRAP frame.\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ // when flag is not present, picture is inferred to be output
++ sh->pic_output_flag = 1;
++ if (s->ps.pps->output_flag_present_flag)
++ sh->pic_output_flag = get_bits1(gb);
++
++ if (s->ps.sps->separate_colour_plane_flag)
++ sh->colour_plane_id = get_bits(gb, 2);
++
++ if (!IS_IDR(s)) {
++ int poc, pos;
++
++ sh->pic_order_cnt_lsb = get_bits(gb, s->ps.sps->log2_max_poc_lsb);
++ poc = ff_hevc_rpi_compute_poc(s->ps.sps, s->pocTid0, sh->pic_order_cnt_lsb, s->nal_unit_type);
++ if (!sh->first_slice_in_pic_flag && poc != s->poc) {
++ av_log(s->avctx, AV_LOG_WARNING,
++ "Ignoring POC change between slices: %d -> %d\n", s->poc, poc);
++ if (s->avctx->err_recognition & AV_EF_EXPLODE)
++ return AVERROR_INVALIDDATA;
++ poc = s->poc;
++ }
++ s->poc = poc;
++
++ sh->short_term_ref_pic_set_sps_flag = get_bits1(gb);
++ pos = get_bits_left(gb);
++ if (!sh->short_term_ref_pic_set_sps_flag) {
++ ret = ff_hevc_rpi_decode_short_term_rps(gb, s->avctx, &sh->slice_rps, s->ps.sps, 1);
++ if (ret < 0)
++ return ret;
++
++ sh->short_term_rps = &sh->slice_rps;
++ } else {
++ int numbits, rps_idx;
++
++ if (!s->ps.sps->nb_st_rps) {
++ av_log(s->avctx, AV_LOG_ERROR, "No ref lists in the SPS.\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ numbits = av_ceil_log2(s->ps.sps->nb_st_rps);
++ rps_idx = numbits > 0 ? get_bits(gb, numbits) : 0;
++ sh->short_term_rps = &s->ps.sps->st_rps[rps_idx];
++ }
++ sh->short_term_ref_pic_set_size = pos - get_bits_left(gb);
++
++ pos = get_bits_left(gb);
++ ret = decode_lt_rps(s, &sh->long_term_rps, gb);
++ if (ret < 0) {
++ av_log(s->avctx, AV_LOG_WARNING, "Invalid long term RPS.\n");
++ if (s->avctx->err_recognition & AV_EF_EXPLODE)
++ return AVERROR_INVALIDDATA;
++ }
++ sh->long_term_ref_pic_set_size = pos - get_bits_left(gb);
++
++ if (s->ps.sps->sps_temporal_mvp_enabled_flag)
++ sh->slice_temporal_mvp_enabled_flag = get_bits1(gb);
++ else
++ sh->slice_temporal_mvp_enabled_flag = 0;
++ } else {
++ s->sh.short_term_rps = NULL;
++ s->poc = 0;
++ }
++
++ /* 8.3.1 */
++ if (sh->first_slice_in_pic_flag && s->temporal_id == 0 &&
++ s->nal_unit_type != HEVC_NAL_TRAIL_N &&
++ s->nal_unit_type != HEVC_NAL_TSA_N &&
++ s->nal_unit_type != HEVC_NAL_STSA_N &&
++ s->nal_unit_type != HEVC_NAL_RADL_N &&
++ s->nal_unit_type != HEVC_NAL_RADL_R &&
++ s->nal_unit_type != HEVC_NAL_RASL_N &&
++ s->nal_unit_type != HEVC_NAL_RASL_R)
++ s->pocTid0 = s->poc;
++
++ if (s->ps.sps->sao_enabled) {
++ sh->slice_sample_adaptive_offset_flag[0] = get_bits1(gb);
++ if (ctx_cfmt(s) != 0) {
++ sh->slice_sample_adaptive_offset_flag[1] =
++ sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb);
++ }
++ } else {
++ sh->slice_sample_adaptive_offset_flag[0] = 0;
++ sh->slice_sample_adaptive_offset_flag[1] = 0;
++ sh->slice_sample_adaptive_offset_flag[2] = 0;
++ }
++
++ sh->nb_refs[L0] = sh->nb_refs[L1] = 0;
++ if (sh->slice_type == HEVC_SLICE_P || sh->slice_type == HEVC_SLICE_B) {
++ int nb_refs;
++
++ sh->nb_refs[L0] = s->ps.pps->num_ref_idx_l0_default_active;
++ if (sh->slice_type == HEVC_SLICE_B)
++ sh->nb_refs[L1] = s->ps.pps->num_ref_idx_l1_default_active;
++
++ if (get_bits1(gb)) { // num_ref_idx_active_override_flag
++ sh->nb_refs[L0] = get_ue_golomb_long(gb) + 1;
++ if (sh->slice_type == HEVC_SLICE_B)
++ sh->nb_refs[L1] = get_ue_golomb_long(gb) + 1;
++ }
++ if (sh->nb_refs[L0] > HEVC_MAX_REFS || sh->nb_refs[L1] > HEVC_MAX_REFS) {
++ av_log(s->avctx, AV_LOG_ERROR, "Too many refs: %d/%d.\n",
++ sh->nb_refs[L0], sh->nb_refs[L1]);
++ return AVERROR_INVALIDDATA;
++ }
++
++ sh->rpl_modification_flag[0] = 0;
++ sh->rpl_modification_flag[1] = 0;
++ nb_refs = ff_hevc_rpi_frame_nb_refs(s);
++ if (!nb_refs) {
++ av_log(s->avctx, AV_LOG_ERROR, "Zero refs for a frame with P or B slices.\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ if (s->ps.pps->lists_modification_present_flag && nb_refs > 1) {
++ sh->rpl_modification_flag[0] = get_bits1(gb);
++ if (sh->rpl_modification_flag[0]) {
++ for (i = 0; i < sh->nb_refs[L0]; i++)
++ sh->list_entry_lx[0][i] = get_bits(gb, av_ceil_log2(nb_refs));
++ }
++
++ if (sh->slice_type == HEVC_SLICE_B) {
++ sh->rpl_modification_flag[1] = get_bits1(gb);
++ if (sh->rpl_modification_flag[1] == 1)
++ for (i = 0; i < sh->nb_refs[L1]; i++)
++ sh->list_entry_lx[1][i] = get_bits(gb, av_ceil_log2(nb_refs));
++ }
++ }
++
++ if (sh->slice_type == HEVC_SLICE_B)
++ sh->mvd_l1_zero_flag = get_bits1(gb);
++
++ if (s->ps.pps->cabac_init_present_flag)
++ sh->cabac_init_flag = get_bits1(gb);
++ else
++ sh->cabac_init_flag = 0;
++
++ sh->collocated_ref_idx = 0;
++ if (sh->slice_temporal_mvp_enabled_flag) {
++ sh->collocated_list = L0;
++ if (sh->slice_type == HEVC_SLICE_B)
++ sh->collocated_list = !get_bits1(gb);
++
++ if (sh->nb_refs[sh->collocated_list] > 1) {
++ sh->collocated_ref_idx = get_ue_golomb_long(gb);
++ if (sh->collocated_ref_idx >= sh->nb_refs[sh->collocated_list]) {
++ av_log(s->avctx, AV_LOG_ERROR,
++ "Invalid collocated_ref_idx: %d.\n",
++ sh->collocated_ref_idx);
++ return AVERROR_INVALIDDATA;
++ }
++ }
++ }
++
++ if ((s->ps.pps->weighted_pred_flag && sh->slice_type == HEVC_SLICE_P) ||
++ (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B)) {
++ int ret = pred_weight_table(s, gb);
++ if (ret < 0)
++ return ret;
++ }
++ else
++ {
++ // Give us unit weights
++ default_pred_weight_table(s);
++ }
++
++ sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
++ if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
++ av_log(s->avctx, AV_LOG_ERROR,
++ "Invalid number of merging MVP candidates: %d.\n",
++ sh->max_num_merge_cand);
++ return AVERROR_INVALIDDATA;
++ }
++ }
++
++ sh->slice_qp_delta = get_se_golomb(gb);
++
++ if (s->ps.pps->pic_slice_level_chroma_qp_offsets_present_flag) {
++ sh->slice_cb_qp_offset = get_se_golomb(gb);
++ sh->slice_cr_qp_offset = get_se_golomb(gb);
++ } else {
++ sh->slice_cb_qp_offset = 0;
++ sh->slice_cr_qp_offset = 0;
++ }
++
++ if (s->ps.pps->chroma_qp_offset_list_enabled_flag)
++ sh->cu_chroma_qp_offset_enabled_flag = get_bits1(gb);
++ else
++ sh->cu_chroma_qp_offset_enabled_flag = 0;
++
++ if (s->ps.pps->deblocking_filter_control_present_flag) {
++ int deblocking_filter_override_flag = 0;
++
++ if (s->ps.pps->deblocking_filter_override_enabled_flag)
++ deblocking_filter_override_flag = get_bits1(gb);
++
++ if (deblocking_filter_override_flag) {
++ sh->disable_deblocking_filter_flag = get_bits1(gb);
++ if (!sh->disable_deblocking_filter_flag) {
++ int beta_offset_div2 = get_se_golomb(gb);
++ int tc_offset_div2 = get_se_golomb(gb);
++ if (beta_offset_div2 < -6 || beta_offset_div2 > 6 ||
++ tc_offset_div2 < -6 || tc_offset_div2 > 6) {
++ av_log(s->avctx, AV_LOG_ERROR,
++ "Invalid deblock filter offsets: %d, %d\n",
++ beta_offset_div2, tc_offset_div2);
++ return AVERROR_INVALIDDATA;
++ }
++ sh->beta_offset = beta_offset_div2 * 2;
++ sh->tc_offset = tc_offset_div2 * 2;
++ }
++ } else {
++ sh->disable_deblocking_filter_flag = s->ps.pps->disable_dbf;
++ sh->beta_offset = s->ps.pps->beta_offset;
++ sh->tc_offset = s->ps.pps->tc_offset;
++ }
++ } else {
++ sh->disable_deblocking_filter_flag = 0;
++ sh->beta_offset = 0;
++ sh->tc_offset = 0;
++ }
++
++ if (s->ps.pps->seq_loop_filter_across_slices_enabled_flag &&
++ (sh->slice_sample_adaptive_offset_flag[0] ||
++ sh->slice_sample_adaptive_offset_flag[1] ||
++ !sh->disable_deblocking_filter_flag)) {
++ sh->slice_loop_filter_across_slices_enabled_flag = get_bits1(gb);
++ } else {
++ sh->slice_loop_filter_across_slices_enabled_flag = s->ps.pps->seq_loop_filter_across_slices_enabled_flag;
++ }
++ } else if (!s->slice_initialized) {
++ av_log(s->avctx, AV_LOG_ERROR, "Independent slice segment missing.\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ sh->num_entry_point_offsets = 0;
++ if (s->ps.pps->tiles_enabled_flag || s->ps.pps->entropy_coding_sync_enabled_flag) {
++ unsigned num_entry_point_offsets = get_ue_golomb_long(gb);
++ // It would be possible to bound this more tightly, but this is simpler
++ if (num_entry_point_offsets > get_bits_left(gb)) {
++ av_log(s->avctx, AV_LOG_ERROR, "num_entry_point_offsets %d is invalid\n", num_entry_point_offsets);
++ return AVERROR_INVALIDDATA;
++ }
++
++ sh->num_entry_point_offsets = num_entry_point_offsets;
++ if (sh->num_entry_point_offsets > 0) {
++ int offset_len = get_ue_golomb_long(gb) + 1;
++
++ if (offset_len < 1 || offset_len > 32) {
++ sh->num_entry_point_offsets = 0;
++ av_log(s->avctx, AV_LOG_ERROR, "offset_len %d is invalid\n", offset_len);
++ return AVERROR_INVALIDDATA;
++ }
++
++ av_freep(&sh->entry_point_offset);
++ av_freep(&sh->offset);
++ av_freep(&sh->size);
++ sh->entry_point_offset = av_malloc_array(sh->num_entry_point_offsets, sizeof(unsigned));
++ sh->offset = av_malloc_array(sh->num_entry_point_offsets, sizeof(int));
++ sh->size = av_malloc_array(sh->num_entry_point_offsets, sizeof(int));
++ if (!sh->entry_point_offset || !sh->offset || !sh->size) {
++ sh->num_entry_point_offsets = 0;
++ av_log(s->avctx, AV_LOG_ERROR, "Failed to allocate memory\n");
++ return AVERROR(ENOMEM);
++ }
++ for (i = 0; i < sh->num_entry_point_offsets; i++) {
++ unsigned val = get_bits_long(gb, offset_len);
++ sh->entry_point_offset[i] = val + 1; // +1 to get the size
++ }
++ if (s->threads_number > 1 && (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)) {
++ s->enable_parallel_tiles = 0; // TODO: you can enable tiles in parallel here
++ s->threads_number = 1;
++ } else
++ s->enable_parallel_tiles = 0;
++ } else
++ s->enable_parallel_tiles = 0;
++ }
++
++ if (s->ps.pps->slice_header_extension_present_flag) {
++ unsigned int length = get_ue_golomb_long(gb);
++ if (length*8LL > get_bits_left(gb)) {
++ av_log(s->avctx, AV_LOG_ERROR, "too many slice_header_extension_data_bytes\n");
++ return AVERROR_INVALIDDATA;
++ }
++ for (i = 0; i < length; i++)
++ skip_bits(gb, 8); // slice_header_extension_data_byte
++ }
++
++ // Inferred parameters
++ sh->slice_qp = 26U + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta;
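++ // qp_bd_offset is 6 * (bit_depth - 8), so the permitted range is
++ // [0, 51] for 8-bit streams and [-12, 51] for 10-bit streams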
++ if (sh->slice_qp > 51 ||
++ sh->slice_qp < -s->ps.sps->qp_bd_offset) {
++ av_log(s->avctx, AV_LOG_ERROR,
++ "The slice_qp %d is outside the valid range "
++ "[%d, 51].\n",
++ sh->slice_qp,
++ -s->ps.sps->qp_bd_offset);
++ return AVERROR_INVALIDDATA;
++ }
++
++ sh->slice_ctb_addr_rs = sh->slice_segment_addr;
++
++ if (!s->sh.slice_ctb_addr_rs && s->sh.dependent_slice_segment_flag) {
++ av_log(s->avctx, AV_LOG_ERROR, "Impossible slice segment.\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ if (get_bits_left(gb) < 0) {
++ av_log(s->avctx, AV_LOG_ERROR,
++ "Overread slice header by %d bits\n", -get_bits_left(gb));
++ return AVERROR_INVALIDDATA;
++ }
++
++ s->slice_initialized = 1;
++ return 0;
++}
++
++static void hls_sao_param(const HEVCRpiContext *s, HEVCRpiLocalContext * const lc, const int rx, const int ry)
++{
++ SAOParams * const sao = s->sao + rx + ry * s->ps.sps->ctb_width;
++ int c_idx, i;
++
++ if (s->sh.slice_sample_adaptive_offset_flag[0] ||
++ s->sh.slice_sample_adaptive_offset_flag[1]) {
++ if (lc->ctb_left_flag)
++ {
++ const int sao_merge_left_flag = ff_hevc_rpi_sao_merge_flag_decode(lc);
++ if (sao_merge_left_flag) {
++ *sao = sao[-1];
++ return;
++ }
++ }
++ if (lc->ctb_up_flag)
++ {
++ const int sao_merge_up_flag = ff_hevc_rpi_sao_merge_flag_decode(lc);
++ if (sao_merge_up_flag) {
++ *sao = sao[-(int)s->ps.sps->ctb_width];
++ return;
++ }
++ }
++ }
++
++ for (c_idx = 0; c_idx < (ctx_cfmt(s) != 0 ? 3 : 1); c_idx++) {
++ const unsigned int log2_sao_offset_scale = c_idx == 0 ? s->ps.pps->log2_sao_offset_scale_luma :
++ s->ps.pps->log2_sao_offset_scale_chroma;
++ int offset_abs[4];
++ char offset_sign[4] = {0};
++
++ if (!s->sh.slice_sample_adaptive_offset_flag[c_idx]) {
++ sao->type_idx[c_idx] = SAO_NOT_APPLIED;
++ continue;
++ }
++
++ if (c_idx == 2) {
++ sao->type_idx[2] = sao->type_idx[1];
++ sao->eo_class[2] = sao->eo_class[1];
++ } else {
++ sao->type_idx[c_idx] = ff_hevc_rpi_sao_type_idx_decode(lc);
++ }
++
++ // ** Could quite plausibly use BY22 here - this is all bypass stuff,
++ // though it is only per CTB so not very timing critical
++
++ if (sao->type_idx[c_idx] == SAO_NOT_APPLIED)
++ continue;
++
++ for (i = 0; i < 4; i++)
++ offset_abs[i] = ff_hevc_rpi_sao_offset_abs_decode(s, lc);
++
++ if (sao->type_idx[c_idx] == SAO_BAND) {
++ for (i = 0; i < 4; i++) {
++ if (offset_abs[i] != 0)
++ offset_sign[i] = ff_hevc_rpi_sao_offset_sign_decode(lc);
++ }
++ sao->band_position[c_idx] = ff_hevc_rpi_sao_band_position_decode(lc);
++ } else if (c_idx != 2) {
++ sao->eo_class[c_idx] = ff_hevc_rpi_sao_eo_class_decode(lc);
++ }
++
++ // Inferred parameters
++ sao->offset_val[c_idx][0] = 0;
++ for (i = 0; i < 4; i++) {
++ sao->offset_val[c_idx][i + 1] = offset_abs[i] << log2_sao_offset_scale;
++ if (sao->type_idx[c_idx] == SAO_EDGE) {
++ if (i > 1)
++ sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1];
++ } else if (offset_sign[i]) {
++ sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1];
++ }
++ }
++ }
++}
++
++
++static int hls_cross_component_pred(HEVCRpiLocalContext * const lc, const int idx) {
++ int log2_res_scale_abs_plus1 = ff_hevc_rpi_log2_res_scale_abs(lc, idx);
++
++ if (log2_res_scale_abs_plus1 != 0) {
++ int res_scale_sign_flag = ff_hevc_rpi_res_scale_sign_flag(lc, idx);
++ lc->tu.res_scale_val = (1 << (log2_res_scale_abs_plus1 - 1)) *
++ (1 - 2 * res_scale_sign_flag);
++ } else {
++ lc->tu.res_scale_val = 0;
++ }
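++ // e.g. log2_res_scale_abs_plus1 == 3 with res_scale_sign_flag == 1 gives
++ // res_scale_val = (1 << 2) * (1 - 2) == -4; the chroma residual is then
++ // predicted as res_scale_val/8 of the luma residual (the ">> 3" in the
++ // cross-component paths of hls_transform_unit below)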
++
++
++ return 0;
++}
++
++static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCRpiJob * const jb)
++{
++ return jb->intra.cmds + jb->intra.n++;
++}
++
++static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx)
++{
++ // If rpi is enabled the frame is in sand format - U & V are both done on the U call
++ if (c_idx <= 1)
++ {
++ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0);
++ cmd->type = RPI_PRED_INTRA;
++ cmd->size = log2_trafo_size;
++ cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right;
++ cmd->c_idx = c_idx;
++ cmd->i_pred.x = x0;
++ cmd->i_pred.y = y0;
++ cmd->i_pred.mode = c_idx ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode;
++ }
++}
++
++static int hls_transform_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0,
++ int xBase, int yBase, int cb_xBase, int cb_yBase,
++ int log2_cb_size, int log2_trafo_size,
++ int blk_idx, int cbf_luma, int *cbf_cb, int *cbf_cr)
++{
++// const int log2_trafo_size_c = log2_trafo_size - s->ps.sps->hshift[1];
++ const int log2_trafo_size_c = log2_trafo_size - ctx_hshift(s, 1);
++ int i;
++
++ if (lc->cu.pred_mode == MODE_INTRA) {
++ int trafo_size = 1 << log2_trafo_size;
++ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0, trafo_size, trafo_size);
++ do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0);
++ }
++
++ if (cbf_luma || cbf_cb[0] || cbf_cr[0] ||
++ (ctx_cfmt(s) == 2 && (cbf_cb[1] || cbf_cr[1]))) {
++ int scan_idx = SCAN_DIAG;
++ int scan_idx_c = SCAN_DIAG;
++ int cbf_chroma = cbf_cb[0] || cbf_cr[0] ||
++ (ctx_cfmt(s) == 2 &&
++ (cbf_cb[1] || cbf_cr[1]));
++
++ if (s->ps.pps->cu_qp_delta_enabled_flag && !lc->tu.is_cu_qp_delta_coded) {
++ lc->tu.cu_qp_delta = ff_hevc_rpi_cu_qp_delta_abs(lc);
++ if (lc->tu.cu_qp_delta != 0)
++ if (ff_hevc_rpi_cu_qp_delta_sign_flag(lc) == 1)
++ lc->tu.cu_qp_delta = -lc->tu.cu_qp_delta;
++ lc->tu.is_cu_qp_delta_coded = 1;
++
++ if (lc->tu.cu_qp_delta < -(26 + s->ps.sps->qp_bd_offset / 2) ||
++ lc->tu.cu_qp_delta > (25 + s->ps.sps->qp_bd_offset / 2)) {
++ av_log(s->avctx, AV_LOG_ERROR,
++ "The cu_qp_delta %d is outside the valid range "
++ "[%d, %d].\n",
++ lc->tu.cu_qp_delta,
++ -(26 + s->ps.sps->qp_bd_offset / 2),
++ (25 + s->ps.sps->qp_bd_offset / 2));
++ return AVERROR_INVALIDDATA;
++ }
++
++ ff_hevc_rpi_set_qPy(s, lc, cb_xBase, cb_yBase, log2_cb_size);
++ }
++
++ if (!lc->tu.is_cu_chroma_qp_offset_coded && cbf_chroma &&
++ !lc->cu.cu_transquant_bypass_flag) {
++ int cu_chroma_qp_offset_flag = ff_hevc_rpi_cu_chroma_qp_offset_flag(lc);
++ if (cu_chroma_qp_offset_flag) {
++ int cu_chroma_qp_offset_idx = 0;
++ if (s->ps.pps->chroma_qp_offset_list_len_minus1 > 0) {
++ cu_chroma_qp_offset_idx = ff_hevc_rpi_cu_chroma_qp_offset_idx(s, lc);
++ av_log(s->avctx, AV_LOG_ERROR,
++ "cu_chroma_qp_offset_idx not yet tested.\n");
++ }
++ lc->tu.cu_qp_offset_cb = s->ps.pps->cb_qp_offset_list[cu_chroma_qp_offset_idx];
++ lc->tu.cu_qp_offset_cr = s->ps.pps->cr_qp_offset_list[cu_chroma_qp_offset_idx];
++ }
++ lc->tu.is_cu_chroma_qp_offset_coded = 1;
++ }
++
++ if (lc->cu.pred_mode == MODE_INTRA && log2_trafo_size < 4) {
++ if (lc->tu.intra_pred_mode >= 6 &&
++ lc->tu.intra_pred_mode <= 14) {
++ scan_idx = SCAN_VERT;
++ } else if (lc->tu.intra_pred_mode >= 22 &&
++ lc->tu.intra_pred_mode <= 30) {
++ scan_idx = SCAN_HORIZ;
++ }
++
++ if (lc->tu.intra_pred_mode_c >= 6 &&
++ lc->tu.intra_pred_mode_c <= 14) {
++ scan_idx_c = SCAN_VERT;
++ } else if (lc->tu.intra_pred_mode_c >= 22 &&
++ lc->tu.intra_pred_mode_c <= 30) {
++ scan_idx_c = SCAN_HORIZ;
++ }
++ }
++
++ lc->tu.cross_pf = 0;
++
++ if (cbf_luma)
++ ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0, log2_trafo_size, scan_idx, 0);
++ if (ctx_cfmt(s) != 0 && (log2_trafo_size > 2 || ctx_cfmt(s) == 3)) {
++ const int trafo_size_h = 1 << (log2_trafo_size_c + ctx_hshift(s, 1));
++ const int trafo_size_v = 1 << (log2_trafo_size_c + ctx_vshift(s, 1));
++ lc->tu.cross_pf = (s->ps.pps->cross_component_prediction_enabled_flag && cbf_luma &&
++ (lc->cu.pred_mode == MODE_INTER ||
++ (lc->tu.chroma_mode_c == 4)));
++
++ if (lc->tu.cross_pf) {
++ hls_cross_component_pred(lc, 0);
++ }
++ for (i = 0; i < (ctx_cfmt(s) == 2 ? 2 : 1); i++) {
++ if (lc->cu.pred_mode == MODE_INTRA) {
++ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1);
++ }
++ if (cbf_cb[i])
++ ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0 + (i << log2_trafo_size_c),
++ log2_trafo_size_c, scan_idx_c, 1);
++ else
++ if (lc->tu.cross_pf) {
++ const ptrdiff_t stride = frame_stride1(s->frame, 1);
++ const int hshift = ctx_hshift(s, 1);
++ const int vshift = ctx_vshift(s, 1);
++ int16_t * const coeffs_y = (int16_t*)lc->edge_emu_buffer;
++ int16_t * const coeffs = (int16_t*)lc->edge_emu_buffer2;
++ int size = 1 << log2_trafo_size_c;
++
++ uint8_t *dst = &s->frame->data[1][(y0 >> vshift) * stride +
++ ((x0 >> hshift) << s->ps.sps->pixel_shift)];
++ for (i = 0; i < (size * size); i++) {
++ coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
++ }
++ s->hevcdsp.add_residual[log2_trafo_size_c-2](dst, coeffs, stride);
++ }
++ }
++
++ if (lc->tu.cross_pf) {
++ hls_cross_component_pred(lc, 1);
++ }
++ for (i = 0; i < (ctx_cfmt(s) == 2 ? 2 : 1); i++) {
++ if (lc->cu.pred_mode == MODE_INTRA) {
++ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v);
++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2);
++ }
++ if (cbf_cr[i])
++ ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0 + (i << log2_trafo_size_c),
++ log2_trafo_size_c, scan_idx_c, 2);
++ else
++ if (lc->tu.cross_pf) {
++ ptrdiff_t stride = frame_stride1(s->frame, 2);
++ const int hshift = ctx_hshift(s, 2);
++ const int vshift = ctx_vshift(s, 2);
++ int16_t *coeffs_y = (int16_t*)lc->edge_emu_buffer;
++ int16_t *coeffs = (int16_t*)lc->edge_emu_buffer2;
++ const int size = 1 << log2_trafo_size_c;
++
++ uint8_t *dst = &s->frame->data[2][(y0 >> vshift) * stride +
++ ((x0 >> hshift) << s->ps.sps->pixel_shift)];
++ for (i = 0; i < (size * size); i++) {
++ coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
++ }
++ s->hevcdsp.add_residual[log2_trafo_size_c-2](dst, coeffs, stride);
++ }
++ }
++ } else if (ctx_cfmt(s) != 0 && blk_idx == 3) {
++ int trafo_size_h = 1 << (log2_trafo_size + 1);
++ int trafo_size_v = 1 << (log2_trafo_size + ctx_vshift(s, 1));
++ for (i = 0; i < (ctx_cfmt(s) == 2 ? 2 : 1); i++) {
++ if (lc->cu.pred_mode == MODE_INTRA) {
++ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase + (i << log2_trafo_size),
++ trafo_size_h, trafo_size_v);
++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1);
++ }
++ if (cbf_cb[i])
++ ff_hevc_rpi_hls_residual_coding(s, lc, xBase, yBase + (i << log2_trafo_size),
++ log2_trafo_size, scan_idx_c, 1);
++ }
++ for (i = 0; i < (ctx_cfmt(s) == 2 ? 2 : 1); i++) {
++ if (lc->cu.pred_mode == MODE_INTRA) {
++ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase + (i << log2_trafo_size),
++ trafo_size_h, trafo_size_v);
++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2);
++ }
++ if (cbf_cr[i])
++ ff_hevc_rpi_hls_residual_coding(s, lc, xBase, yBase + (i << log2_trafo_size),
++ log2_trafo_size, scan_idx_c, 2);
++ }
++ }
++ } else if (ctx_cfmt(s) != 0 && lc->cu.pred_mode == MODE_INTRA) {
++ if (log2_trafo_size > 2 || ctx_cfmt(s) == 3) {
++ int trafo_size_h = 1 << (log2_trafo_size_c + ctx_hshift(s, 1));
++ int trafo_size_v = 1 << (log2_trafo_size_c + ctx_vshift(s, 1));
++ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0, trafo_size_h, trafo_size_v);
++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0, 1);
++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0, 2);
++ if (ctx_cfmt(s) == 2) {
++ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0 + (1 << log2_trafo_size_c),
++ trafo_size_h, trafo_size_v);
++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1);
++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2);
++ }
++ } else if (blk_idx == 3) {
++ int trafo_size_h = 1 << (log2_trafo_size + 1);
++ int trafo_size_v = 1 << (log2_trafo_size + ctx_vshift(s, 1));
++ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase,
++ trafo_size_h, trafo_size_v);
++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase, 1);
++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase, 2);
++ if (ctx_cfmt(s) == 2) {
++ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase + (1 << (log2_trafo_size)),
++ trafo_size_h, trafo_size_v);
++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1);
++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2);
++ }
++ }
++ }
++
++ return 0;
++}
++
++static void set_deblocking_bypass(const HEVCRpiContext * const s, const int x0, const int y0, const int log2_cb_size)
++{
++ int cb_size = 1 << log2_cb_size;
++ int log2_min_pu_size = s->ps.sps->log2_min_pu_size;
++
++ int min_pu_width = s->ps.sps->min_pu_width;
++ int x_end = FFMIN(x0 + cb_size, s->ps.sps->width);
++ int y_end = FFMIN(y0 + cb_size, s->ps.sps->height);
++ int i, j;
++
++ for (j = (y0 >> log2_min_pu_size); j < (y_end >> log2_min_pu_size); j++)
++ for (i = (x0 >> log2_min_pu_size); i < (x_end >> log2_min_pu_size); i++)
++ s->is_pcm[i + j * min_pu_width] = 2;
++}
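++// Marks the area in is_pcm so deblocking can skip it; this path is used both
++// for transquant-bypass CUs and for PCM blocks with the loop filter disabled.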
++
++static int hls_transform_tree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0,
++ int xBase, int yBase, int cb_xBase, int cb_yBase,
++ int log2_cb_size, int log2_trafo_size,
++ int trafo_depth, int blk_idx,
++ const int *base_cbf_cb, const int *base_cbf_cr)
++{
++ uint8_t split_transform_flag;
++ int cbf_cb[2];
++ int cbf_cr[2];
++ int ret;
++
++ cbf_cb[0] = base_cbf_cb[0];
++ cbf_cb[1] = base_cbf_cb[1];
++ cbf_cr[0] = base_cbf_cr[0];
++ cbf_cr[1] = base_cbf_cr[1];
++
++ if (lc->cu.intra_split_flag) {
++ if (trafo_depth == 1) {
++ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[blk_idx];
++ if (ctx_cfmt(s) == 3) {
++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[blk_idx];
++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[blk_idx];
++ } else {
++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0];
++ }
++ }
++ } else {
++ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[0];
++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0];
++ }
++
++ if (log2_trafo_size <= s->ps.sps->log2_max_trafo_size &&
++ log2_trafo_size > s->ps.sps->log2_min_tb_size &&
++ trafo_depth < lc->cu.max_trafo_depth &&
++ !(lc->cu.intra_split_flag && trafo_depth == 0)) {
++ split_transform_flag = ff_hevc_rpi_split_transform_flag_decode(lc, log2_trafo_size);
++ } else {
++ int inter_split = s->ps.sps->max_transform_hierarchy_depth_inter == 0 &&
++ lc->cu.pred_mode == MODE_INTER &&
++ lc->cu.part_mode != PART_2Nx2N &&
++ trafo_depth == 0;
++
++ split_transform_flag = log2_trafo_size > s->ps.sps->log2_max_trafo_size ||
++ (lc->cu.intra_split_flag && trafo_depth == 0) ||
++ inter_split;
++ }
++
++ if (ctx_cfmt(s) != 0 && (log2_trafo_size > 2 || ctx_cfmt(s) == 3)) {
++ if (trafo_depth == 0 || cbf_cb[0]) {
++ cbf_cb[0] = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth);
++ if (ctx_cfmt(s) == 2 && (!split_transform_flag || log2_trafo_size == 3)) {
++ cbf_cb[1] = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth);
++ }
++ }
++
++ if (trafo_depth == 0 || cbf_cr[0]) {
++ cbf_cr[0] = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth);
++ if (ctx_cfmt(s) == 2 && (!split_transform_flag || log2_trafo_size == 3)) {
++ cbf_cr[1] = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth);
++ }
++ }
++ }
++
++ if (split_transform_flag) {
++ const int trafo_size_split = 1 << (log2_trafo_size - 1);
++ const int x1 = x0 + trafo_size_split;
++ const int y1 = y0 + trafo_size_split;
++
++#define SUBDIVIDE(x, y, idx) \
++do { \
++ ret = hls_transform_tree(s, lc, x, y, x0, y0, cb_xBase, cb_yBase, log2_cb_size, \
++ log2_trafo_size - 1, trafo_depth + 1, idx, \
++ cbf_cb, cbf_cr); \
++ if (ret < 0) \
++ return ret; \
++} while (0)
++
++ SUBDIVIDE(x0, y0, 0);
++ SUBDIVIDE(x1, y0, 1);
++ SUBDIVIDE(x0, y1, 2);
++ SUBDIVIDE(x1, y1, 3);
++
++#undef SUBDIVIDE
++ } else {
++ int min_tu_size = 1 << s->ps.sps->log2_min_tb_size;
++ int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
++ int min_tu_width = s->ps.sps->min_tb_width;
++ int cbf_luma = 1;
++
++ if (lc->cu.pred_mode == MODE_INTRA || trafo_depth != 0 ||
++ cbf_cb[0] || cbf_cr[0] ||
++ (ctx_cfmt(s) == 2 && (cbf_cb[1] || cbf_cr[1]))) {
++ cbf_luma = ff_hevc_rpi_cbf_luma_decode(lc, trafo_depth);
++ }
++
++ ret = hls_transform_unit(s, lc, x0, y0, xBase, yBase, cb_xBase, cb_yBase,
++ log2_cb_size, log2_trafo_size,
++ blk_idx, cbf_luma, cbf_cb, cbf_cr);
++ if (ret < 0)
++ return ret;
++ // TODO: store cbf_luma somewhere else
++ if (cbf_luma) {
++ int i, j;
++ for (i = 0; i < (1 << log2_trafo_size); i += min_tu_size)
++ for (j = 0; j < (1 << log2_trafo_size); j += min_tu_size) {
++ int x_tu = (x0 + j) >> log2_min_tu_size;
++ int y_tu = (y0 + i) >> log2_min_tu_size;
++ s->cbf_luma[y_tu * min_tu_width + x_tu] = 1;
++ }
++ }
++ if (!s->sh.disable_deblocking_filter_flag) {
++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_trafo_size);
++ if (s->ps.pps->transquant_bypass_enable_flag &&
++ lc->cu.cu_transquant_bypass_flag)
++ set_deblocking_bypass(s, x0, y0, log2_trafo_size);
++ }
++ }
++ return 0;
++}
++
++
++static int pcm_extract(const HEVCRpiContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size)
++{
++ GetBitContext gb;
++ int ret;
++
++ ret = init_get_bits(&gb, pcm, length);
++ if (ret < 0)
++ return ret;
++
++ s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0),
++ frame_stride1(s->frame, 0),
++ cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth);
++
++ s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> ctx_hshift(s, 1), y0 >> ctx_vshift(s, 1)),
++ s->frame->linesize[1],
++ cb_size >> ctx_hshift(s, 1),
++ cb_size >> ctx_vshift(s, 1),
++ &gb, s->ps.sps->pcm.bit_depth_chroma);
++
++ return 0;
++}
++
++
++// x * 2^(y*2)
++static inline unsigned int xyexp2(const unsigned int x, const unsigned int y)
++{
++ return x << (y * 2);
++}
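++// e.g. xyexp2(8, 3) == 8 << 6 == 512 - the number of bits in an 8x8 block
++// of 8-bit samples, as used for the PCM length calculation below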
++
++static int hls_pcm_sample(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, unsigned int log2_cb_size)
++{
++ // Length in bits
++ const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) +
++ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 1)) +
++ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 2));
++
++ const uint8_t * const pcm = skip_bytes(&lc->cc, (length + 7) >> 3);
++
++ if (!s->sh.disable_deblocking_filter_flag)
++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size);
++
++ // Copy coeffs
++ {
++ const int blen = (length + 7) >> 3;
++ // Round allocated bytes up to nearest 32 to avoid alignment confusion
++ // Allocation is in units of int16_t
++ // As we are only using 1 byte per sample and the coeff buffer allows 2 per
++ // sample this rounding doesn't affect the total size we need to allocate for
++ // the coeff buffer
++ int16_t * const coeffs = rpi_alloc_coeff_buf(lc->jb0, 0, ((blen + 31) & ~31) >> 1);
++ memcpy(coeffs, pcm, blen);
++
++ // Our coeff stash assumes that any partially allocated 64-byte lump
++ // is zeroed so make that true.
++ {
++ uint8_t * const eopcm = (uint8_t *)coeffs + blen;
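++ // (-(intptr_t)eopcm & 63) is the byte count up to the next 64-byte boundary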
++ if ((-(intptr_t)eopcm & 63) != 0)
++ memset(eopcm, 0, -(intptr_t)eopcm & 63);
++ }
++
++ // Add command
++ {
++ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0);
++ cmd->type = RPI_PRED_I_PCM;
++ cmd->size = log2_cb_size;
++ cmd->i_pcm.src = coeffs;
++ cmd->i_pcm.x = x0;
++ cmd->i_pcm.y = y0;
++ cmd->i_pcm.src_len = length;
++ }
++ return 0;
++ }
++}
++
++
++static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCFrame * const ref,
++ const Mv * const mv, const int y0, const int height)
++{
++ if (s->threads_type == FF_THREAD_FRAME) {
++ const int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9);
++
++ // Progress has to be attached to current job as the actual wait
++ // is in worker_core which can't use lc
++ int16_t *const pr = lc->jb0->progress_req + ref->dpb_no;
++ if (*pr < y) {
++ *pr = y;
++ }
++ }
++}
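++// The "+ 9" above presumably covers the worst-case sub-pel interpolation
++// read-ahead below the block (the 8-tap luma filter reads up to 4 rows
++// beyond it) plus a safety margin.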
++
++static void hevc_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const int x0, const int y0, const int nPbW,
++ const int nPbH, const int log2_cb_size, const int part_idx,
++ const int merge_idx, MvField * const mv)
++{
++ enum InterPredIdc inter_pred_idc = PRED_L0;
++ int mvp_flag;
++
++ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0, nPbW, nPbH);
++ mv->pred_flag = 0;
++ if (s->sh.slice_type == HEVC_SLICE_B)
++ inter_pred_idc = ff_hevc_rpi_inter_pred_idc_decode(lc, nPbW, nPbH);
++
++ if (inter_pred_idc != PRED_L1) {
++ if (s->sh.nb_refs[L0])
++ mv->ref_idx[0]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L0]);
++
++ mv->pred_flag = PF_L0;
++ ff_hevc_rpi_hls_mvd_coding(lc);
++ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc);
++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size,
++ part_idx, merge_idx, mv, mvp_flag, 0);
++ mv->mv[0].x += lc->pu.mvd.x;
++ mv->mv[0].y += lc->pu.mvd.y;
++ }
++
++ if (inter_pred_idc != PRED_L0) {
++ if (s->sh.nb_refs[L1])
++ mv->ref_idx[1]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L1]);
++
++ if (s->sh.mvd_l1_zero_flag == 1 && inter_pred_idc == PRED_BI) {
++ AV_ZERO32(&lc->pu.mvd);
++ } else {
++ ff_hevc_rpi_hls_mvd_coding(lc);
++ }
++
++ mv->pred_flag += PF_L1;
++ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc);
++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size,
++ part_idx, merge_idx, mv, mvp_flag, 1);
++ mv->mv[1].x += lc->pu.mvd.x;
++ mv->mv[1].y += lc->pu.mvd.y;
++ }
++}
++
++
++static HEVCRpiInterPredQ *
++rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn)
++{
++ HEVCRpiInterPredQ * yp = ipe->q + ipe->curr;
++ HEVCRpiInterPredQ * ypt = yp + 1;
++ for (unsigned int i = 1; i != ipe->n_grp; ++i, ++ypt) {
++ if (ypt->load < yp->load)
++ yp = ypt;
++ }
++
++ yp->load += load_val;
++ ipe->used_grp = 1;
++ yp->qpu_mc_curr->data[-1] = fn; // Link is always the last element of the previous cmd
++
++ return yp;
++}
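++// Note: the queue chosen above is the least loaded within the current group;
++// load_val is an approximate cost (roughly source rows) for the queued block.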
++
++
++static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe)
++{
++ for (unsigned int i = 0; i != ipe->n; ++i) {
++ HEVCRpiInterPredQ * const q = ipe->q + i;
++ const unsigned int qfill = (char *)q->qpu_mc_curr - (char *)q->qpu_mc_base;
++
++ q->qpu_mc_curr->data[-1] = q->code_sync;
++ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(q->qpu_mc_curr->data + 1);
++ q->load = (qfill >> 7); // Have a mild preference for emptier Qs to balance memory usage
++ }
++}
++
++// Returns 0 on success, -1 if Q is dangerously full
++static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe)
++{
++ if (!ipe->used_grp)
++ return 0;
++
++ if ((ipe->curr += ipe->n_grp) >= ipe->n)
++ {
++ ipe->curr = 0;
++ rpi_inter_pred_sync(ipe);
++ }
++ ipe->used = 1;
++ ipe->used_grp = 0;
++
++ for (unsigned int i = 0; i != ipe->n_grp; ++i) {
++ HEVCRpiInterPredQ * const q = ipe->q + i + ipe->curr;
++ if ((char *)q->qpu_mc_curr - (char *)q->qpu_mc_base > ipe->max_fill) {
++ return -1;
++ }
++ }
++ return 0;
++}
++
++static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe)
++{
++ unsigned int i;
++
++ ipe->curr = 0;
++ ipe->used = 0;
++ ipe->used_grp = 0;
++ for (i = 0; i != ipe->n; ++i) {
++ HEVCRpiInterPredQ * const q = ipe->q + i;
++ q->qpu_mc_curr = q->qpu_mc_base;
++ q->load = 0;
++ q->last_l0 = NULL;
++ q->last_l1 = NULL;
++ }
++}
++
++static void rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe,
++ const unsigned int n_max, const unsigned int n_grp,
++ const unsigned int total_size, const unsigned int min_gap)
++{
++ memset(ipe, 0, sizeof(*ipe));
++ av_assert0((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) != NULL);
++ ipe->n_grp = n_grp;
++ ipe->min_gap = min_gap;
++
++ gpu_malloc_cached(total_size, &ipe->gptr);
++}
++
++
++#if RPI_QPU_EMU_Y
++#define get_mc_address_y(f) ((f)->data[0])
++#else
++#define get_mc_address_y(f) get_vc_address_y(f)
++#endif
++#if RPI_QPU_EMU_C
++#define get_mc_address_u(f) ((f)->data[1])
++#else
++#define get_mc_address_u(f) get_vc_address_u(f)
++#endif
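++// In QPU-emulation builds the ARM-side frame pointers are used directly;
++// otherwise the VideoCore bus addresses are handed to the QPU.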
++
++static inline int offset_depth_adj(const HEVCRpiContext *const s, const int wt)
++{
++ return s->ps.sps->high_precision_offsets_enabled_flag ? wt :
++ wt << (s->ps.sps->bit_depth - 8);
++}
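++// e.g. at bit_depth == 10 with high_precision_offsets disabled, an offset
++// of 3 becomes 3 << 2 == 12, scaling it up to full sample precision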
++
++static void
++rpi_pred_y(const HEVCRpiContext *const s, HEVCRpiJob * const jb,
++ const int x0, const int y0,
++ const int nPbW, const int nPbH,
++ const Mv *const mv,
++ const int weight_mul,
++ const int weight_offset,
++ AVFrame *const src_frame)
++{
++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
++ const unsigned int mx = mv->x & 3;
++ const unsigned int my = mv->y & 3;
++ const unsigned int my_mx = (my << 8) | mx;
++ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx;
++ const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame);
++ qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off;
++ const uint32_t wo = PACK2(offset_depth_adj(s, weight_offset) * 2 + 1, weight_mul);
++ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip;
++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
++
++ if (my_mx == 0)
++ {
++ const int x1 = x0 + (mv->x >> 2);
++ const int y1 = y0 + (mv->y >> 2);
++ const int bh = nPbH;
++
++ for (int start_x = 0; start_x < nPbW; start_x += 16)
++ {
++ const int bw = FFMIN(nPbW - start_x, 16);
++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00);
++ qpu_mc_src_t *const src1 = yp->last_l0;
++ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00;
++
++#if RPI_TSTATS
++ {
++ HEVCRpiStats *const ts = &s->tstats;
++ ++ts->y_pred1_x0y0;
++
++ if (nPbW > 8)
++ ++ts->y_pred1_wgt8;
++ else
++ ++ts->y_pred1_wle8;
++
++ if (nPbH > 16)
++ ++ts->y_pred1_hgt16;
++ else
++ ++ts->y_pred1_hle16;
++ }
++#endif
++
++ src1->x = x1 + start_x;
++ src1->y = y1;
++ src1->base = src_vc_address_y;
++ cmd_y->w = bw;
++ cmd_y->h = bh;
++ cmd_y->wo1 = wo;
++ cmd_y->dst_addr = dst_addr + (start_x << xshl);
++ yp->last_l0 = &cmd_y->next_src1;
++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
++ }
++ }
++ else
++ {
++ const int x1_m3 = x0 + (mv->x >> 2) - 3;
++ const int y1_m3 = y0 + (mv->y >> 2) - 3;
++ const unsigned int bh = nPbH;
++ int start_x = 0;
++
++#if 1
++ // As Y-pred operates on two independent 8-wide src blocks we can merge
++ // this pred with the previous one if the previous one is 8 pel wide,
++ // the same height as the current block, immediately to the left of our
++ // current dest block and mono-pred.
++
++ qpu_mc_pred_y_p_t *const last_y8_p = jb->last_y8_p;
++ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr)
++ {
++ const int bw = FFMIN(nPbW, 8);
++ qpu_mc_src_t *const last_y8_src2 = jb->last_y8_l1;
++
++ last_y8_src2->x = x1_m3;
++ last_y8_src2->y = y1_m3;
++ last_y8_src2->base = src_vc_address_y;
++ last_y8_p->w += bw;
++ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21);
++ last_y8_p->wo2 = wo;
++
++ jb->last_y8_p = NULL;
++ jb->last_y8_l1 = NULL;
++ start_x = bw;
++#if RPI_TSTATS
++ ++s->tstats.y_pred1_y8_merge;
++#endif
++ }
++#endif
++
++ for (; start_x < nPbW; start_x += 16)
++ {
++ const int bw = FFMIN(nPbW - start_x, 16);
++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx);
++ qpu_mc_src_t *const src1 = yp->last_l0;
++ qpu_mc_src_t *const src2 = yp->last_l1;
++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
++#if RPI_TSTATS
++ {
++ HEVCRpiStats *const ts = &s->tstats;
++ if (mx == 0 && my == 0)
++ ++ts->y_pred1_x0y0;
++ else if (mx == 0)
++ ++ts->y_pred1_x0;
++ else if (my == 0)
++ ++ts->y_pred1_y0;
++ else
++ ++ts->y_pred1_xy;
++
++ if (nPbW > 8)
++ ++ts->y_pred1_wgt8;
++ else
++ ++ts->y_pred1_wle8;
++
++ if (nPbH > 16)
++ ++ts->y_pred1_hgt16;
++ else
++ ++ts->y_pred1_hle16;
++ }
++#endif
++ src1->x = x1_m3 + start_x;
++ src1->y = y1_m3;
++ src1->base = src_vc_address_y;
++ if (bw <= 8)
++ {
++ src2->x = MC_DUMMY_X;
++ src2->y = MC_DUMMY_Y;
++#if RPI_QPU_EMU_Y
++ src2->base = s->qpu_dummy_frame_emu;
++#else
++ src2->base = s->qpu_dummy_frame_qpu;
++#endif
++ }
++ else
++ {
++ src2->x = x1_m3 + start_x + 8;
++ src2->y = y1_m3;
++ src2->base = src_vc_address_y;
++ }
++ cmd_y->w = bw;
++ cmd_y->h = bh;
++ cmd_y->mymx21 = my2_mx2_my_mx;
++ cmd_y->wo1 = wo;
++ cmd_y->wo2 = wo;
++ cmd_y->dst_addr = dst_addr + (start_x << xshl);
++ yp->last_l0 = &cmd_y->next_src1;
++ yp->last_l1 = &cmd_y->next_src2;
++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
++
++ if (bw == 8) {
++ jb->last_y8_l1 = src2;
++ jb->last_y8_p = cmd_y;
++ }
++ }
++ }
++}
++
++static void
++rpi_pred_y_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++ const int x0, const int y0,
++ const int nPbW, const int nPbH,
++ const struct MvField *const mv_field,
++ const AVFrame *const src_frame,
++ const AVFrame *const src_frame2)
++{
++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
++ const Mv * const mv = mv_field->mv + 0;
++ const Mv * const mv2 = mv_field->mv + 1;
++
++ const unsigned int mx = mv->x & 3;
++ const unsigned int my = mv->y & 3;
++ const unsigned int my_mx = (my<<8) | mx;
++ const unsigned int mx2 = mv2->x & 3;
++ const unsigned int my2 = mv2->y & 3;
++ const unsigned int my2_mx2 = (my2<<8) | mx2;
++ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx;
++ const unsigned int ref_idx0 = mv_field->ref_idx[0];
++ const unsigned int ref_idx1 = mv_field->ref_idx[1];
++ const uint32_t wt_offset =
++ offset_depth_adj(s, s->sh.luma_offset_l0[ref_idx0] + s->sh.luma_offset_l1[ref_idx1]) + 1;
++ const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]);
++ const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]);
++
++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
++ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off;
++ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame);
++ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2);
++ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip;
++
++ if (my2_mx2_my_mx == 0)
++ {
++ const int x1 = x0 + (mv->x >> 2);
++ const int y1 = y0 + (mv->y >> 2);
++ const int x2 = x0 + (mv2->x >> 2);
++ const int y2 = y0 + (mv2->y >> 2);
++ const int bh = nPbH;
++
++ // Can do chunks a full 16 wide if we don't want the H filter
++ for (int start_x=0; start_x < nPbW; start_x += 16)
++ {
++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00);
++ qpu_mc_src_t *const src1 = yp->last_l0;
++ qpu_mc_src_t *const src2 = yp->last_l1;
++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
++#if RPI_TSTATS
++ {
++ HEVCRpiStats *const ts = &s->tstats;
++ ++ts->y_pred2_x0y0;
++
++ if (nPbH > 16)
++ ++ts->y_pred2_hgt16;
++ else
++ ++ts->y_pred2_hle16;
++ }
++#endif
++ src1->x = x1 + start_x;
++ src1->y = y1;
++ src1->base = src1_base;
++ src2->x = x2 + start_x;
++ src2->y = y2;
++ src2->base = src2_base;
++ cmd_y->w = FFMIN(nPbW - start_x, 16);
++ cmd_y->h = bh;
++ cmd_y->mymx21 = 0;
++ cmd_y->wo1 = wo1;
++ cmd_y->wo2 = wo2;
++ cmd_y->dst_addr = dst + (start_x << xshl);
++ yp->last_l0 = &cmd_y->next_src1;
++ yp->last_l1 = &cmd_y->next_src2;
++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
++ }
++ }
++ else
++ {
++ // Filter requires a run-up of 3
++ const int x1 = x0 + (mv->x >> 2) - 3;
++ const int y1 = y0 + (mv->y >> 2) - 3;
++ const int x2 = x0 + (mv2->x >> 2) - 3;
++ const int y2 = y0 + (mv2->y >> 2) - 3;
++ const int bh = nPbH;
++
++ for (int start_x=0; start_x < nPbW; start_x += 8)
++ { // B blocks work 8 at a time
++ // B weights aren't doubled as the QPU code does the same
++ // amount of work as it does for P
++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx);
++ qpu_mc_src_t *const src1 = yp->last_l0;
++ qpu_mc_src_t *const src2 = yp->last_l1;
++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
++#if RPI_TSTATS
++ {
++ HEVCRpiStats *const ts = &s->tstats;
++ const unsigned int mmx = mx | mx2;
++ const unsigned int mmy = my | my2;
++ if (mmx == 0 && mmy == 0)
++ ++ts->y_pred2_x0y0;
++ else if (mmx == 0)
++ ++ts->y_pred2_x0;
++ else if (mmy == 0)
++ ++ts->y_pred2_y0;
++ else
++ ++ts->y_pred2_xy;
++
++ if (nPbH > 16)
++ ++ts->y_pred2_hgt16;
++ else
++ ++ts->y_pred2_hle16;
++ }
++#endif
++ src1->x = x1 + start_x;
++ src1->y = y1;
++ src1->base = src1_base;
++ src2->x = x2 + start_x;
++ src2->y = y2;
++ src2->base = src2_base;
++ cmd_y->w = FFMIN(nPbW - start_x, 8);
++ cmd_y->h = bh;
++ cmd_y->mymx21 = my2_mx2_my_mx;
++ cmd_y->wo1 = wo1;
++ cmd_y->wo2 = wo2;
++ cmd_y->dst_addr = dst + (start_x << xshl);
++ yp->last_l0 = &cmd_y->next_src1;
++ yp->last_l1 = &cmd_y->next_src2;
++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
++ }
++ }
++}
++
++// h/v shifts fixed at one as that is all the qasm copes with
++static void
++rpi_pred_c(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++ const unsigned int lx, const int x0_c, const int y0_c,
++ const int nPbW_c, const int nPbH_c,
++ const Mv * const mv,
++ const int16_t * const c_weights,
++ const int16_t * const c_offsets,
++ AVFrame * const src_frame)
++{
++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
++ const int hshift = 1; // = s->ps.sps->hshift[1];
++ const int vshift = 1; // = s->ps.sps->vshift[1];
++
++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1;
++ const int y1_c = y0_c + (mv->y >> (2 + vshift)) - 1;
++ const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame);
++ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)];
++ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)];
++ const uint32_t wo_u = PACK2(offset_depth_adj(s, c_offsets[0]) * 2 + 1, c_weights[0]);
++ const uint32_t wo_v = PACK2(offset_depth_adj(s, c_offsets[1]) * 2 + 1, c_weights[1]);
++ qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
++ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip;
++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1;
++ const unsigned int bh = nPbH_c;
++ const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1;
++
++ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH)
++ {
++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn);
++ qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p;
++ qpu_mc_src_t ** const plast_lx = (lx == 0) ? &cp->last_l0 : &cp->last_l1;
++ qpu_mc_src_t * const last_lx = *plast_lx;
++ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
++
++ last_lx->x = x1_c + start_x;
++ last_lx->y = y1_c;
++ last_lx->base = src_base_u;
++ cmd_c->h = bh;
++ cmd_c->w = bw;
++ cmd_c->coeffs_x = x_coeffs;
++ cmd_c->coeffs_y = y_coeffs;
++ cmd_c->wo_u = wo_u;
++ cmd_c->wo_v = wo_v;
++ cmd_c->dst_addr_c = dst_base_u + (start_x << xshl);
++ *plast_lx = &cmd_c->next_src;
++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1);
++ }
++}
++
++// h/v shifts fixed at one as that is all the qasm copes with
++static void
++rpi_pred_c_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++ const int x0_c, const int y0_c,
++ const int nPbW_c, const int nPbH_c,
++ const struct MvField * const mv_field,
++ const int16_t * const c_weights,
++ const int16_t * const c_offsets,
++ const int16_t * const c_weights2,
++ const int16_t * const c_offsets2,
++ AVFrame * const src_frame,
++ AVFrame * const src_frame2)
++{
++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
++ const int hshift = 1; // s->ps.sps->hshift[1];
++ const int vshift = 1; // s->ps.sps->vshift[1];
++ const Mv * const mv = mv_field->mv + 0;
++ const Mv * const mv2 = mv_field->mv + 1;
++
++ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift);
++ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift);
++ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)];
++ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector
++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1;
++ const int y1_c = y0_c + (mv->y >> (2 + vshift)) - 1;
++
++ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift);
++ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift);
++ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)];
++ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector
++
++ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1;
++ const int y2_c = y0_c + (mv2->y >> (2 + vshift)) - 1;
++
++ const uint32_t wo_u2 = PACK2(offset_depth_adj(s, c_offsets[0] + c_offsets2[0]) + 1, c_weights2[0]);
++ const uint32_t wo_v2 = PACK2(offset_depth_adj(s, c_offsets[1] + c_offsets2[1]) + 1, c_weights2[1]);
++
++ const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
++ const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame);
++ const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2);
++ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip;
++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1;
++ const unsigned int bh = nPbH_c;
++
++ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH)
++ {
++ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
++
++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx);
++ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b;
++ qpu_mc_src_t * const src_l0 = cp->last_l0;
++ qpu_mc_src_t * const src_l1 = cp->last_l1;
++
++ src_l0->x = x1_c + start_x;
++ src_l0->y = y1_c;
++ src_l0->base = src1_base;
++ src_l1->x = x2_c + start_x;
++ src_l1->y = y2_c;
++ src_l1->base = src2_base;
++
++ u[0].h = bh;
++ u[0].w = bw;
++ u[0].coeffs_x1 = coefs0_x;
++ u[0].coeffs_y1 = coefs0_y;
++ u[0].weight_u1 = c_weights[0]; // Weight L0 U
++ u[0].weight_v1 = c_weights[1]; // Weight L0 V
++ u[0].coeffs_x2 = coefs1_x;
++ u[0].coeffs_y2 = coefs1_y;
++ u[0].wo_u2 = wo_u2;
++ u[0].wo_v2 = wo_v2;
++ u[0].dst_addr_c = dst_base_u + (start_x << xshl);
++
++ cp->last_l0 = &u[0].next_src1;
++ cp->last_l1 = &u[0].next_src2;
++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1);
++ }
++}
++
++
++static void hls_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const int x0, const int y0,
++ const int nPbW, const int nPbH,
++ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
++{
++ HEVCRpiJob * const jb = lc->jb0;
++
++ int merge_idx = 0;
++ struct MvField current_mv = {{{ 0 }}};
++
++ int min_pu_width = s->ps.sps->min_pu_width;
++
++ MvField * const tab_mvf = s->ref->tab_mvf;
++ const RefPicList *const refPicList = s->ref->refPicList;
++ const HEVCFrame *ref0 = NULL, *ref1 = NULL;
++ int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
++ int min_cb_width = s->ps.sps->min_cb_width;
++ int x_cb = x0 >> log2_min_cb_size;
++ int y_cb = y0 >> log2_min_cb_size;
++ int x_pu, y_pu;
++ int i, j;
++ const int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
++
++ if (!skip_flag)
++ lc->pu.merge_flag = ff_hevc_rpi_merge_flag_decode(lc);
++
++ if (skip_flag || lc->pu.merge_flag) {
++ if (s->sh.max_num_merge_cand > 1)
++ merge_idx = ff_hevc_rpi_merge_idx_decode(s, lc);
++ else
++ merge_idx = 0;
++
++ ff_hevc_rpi_luma_mv_merge_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size,
++ partIdx, merge_idx, &current_mv);
++ } else {
++ hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size,
++ partIdx, merge_idx, &current_mv);
++ }
++
++ x_pu = x0 >> s->ps.sps->log2_min_pu_size;
++ y_pu = y0 >> s->ps.sps->log2_min_pu_size;
++
++ for (j = 0; j < nPbH >> s->ps.sps->log2_min_pu_size; j++)
++ for (i = 0; i < nPbW >> s->ps.sps->log2_min_pu_size; i++)
++ tab_mvf[(y_pu + j) * min_pu_width + x_pu + i] = current_mv;
++
++ if (current_mv.pred_flag & PF_L0) {
++ ref0 = refPicList[0].ref[current_mv.ref_idx[0]];
++ if (!ref0)
++ return;
++ hevc_await_progress(s, lc, ref0, &current_mv.mv[0], y0, nPbH);
++ }
++ if (current_mv.pred_flag & PF_L1) {
++ ref1 = refPicList[1].ref[current_mv.ref_idx[1]];
++ if (!ref1)
++ return;
++ hevc_await_progress(s, lc, ref1, &current_mv.mv[1], y0, nPbH);
++ }
++
++ if (current_mv.pred_flag == PF_L0) {
++ const int x0_c = x0 >> ctx_hshift(s, 1);
++ const int y0_c = y0 >> ctx_vshift(s, 1);
++ const int nPbW_c = nPbW >> ctx_hshift(s, 1);
++ const int nPbH_c = nPbH >> ctx_vshift(s, 1);
++
++ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.mv + 0,
++ s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]],
++ ref0->frame);
++
++ if (ctx_cfmt(s) != 0) {
++ rpi_pred_c(s, jb, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0,
++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
++ ref0->frame);
++ return;
++ }
++ } else if (current_mv.pred_flag == PF_L1) {
++ const int x0_c = x0 >> ctx_hshift(s, 1);
++ const int y0_c = y0 >> ctx_vshift(s, 1);
++ const int nPbW_c = nPbW >> ctx_hshift(s, 1);
++ const int nPbH_c = nPbH >> ctx_vshift(s, 1);
++
++ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.mv + 1,
++ s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]],
++ ref1->frame);
++
++ if (ctx_cfmt(s) != 0) {
++ rpi_pred_c(s, jb, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1,
++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
++ ref1->frame);
++ return;
++ }
++ } else if (current_mv.pred_flag == PF_BI) {
++ const int x0_c = x0 >> ctx_hshift(s, 1);
++ const int y0_c = y0 >> ctx_vshift(s, 1);
++ const int nPbW_c = nPbW >> ctx_hshift(s, 1);
++ const int nPbH_c = nPbH >> ctx_vshift(s, 1);
++
++ rpi_pred_y_b(s, jb, x0, y0, nPbW, nPbH, &current_mv, ref0->frame, ref1->frame);
++
++ if (ctx_cfmt(s) != 0) {
++ rpi_pred_c_b(s, jb, x0_c, y0_c, nPbW_c, nPbH_c,
++ &current_mv,
++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]],
++ s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]],
++ s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
++ ref0->frame,
++ ref1->frame);
++ return;
++ }
++ }
++}
++
++/**
++ * 8.4.1
++ */
++static int luma_intra_pred_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int pu_size,
++ int prev_intra_luma_pred_flag)
++{
++ int x_pu = x0 >> s->ps.sps->log2_min_pu_size;
++ int y_pu = y0 >> s->ps.sps->log2_min_pu_size;
++ int min_pu_width = s->ps.sps->min_pu_width;
++ int size_in_pus = pu_size >> s->ps.sps->log2_min_pu_size;
++ int x0b = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size);
++ int y0b = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size);
++
++ int cand_up = (lc->ctb_up_flag || y0b) ?
++ s->tab_ipm[(y_pu - 1) * min_pu_width + x_pu] : INTRA_DC;
++ int cand_left = (lc->ctb_left_flag || x0b) ?
++ s->tab_ipm[y_pu * min_pu_width + x_pu - 1] : INTRA_DC;
++
++ int y_ctb = (y0 >> (s->ps.sps->log2_ctb_size)) << (s->ps.sps->log2_ctb_size);
++
++ MvField *tab_mvf = s->ref->tab_mvf;
++ int intra_pred_mode;
++ int candidate[3];
++ int i, j;
++
++ // intra_pred_mode prediction does not cross vertical CTB boundaries
++ if ((y0 - 1) < y_ctb)
++ cand_up = INTRA_DC;
++
++ if (cand_left == cand_up) {
++ if (cand_left < 2) {
++ candidate[0] = INTRA_PLANAR;
++ candidate[1] = INTRA_DC;
++ candidate[2] = INTRA_ANGULAR_26;
++ } else {
++ candidate[0] = cand_left;
++ candidate[1] = 2 + ((cand_left - 2 - 1 + 32) & 31);
++ candidate[2] = 2 + ((cand_left - 2 + 1) & 31);
++ }
++ } else {
++ candidate[0] = cand_left;
++ candidate[1] = cand_up;
++ if (candidate[0] != INTRA_PLANAR && candidate[1] != INTRA_PLANAR) {
++ candidate[2] = INTRA_PLANAR;
++ } else if (candidate[0] != INTRA_DC && candidate[1] != INTRA_DC) {
++ candidate[2] = INTRA_DC;
++ } else {
++ candidate[2] = INTRA_ANGULAR_26;
++ }
++ }
++
++ if (prev_intra_luma_pred_flag) {
++ intra_pred_mode = candidate[lc->pu.mpm_idx];
++ } else {
++ if (candidate[0] > candidate[1])
++ FFSWAP(uint8_t, candidate[0], candidate[1]);
++ if (candidate[0] > candidate[2])
++ FFSWAP(uint8_t, candidate[0], candidate[2]);
++ if (candidate[1] > candidate[2])
++ FFSWAP(uint8_t, candidate[1], candidate[2]);
++
++ intra_pred_mode = lc->pu.rem_intra_luma_pred_mode;
++ for (i = 0; i < 3; i++)
++ if (intra_pred_mode >= candidate[i])
++ intra_pred_mode++;
++ }
++
++ /* write the intra prediction units into the mv array */
++ if (!size_in_pus)
++ size_in_pus = 1;
++ for (i = 0; i < size_in_pus; i++) {
++ memset(&s->tab_ipm[(y_pu + i) * min_pu_width + x_pu],
++ intra_pred_mode, size_in_pus);
++
++ for (j = 0; j < size_in_pus; j++) {
++ tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].pred_flag = PF_INTRA;
++ }
++ }
++
++ return intra_pred_mode;
++}
++
++static av_always_inline void set_ct_depth(const HEVCRpiContext * const s, int x0, int y0,
++ int log2_cb_size, int ct_depth)
++{
++ int length = (1 << log2_cb_size) >> s->ps.sps->log2_min_cb_size;
++ int x_cb = x0 >> s->ps.sps->log2_min_cb_size;
++ int y_cb = y0 >> s->ps.sps->log2_min_cb_size;
++ int y;
++
++ for (y = 0; y < length; y++)
++ memset(&s->tab_ct_depth[(y_cb + y) * s->ps.sps->min_cb_width + x_cb],
++ ct_depth, length);
++}
++
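++// Maps a 4:4:4 intra chroma mode onto the reduced mode set used for 4:2:2
++// (the chroma mode derivation table from the HEVC spec)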
++static const uint8_t tab_mode_idx[] = {
++ 0, 1, 2, 2, 2, 2, 3, 5, 7, 8, 10, 12, 13, 15, 17, 18, 19, 20,
++ 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31};
++
++static void intra_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0,
++ const int log2_cb_size)
++{
++ static const uint8_t intra_chroma_table[4] = { 0, 26, 10, 1 };
++ uint8_t prev_intra_luma_pred_flag[4];
++ int split = lc->cu.part_mode == PART_NxN;
++ int pb_size = (1 << log2_cb_size) >> split;
++ int side = split + 1;
++ int chroma_mode;
++ int i, j;
++
++ for (i = 0; i < side; i++)
++ for (j = 0; j < side; j++)
++ prev_intra_luma_pred_flag[2 * i + j] = ff_hevc_rpi_prev_intra_luma_pred_flag_decode(lc);
++
++ for (i = 0; i < side; i++) {
++ for (j = 0; j < side; j++) {
++ if (prev_intra_luma_pred_flag[2 * i + j])
++ lc->pu.mpm_idx = ff_hevc_rpi_mpm_idx_decode(lc);
++ else
++ lc->pu.rem_intra_luma_pred_mode = ff_hevc_rpi_rem_intra_luma_pred_mode_decode(lc);
++
++ lc->pu.intra_pred_mode[2 * i + j] =
++ luma_intra_pred_mode(s, lc, x0 + pb_size * j, y0 + pb_size * i, pb_size,
++ prev_intra_luma_pred_flag[2 * i + j]);
++ }
++ }
++
++ if (ctx_cfmt(s) == 3) {
++ for (i = 0; i < side; i++) {
++ for (j = 0; j < side; j++) {
++ lc->pu.chroma_mode_c[2 * i + j] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
++ if (chroma_mode != 4) {
++ if (lc->pu.intra_pred_mode[2 * i + j] == intra_chroma_table[chroma_mode])
++ lc->pu.intra_pred_mode_c[2 * i + j] = 34;
++ else
++ lc->pu.intra_pred_mode_c[2 * i + j] = intra_chroma_table[chroma_mode];
++ } else {
++ lc->pu.intra_pred_mode_c[2 * i + j] = lc->pu.intra_pred_mode[2 * i + j];
++ }
++ }
++ }
++ } else if (ctx_cfmt(s) == 2) {
++ int mode_idx;
++ lc->pu.chroma_mode_c[0] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
++ if (chroma_mode != 4) {
++ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
++ mode_idx = 34;
++ else
++ mode_idx = intra_chroma_table[chroma_mode];
++ } else {
++ mode_idx = lc->pu.intra_pred_mode[0];
++ }
++ lc->pu.intra_pred_mode_c[0] = tab_mode_idx[mode_idx];
++ } else if (ctx_cfmt(s) != 0) {
++ chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
++ if (chroma_mode != 4) {
++ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
++ lc->pu.intra_pred_mode_c[0] = 34;
++ else
++ lc->pu.intra_pred_mode_c[0] = intra_chroma_table[chroma_mode];
++ } else {
++ lc->pu.intra_pred_mode_c[0] = lc->pu.intra_pred_mode[0];
++ }
++ }
++}
++
++static void intra_prediction_unit_default_value(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ int x0, int y0,
++ int log2_cb_size)
++{
++ int pb_size = 1 << log2_cb_size;
++ int size_in_pus = pb_size >> s->ps.sps->log2_min_pu_size;
++ int min_pu_width = s->ps.sps->min_pu_width;
++ MvField *tab_mvf = s->ref->tab_mvf;
++ int x_pu = x0 >> s->ps.sps->log2_min_pu_size;
++ int y_pu = y0 >> s->ps.sps->log2_min_pu_size;
++ int j, k;
++
++ if (size_in_pus == 0)
++ size_in_pus = 1;
++ for (j = 0; j < size_in_pus; j++)
++ memset(&s->tab_ipm[(y_pu + j) * min_pu_width + x_pu], INTRA_DC, size_in_pus);
++ if (lc->cu.pred_mode == MODE_INTRA)
++ for (j = 0; j < size_in_pus; j++)
++ for (k = 0; k < size_in_pus; k++)
++ tab_mvf[(y_pu + j) * min_pu_width + x_pu + k].pred_flag = PF_INTRA;
++}
++
++static int hls_coding_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int log2_cb_size)
++{
++ int cb_size = 1 << log2_cb_size;
++ int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
++ int length = cb_size >> log2_min_cb_size;
++ int min_cb_width = s->ps.sps->min_cb_width;
++ int x_cb = x0 >> log2_min_cb_size;
++ int y_cb = y0 >> log2_min_cb_size;
++ int idx = log2_cb_size - 2;
++ int qp_block_mask = (1<<(s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_qp_delta_depth)) - 1;
++ int x, y, ret;
++
++ lc->cu.x = x0;
++ lc->cu.y = y0;
++ lc->cu.pred_mode = MODE_INTRA;
++ lc->cu.part_mode = PART_2Nx2N;
++ lc->cu.intra_split_flag = 0;
++
++ SAMPLE_CTB(s->skip_flag, x_cb, y_cb) = 0;
++ for (x = 0; x < 4; x++)
++ lc->pu.intra_pred_mode[x] = 1;
++ if (s->ps.pps->transquant_bypass_enable_flag) {
++ lc->cu.cu_transquant_bypass_flag = ff_hevc_rpi_cu_transquant_bypass_flag_decode(lc);
++ if (lc->cu.cu_transquant_bypass_flag)
++ set_deblocking_bypass(s, x0, y0, log2_cb_size);
++ } else
++ lc->cu.cu_transquant_bypass_flag = 0;
++
++ if (s->sh.slice_type != HEVC_SLICE_I) {
++ uint8_t skip_flag = ff_hevc_rpi_skip_flag_decode(s, lc, x0, y0, x_cb, y_cb);
++
++ x = y_cb * min_cb_width + x_cb;
++ for (y = 0; y < length; y++) {
++ memset(&s->skip_flag[x], skip_flag, length);
++ x += min_cb_width;
++ }
++ lc->cu.pred_mode = skip_flag ? MODE_SKIP : MODE_INTER;
++ } else {
++ x = y_cb * min_cb_width + x_cb;
++ for (y = 0; y < length; y++) {
++ memset(&s->skip_flag[x], 0, length);
++ x += min_cb_width;
++ }
++ }
++
++ if (SAMPLE_CTB(s->skip_flag, x_cb, y_cb)) {
++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
++
++ if (!s->sh.disable_deblocking_filter_flag)
++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size);
++ } else {
++ int pcm_flag = 0;
++
++ if (s->sh.slice_type != HEVC_SLICE_I)
++ lc->cu.pred_mode = ff_hevc_rpi_pred_mode_decode(lc);
++ if (lc->cu.pred_mode != MODE_INTRA ||
++ log2_cb_size == s->ps.sps->log2_min_cb_size) {
++ lc->cu.part_mode = ff_hevc_rpi_part_mode_decode(s, lc, log2_cb_size);
++ lc->cu.intra_split_flag = lc->cu.part_mode == PART_NxN &&
++ lc->cu.pred_mode == MODE_INTRA;
++ }
++
++ if (lc->cu.pred_mode == MODE_INTRA) {
++ if (lc->cu.part_mode == PART_2Nx2N && s->ps.sps->pcm_enabled_flag &&
++ log2_cb_size >= s->ps.sps->pcm.log2_min_pcm_cb_size &&
++ log2_cb_size <= s->ps.sps->pcm.log2_max_pcm_cb_size) {
++ pcm_flag = ff_hevc_rpi_pcm_flag_decode(lc);
++ }
++ if (pcm_flag) {
++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
++ ret = hls_pcm_sample(s, lc, x0, y0, log2_cb_size);
++ if (s->ps.sps->pcm.loop_filter_disable_flag)
++ {
++ set_deblocking_bypass(s, x0, y0, log2_cb_size);
++ }
++
++ if (ret < 0)
++ return ret;
++ } else {
++ intra_prediction_unit(s, lc, x0, y0, log2_cb_size);
++ }
++ } else {
++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
++ switch (lc->cu.part_mode) {
++ case PART_2Nx2N:
++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
++ break;
++ case PART_2NxN:
++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 2, log2_cb_size, 0, idx);
++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1, idx);
++ break;
++ case PART_Nx2N:
++ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size, log2_cb_size, 0, idx - 1);
++ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1, idx - 1);
++ break;
++ case PART_2NxnU:
++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4, log2_cb_size, 0, idx);
++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4, cb_size, cb_size * 3 / 4, log2_cb_size, 1, idx);
++ break;
++ case PART_2NxnD:
++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size * 3 / 4, log2_cb_size, 0, idx);
++ hls_prediction_unit(s, lc, x0, y0 + cb_size * 3 / 4, cb_size, cb_size / 4, log2_cb_size, 1, idx);
++ break;
++ case PART_nLx2N:
++ hls_prediction_unit(s, lc, x0, y0, cb_size / 4, cb_size, log2_cb_size, 0, idx - 2);
++ hls_prediction_unit(s, lc, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1, idx - 2);
++ break;
++ case PART_nRx2N:
++ hls_prediction_unit(s, lc, x0, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 0, idx - 2);
++ hls_prediction_unit(s, lc, x0 + cb_size * 3 / 4, y0, cb_size / 4, cb_size, log2_cb_size, 1, idx - 2);
++ break;
++ case PART_NxN:
++ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size / 2, log2_cb_size, 0, idx - 1);
++ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size / 2, log2_cb_size, 1, idx - 1);
++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2, idx - 1);
++ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3, idx - 1);
++ break;
++ }
++ }
++
++ if (!pcm_flag) {
++ int rqt_root_cbf = 1;
++
++ if (lc->cu.pred_mode != MODE_INTRA &&
++ !(lc->cu.part_mode == PART_2Nx2N && lc->pu.merge_flag)) {
++ rqt_root_cbf = ff_hevc_rpi_no_residual_syntax_flag_decode(lc);
++ }
++ if (rqt_root_cbf) {
++ static const int cbf[2] = { 0 };
++ lc->cu.max_trafo_depth = lc->cu.pred_mode == MODE_INTRA ?
++ s->ps.sps->max_transform_hierarchy_depth_intra + lc->cu.intra_split_flag :
++ s->ps.sps->max_transform_hierarchy_depth_inter;
++ ret = hls_transform_tree(s, lc, x0, y0, x0, y0, x0, y0,
++ log2_cb_size,
++ log2_cb_size, 0, 0, cbf, cbf);
++ if (ret < 0)
++ return ret;
++ } else {
++ if (!s->sh.disable_deblocking_filter_flag)
++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size);
++ }
++ }
++ }
++
++ if (s->ps.pps->cu_qp_delta_enabled_flag && lc->tu.is_cu_qp_delta_coded == 0)
++ ff_hevc_rpi_set_qPy(s, lc, x0, y0, log2_cb_size);
++
++ x = y_cb * min_cb_width + x_cb;
++ for (y = 0; y < length; y++) {
++ memset(&s->qp_y_tab[x], lc->qp_y, length);
++ x += min_cb_width;
++ }
++
++ if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 &&
++ ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0) {
++ lc->qPy_pred = lc->qp_y;
++ }
++
++ set_ct_depth(s, x0, y0, log2_cb_size, lc->ct_depth);
++
++ return 0;
++}
++
++// Returns:
++// < 0 Error
++// 0 More data wanted
++// 1 EoSlice / EoPicture
++static int hls_coding_quadtree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0,
++ const int log2_cb_size, const int cb_depth)
++{
++ const int cb_size = 1 << log2_cb_size;
++ int ret;
++ int split_cu;
++
++ lc->ct_depth = cb_depth;
++ if (x0 + cb_size <= s->ps.sps->width &&
++ y0 + cb_size <= s->ps.sps->height &&
++ log2_cb_size > s->ps.sps->log2_min_cb_size) {
++ split_cu = ff_hevc_rpi_split_coding_unit_flag_decode(s, lc, cb_depth, x0, y0);
++ } else {
++ split_cu = (log2_cb_size > s->ps.sps->log2_min_cb_size);
++ }
++ if (s->ps.pps->cu_qp_delta_enabled_flag &&
++ log2_cb_size >= s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_qp_delta_depth) {
++ lc->tu.is_cu_qp_delta_coded = 0;
++ lc->tu.cu_qp_delta = 0;
++ }
++
++ lc->tu.is_cu_chroma_qp_offset_coded = !(s->sh.cu_chroma_qp_offset_enabled_flag &&
++ log2_cb_size >= s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_chroma_qp_offset_depth);
++ lc->tu.cu_qp_offset_cb = 0;
++ lc->tu.cu_qp_offset_cr = 0;
++
++ if (split_cu) {
++ int qp_block_mask = (1<<(s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_qp_delta_depth)) - 1;
++ const int cb_size_split = cb_size >> 1;
++ const int x1 = x0 + cb_size_split;
++ const int y1 = y0 + cb_size_split;
++
++ int more_data = 0;
++
++ more_data = hls_coding_quadtree(s, lc, x0, y0, log2_cb_size - 1, cb_depth + 1);
++ if (more_data < 0)
++ return more_data;
++
++ if (more_data && x1 < s->ps.sps->width) {
++ more_data = hls_coding_quadtree(s, lc, x1, y0, log2_cb_size - 1, cb_depth + 1);
++ if (more_data < 0)
++ return more_data;
++ }
++ if (more_data && y1 < s->ps.sps->height) {
++ more_data = hls_coding_quadtree(s, lc, x0, y1, log2_cb_size - 1, cb_depth + 1);
++ if (more_data < 0)
++ return more_data;
++ }
++ if (more_data && x1 < s->ps.sps->width &&
++ y1 < s->ps.sps->height) {
++ more_data = hls_coding_quadtree(s, lc, x1, y1, log2_cb_size - 1, cb_depth + 1);
++ if (more_data < 0)
++ return more_data;
++ }
++
++ if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 &&
++ ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0)
++ lc->qPy_pred = lc->qp_y;
++
++ if (more_data)
++ return ((x1 + cb_size_split) < s->ps.sps->width ||
++ (y1 + cb_size_split) < s->ps.sps->height);
++ else
++ return 0;
++ } else {
++ ret = hls_coding_unit(s, lc, x0, y0, log2_cb_size);
++ if (ret < 0)
++ return ret;
++ if ((!((x0 + cb_size) %
++ (1 << (s->ps.sps->log2_ctb_size))) ||
++ (x0 + cb_size >= s->ps.sps->width)) &&
++ (!((y0 + cb_size) %
++ (1 << (s->ps.sps->log2_ctb_size))) ||
++ (y0 + cb_size >= s->ps.sps->height))) {
++ int end_of_slice_flag = ff_hevc_rpi_end_of_slice_flag_decode(lc);
++ return !end_of_slice_flag;
++ } else {
++ return 1;
++ }
++ }
++
++ return 0; // NEVER
++}
++
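++// Per-CTB neighbour setup: record the slice address for this CTB, work out
++// the tile/slice boundary flags and hence which neighbour CTBs (left, up,
++// up-left, up-right) are available, and reset qPy_pred at the start of a
++// tile or (with WPP) at the start of a CTB row.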
++static void hls_decode_neighbour(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const int x_ctb, const int y_ctb, const int ctb_addr_ts)
++{
++ const int ctb_size = 1 << s->ps.sps->log2_ctb_size;
++ const int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++ const int ctb_addr_in_slice = ctb_addr_rs - s->sh.slice_addr; // slice_addr = RS addr of start of slice
++ const int idxX = s->ps.pps->col_idxX[x_ctb >> s->ps.sps->log2_ctb_size];
++
++ s->tab_slice_address[ctb_addr_rs] = s->sh.slice_addr;
++
++ lc->end_of_tiles_x = idxX + 1 >= s->ps.pps->num_tile_columns ? s->ps.sps->width :
++ (s->ps.pps->col_bd[idxX + 1] << s->ps.sps->log2_ctb_size);
++
++ if (ctb_addr_ts == 0 || s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[ctb_addr_ts - 1] ||
++ (s->ps.pps->entropy_coding_sync_enabled_flag && (x_ctb >> s->ps.sps->log2_ctb_size) == s->ps.pps->col_bd[idxX]))
++ {
++// lc->first_qp_group = 1;
++ lc->qPy_pred = s->sh.slice_qp;
++ }
++
++ lc->end_of_tiles_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height);
++
++ lc->boundary_flags = 0;
++
++ if (x_ctb <= 0 || s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - 1]])
++ lc->boundary_flags |= BOUNDARY_LEFT_TILE;
++ if (x_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - 1])
++ lc->boundary_flags |= BOUNDARY_LEFT_SLICE;
++ if (y_ctb <= 0 || s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]])
++ lc->boundary_flags |= BOUNDARY_UPPER_TILE;
++ if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - s->ps.sps->ctb_width])
++ lc->boundary_flags |= BOUNDARY_UPPER_SLICE;
++
++ lc->ctb_left_flag = (lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0;
++ lc->ctb_up_flag = (lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0;
++ lc->ctb_up_left_flag = (lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 &&
++ (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width);
++
++ lc->ctb_up_right_flag = ((y_ctb > 0) && (x_ctb + ctb_size) < lc->end_of_tiles_x &&
++ (ctb_addr_in_slice+1 >= s->ps.sps->ctb_width) &&
++ (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1 - s->ps.sps->ctb_width]]));
++}
++
++
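++// Filter pass for a completed job: runs the HLS filter over the job's CTBs,
++// held back one CTB from the right/bottom edges unless at the frame edge,
++// writes the filtered region back out of the ARM cache (for SAO), signals
++// recon progress to any waiting frame threads, then frees the job.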
++static void rpi_execute_dblk_cmds(HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size;
++ const unsigned int x0 = FFMAX(jb->bounds.x, ctb_size) - ctb_size;
++ const unsigned int y0 = FFMAX(jb->bounds.y, ctb_size) - ctb_size;
++ const unsigned int bound_r = jb->bounds.x + jb->bounds.w;
++ const unsigned int bound_b = jb->bounds.y + jb->bounds.h;
++ const int x_end = (bound_r >= s->ps.sps->width);
++ const int y_end = (bound_b >= s->ps.sps->height);
++ const unsigned int xr = bound_r - (x_end ? 0 : ctb_size);
++ const unsigned int yb = bound_b - (y_end ? 0 : ctb_size);
++ unsigned int x, y;
++
++ for (y = y0; y < yb; y += ctb_size ) {
++ for (x = x0; x < xr; x += ctb_size ) {
++ ff_hevc_rpi_hls_filter(s, x, y, ctb_size);
++ }
++ }
++
++ // Flush (SAO)
++ if (y > y0) {
++ const int tile_end = y_end ||
++ s->ps.pps->tile_id[jb->ctu_ts_last] != s->ps.pps->tile_id[jb->ctu_ts_last + 1];
++ const unsigned int xl = x0 > ctb_size ? x0 - ctb_size : 0;
++ const unsigned int yt = y0 > ctb_size ? y0 - ctb_size : 0;
++ const unsigned int yb = tile_end ? bound_b : y - ctb_size;
++
++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
++ xl, yt, bound_r - xl, yb - yt,
++ ctx_vshift(s, 1), 1, 1);
++ rpi_cache_flush_finish(rfe);
++ }
++
++ // Signal
++ if (s->threads_type == FF_THREAD_FRAME && x_end && y0 > 0) {
++ ff_hevc_rpi_progress_signal_recon(s, y_end ? INT_MAX : y0 - 1);
++ }
++
++ // Job done now
++ // ? Move outside this fn
++ job_free(s->jbc, jb);
++}
++
++
++// I-pred, transform_and_add for all blocks types done here
++// All ARM
++static void rpi_execute_pred_cmds(HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++ unsigned int i;
++ HEVCRpiIntraPredEnv * const iap = &jb->intra;
++ const HEVCPredCmd *cmd = iap->cmds;
++
++ for (i = iap->n; i > 0; i--, cmd++)
++ {
++ switch (cmd->type)
++ {
++ case RPI_PRED_INTRA:
++ {
++ HEVCRpiLocalContextIntra lci; // Abbreviated local context
++ HEVCRpiLocalContext * const lc = (HEVCRpiLocalContext *)&lci;
++ lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->i_pred.mode;
++ lc->na.cand_bottom_left = (cmd->na >> 4) & 1;
++ lc->na.cand_left = (cmd->na >> 3) & 1;
++ lc->na.cand_up_left = (cmd->na >> 2) & 1;
++ lc->na.cand_up = (cmd->na >> 1) & 1;
++ lc->na.cand_up_right = (cmd->na >> 0) & 1;
++ if (cmd->c_idx == 0)
++ s->hpc.intra_pred[cmd->size - 2](s, lc, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx);
++ else
++ s->hpc.intra_pred_c[cmd->size - 2](s, lc, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx);
++ break;
++ }
++
++ case RPI_PRED_ADD_RESIDUAL:
++ s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
++ break;
++ case RPI_PRED_ADD_DC:
++ s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
++ break;
++ case RPI_PRED_ADD_RESIDUAL_U:
++ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
++ break;
++ case RPI_PRED_ADD_RESIDUAL_V:
++ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
++ break;
++ case RPI_PRED_ADD_RESIDUAL_C:
++ s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
++ break;
++ case RPI_PRED_ADD_DC_U:
++ case RPI_PRED_ADD_DC_V:
++ s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
++ break;
++
++ case RPI_PRED_I_PCM:
++ pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size);
++ break;
++
++ default:
++ av_log(NULL, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type);
++ abort();
++ }
++ }
++
++ // Mark done
++ iap->n = 0;
++}
++
++
++// Set initial uniform job values & zero ctu_count
++static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first)
++{
++ unsigned int i;
++ HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip;
++ HEVCRpiInterPredEnv *const yipe = &jb->luma_ip;
++ const HEVCRpiSPS * const sps = s->ps.sps;
++
++ const uint16_t pic_width_y = sps->width;
++ const uint16_t pic_height_y = sps->height;
++
++ const uint16_t pic_width_c = sps->width >> ctx_hshift(s, 1);
++ const uint16_t pic_height_c = sps->height >> ctx_vshift(s, 1);
++
++ // We expect the pointer to change if we use another sps
++ if (sps != jb->sps)
++ {
++ worker_pic_free_one(jb);
++
++ set_ipe_from_ici(cipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].chroma);
++ set_ipe_from_ici(yipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].luma);
++
++ {
++ const int coefs_per_luma = HEVC_MAX_CTB_SIZE * HEVC_RPI_MAX_WIDTH;
++ const int coefs_per_chroma = (coefs_per_luma * 2) >> (ctx_vshift(s, 1) + ctx_hshift(s, 1));
++ worker_pic_alloc_one(jb, coefs_per_luma + coefs_per_chroma);
++ }
++
++ jb->sps = sps;
++ }
++
++ jb->waited = 0;
++ jb->ctu_ts_first = ctu_ts_first;
++ jb->ctu_ts_last = -1;
++
++ rpi_inter_pred_reset(cipe);
++ for (i = 0; i < cipe->n; i++) {
++ HEVCRpiInterPredQ * const cp = cipe->q + i;
++ qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s;
++
++ u->next_src1.x = 0;
++ u->next_src1.y = 0;
++ u->next_src1.base = 0;
++ u->pic_cw = pic_width_c;
++ u->pic_ch = pic_height_c;
++ u->stride2 = av_rpi_sand_frame_stride2(s->frame);
++ u->stride1 = av_rpi_sand_frame_stride1(s->frame);
++ u->wdenom = s->sh.chroma_log2_weight_denom;
++ cp->last_l0 = &u->next_src1;
++
++ u->next_fn = 0;
++ u->next_src2.x = 0;
++ u->next_src2.y = 0;
++ u->next_src2.base = 0;
++ cp->last_l1 = &u->next_src2;
++
++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1);
++ }
++
++ rpi_inter_pred_reset(yipe);
++ for (i = 0; i < yipe->n; i++) {
++ HEVCRpiInterPredQ * const yp = yipe->q + i;
++ qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s;
++
++ y->next_src1.x = 0;
++ y->next_src1.y = 0;
++ y->next_src1.base = 0;
++ y->next_src2.x = 0;
++ y->next_src2.y = 0;
++ y->next_src2.base = 0;
++ y->pic_h = pic_height_y;
++ y->pic_w = pic_width_y;
++ y->stride2 = av_rpi_sand_frame_stride2(s->frame);
++ y->stride1 = av_rpi_sand_frame_stride1(s->frame);
++ y->wdenom = s->sh.luma_log2_weight_denom;
++ y->next_fn = 0;
++ yp->last_l0 = &y->next_src1;
++ yp->last_l1 = &y->next_src2;
++
++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1);
++ }
++
++ jb->last_y8_p = NULL;
++ jb->last_y8_l1 = NULL;
++
++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) {
++ jb->progress_req[i] = -1;
++ }
++
++ worker_pic_reset(&jb->coeffs);
++}
++
++
++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
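++// Finish off one inter-pred command queue set for submission: append each
++// QPU's exit code, point the trailing L0/L1 fetches at a dummy frame (they
++// are still prefetched, so must be valid), write the command/uniform Qs
++// back to memory and add the QPU start addresses to the mailbox list.
++// Returns 0 if the prediction env was unused, else 1.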
++static unsigned int mc_terminate_add_qpu(const HEVCRpiContext * const s,
++ const vpu_qpu_job_h vqj,
++ rpi_cache_flush_env_t * const rfe,
++ HEVCRpiInterPredEnv * const ipe)
++{
++ unsigned int i;
++ uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS];
++ unsigned int max_block = 0;
++
++ if (!ipe->used) {
++ return 0;
++ }
++
++ if (ipe->curr != 0) {
++ rpi_inter_pred_sync(ipe);
++ }
++
++ // Add final commands to Q
++ for(i = 0; i != ipe->n; ++i) {
++ HEVCRpiInterPredQ * const yp = ipe->q + i;
++ qpu_mc_src_t *const p0 = yp->last_l0;
++ qpu_mc_src_t *const p1 = yp->last_l1;
++ const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base;
++
++ if (block_size > max_block)
++ max_block = block_size;
++
++ yp->qpu_mc_curr->data[-1] = yp->code_exit;
++
++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
++ p0->x = MC_DUMMY_X;
++ p0->y = MC_DUMMY_Y;
++ p0->base = s->qpu_dummy_frame_qpu;
++ p1->x = MC_DUMMY_X;
++ p1->y = MC_DUMMY_Y;
++ p1->base = s->qpu_dummy_frame_qpu;
++
++ yp->last_l0 = NULL;
++ yp->last_l1 = NULL;
++
++ // Add to mailbox list
++ mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm);
++ mail[i][1] = yp->code_setup;
++ }
++
++ // We don't need invalidate here as the uniforms aren't changed by the QPU
++ // and leaving them in ARM cache avoids (pointless) pre-reads when writing
++ // new values which seems to give us a small performance advantage
++ //
++ // In most cases we will not have a completely packed set of uniforms and as
++ // we have a 2d invalidate we writeback all uniform Qs to the depth of the
++ // fullest
++ rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK,
++ (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block,
++ ipe->n, ipe->max_fill + ipe->min_gap);
++ vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail);
++
++ return 1;
++}
++#endif
++
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++static unsigned int mc_terminate_add_emu(const HEVCRpiContext * const s,
++ const vpu_qpu_job_h vqj,
++ rpi_cache_flush_env_t * const rfe,
++ HEVCRpiInterPredEnv * const ipe)
++{
++ unsigned int i;
++ if (!ipe->used) {
++ return 0;
++ }
++
++ if (ipe->curr != 0) {
++ rpi_inter_pred_sync(ipe);
++ }
++
++ // Add final commands to Q
++ for(i = 0; i != ipe->n; ++i) {
++ HEVCRpiInterPredQ * const yp = ipe->q + i;
++ qpu_mc_src_t *const p0 = yp->last_l0;
++ qpu_mc_src_t *const p1 = yp->last_l1;
++
++ yp->qpu_mc_curr->data[-1] = yp->code_exit;
++
++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
++ p0->x = MC_DUMMY_X;
++ p0->y = MC_DUMMY_Y;
++ p0->base = s->qpu_dummy_frame_emu;
++ p1->x = MC_DUMMY_X;
++ p1->y = MC_DUMMY_Y;
++ p1->base = s->qpu_dummy_frame_emu;
++
++ yp->last_l0 = NULL;
++ yp->last_l1 = NULL;
++ }
++
++ return 1;
++}
++#endif
++
++
++#if RPI_QPU_EMU_Y
++#define mc_terminate_add_y mc_terminate_add_emu
++#else
++#define mc_terminate_add_y mc_terminate_add_qpu
++#endif
++#if RPI_QPU_EMU_C
++#define mc_terminate_add_c mc_terminate_add_emu
++#else
++#define mc_terminate_add_c mc_terminate_add_qpu
++#endif
++
++
++static void flush_frame(HEVCRpiContext *s,AVFrame *frame)
++{
++ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init();
++ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
++ rpi_cache_flush_finish(rfe);
++}
++
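++// Compute the job's bounding rectangle in pels from its first & last CTB
++// raster addresses.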
++static void job_gen_bounds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++ const unsigned int rs0 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_first];
++ const unsigned int rs1 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_last];
++ const unsigned int ctb_width = s->ps.sps->ctb_width;
++ RpiBlk *const bounds = &jb->bounds;
++ av_assert1(jb->ctu_ts_first <= jb->ctu_ts_last);
++ bounds->x = (rs0 % ctb_width) << s->ps.sps->log2_ctb_size;
++ bounds->y = (rs0 / ctb_width) << s->ps.sps->log2_ctb_size;
++ bounds->w = ((rs1 - rs0) % ctb_width + 1) << s->ps.sps->log2_ctb_size;
++ bounds->h = ((rs1 - rs0) / ctb_width + 1) << s->ps.sps->log2_ctb_size;
++}
++
++#if RPI_PASSES == 2
++static void worker_core2(HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++ // Perform intra prediction and residual reconstruction
++ rpi_execute_pred_cmds(s, jb);
++
++ // Perform deblocking for CTBs in this row
++ rpi_execute_dblk_cmds(s, jb);
++}
++#endif
++
++
++// Core execution tasks
++static void worker_core(HEVCRpiContext * const s0, HEVCRpiJob * const jb)
++{
++ const HEVCRpiContext * const s = s0;
++ vpu_qpu_wait_h sync_y;
++ int pred_y, pred_c;
++ const vpu_qpu_job_h vqj = vpu_qpu_job_new();
++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
++
++ {
++ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
++ if (cf->s[3].n + cf->s[2].n != 0)
++ {
++ const unsigned int csize = sizeof(cf->s[3].buf[0]);
++ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize;
++ vpu_qpu_job_add_vpu(vqj,
++ vpu_get_fn(s->ps.sps->bit_depth),
++ vpu_get_constants(),
++ cf->gptr.vc,
++ cf->s[2].n >> 8,
++ cf->gptr.vc + offset32,
++ cf->s[3].n >> 10,
++ 0);
++
++ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize);
++ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize);
++ }
++ }
++
++ pred_c = mc_terminate_add_c(s, vqj, rfe, &jb->chroma_ip);
++
++// We could take a sync here and try to locally overlap QPU processing with ARM
++// but testing showed a slightly negative benefit with noticeable extra complexity
++
++ pred_y = mc_terminate_add_y(s, vqj, rfe, &jb->luma_ip);
++
++ vpu_qpu_job_add_sync_this(vqj, &sync_y);
++
++ rpi_cache_flush_execute(rfe);
++
++ // Await progress as required
++ // jb->waited will only be clear if we have already tested the progress values
++ // (in worker_submit_job) and found we don't have to wait
++ if (jb->waited)
++ {
++ unsigned int i;
++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) {
++ if (jb->progress_req[i] >= 0) {
++ ff_hevc_rpi_progress_wait_recon(s, jb, s->DPB + i, jb->progress_req[i]);
++ }
++ }
++ }
++
++ vpu_qpu_job_finish(vqj);
++
++ // We always work on a rectangular block
++ if (pred_y || pred_c)
++ {
++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE,
++ jb->bounds.x, jb->bounds.y, jb->bounds.w, jb->bounds.h,
++ ctx_vshift(s, 1), pred_y, pred_c);
++ }
++
++ // If we have emulated VPU ops - do it here
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++ if (av_rpi_is_sand8_frame(s->frame))
++ {
++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
++ ff_hevc_rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip);
++#elif RPI_QPU_EMU_Y
++ ff_hevc_rpi_shader_c8(s, &jb->luma_ip, NULL);
++#else
++ ff_hevc_rpi_shader_c8(s, NULL, &jb->chroma_ip);
++#endif
++ }
++ else
++ {
++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
++ ff_hevc_rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip);
++#elif RPI_QPU_EMU_Y
++ ff_hevc_rpi_shader_c16(s, &jb->luma_ip, NULL);
++#else
++ ff_hevc_rpi_shader_c16(s, NULL, &jb->chroma_ip);
++#endif
++ }
++#endif
++
++ // Wait for transform completion
++ // ? Could/should be moved to next pass which would let us add more jobs
++ // to the VPU Q on this thread but when I tried that it all went a bit slower
++ vpu_qpu_wait(&sync_y);
++
++ rpi_cache_flush_finish(rfe);
++}
++
++
++static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe)
++{
++ av_freep(&ipe->q);
++ gpu_free(&ipe->gptr);
++}
++
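++// Allocate & init a job: progress wait state, intra pred command array and
++// the luma & chroma QPU command buffers.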
++static HEVCRpiJob * job_new(void)
++{
++ HEVCRpiJob * const jb = av_mallocz(sizeof(HEVCRpiJob));
++
++ ff_hevc_rpi_progress_init_wait(&jb->progress_wait);
++
++ jb->intra.n = 0;
++ jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS);
++
++ // * Sizeof the union structure might be overkill but at the moment it
++ // is correct (it certainly isn't going to be too small)
++ // *** really should add per ctu sync words to be accurate
++
++ rpi_inter_pred_alloc(&jb->chroma_ip,
++ QPU_N_MAX, QPU_N_GRP,
++ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t),
++ QPU_C_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_c_t));
++ rpi_inter_pred_alloc(&jb->luma_ip,
++ QPU_N_MAX, QPU_N_GRP,
++ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t),
++ QPU_Y_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_y_t));
++
++ return jb;
++}
++
++static void job_delete(HEVCRpiJob * const jb)
++{
++ worker_pic_free_one(jb);
++ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait);
++ av_freep(&jb->intra.cmds);
++ rpi_free_inter_pred(&jb->chroma_ip);
++ rpi_free_inter_pred(&jb->luma_ip);
++}
++
++static void jbg_delete(HEVCRpiJobGlobal * const jbg)
++{
++ HEVCRpiJob * jb;
++
++ if (jbg == NULL)
++ return;
++
++ jb = jbg->free1;
++ while (jb != NULL)
++ {
++ HEVCRpiJob * const jb2 = jb;
++ jb = jb2->next;
++ job_delete(jb2);
++ }
++
++ pthread_mutex_destroy(&jbg->lock);
++ av_free(jbg);
++}
++
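++// Allocate the global job pool with job_count jobs pre-built on the free list.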
++static HEVCRpiJobGlobal * jbg_new(unsigned int job_count)
++{
++ HEVCRpiJobGlobal * const jbg = av_mallocz(sizeof(HEVCRpiJobGlobal));
++ if (jbg == NULL)
++ return NULL;
++
++ pthread_mutex_init(&jbg->lock, NULL);
++
++ while (job_count-- != 0)
++ {
++ HEVCRpiJob * const jb = job_new();
++ if (jb == NULL)
++ goto fail;
++
++ jb->next = jbg->free1;
++ jbg->free1 = jb;
++ }
++
++ return jbg;
++
++fail:
++ jbg_delete(jbg);
++ return NULL;
++}
++
++static void rpi_job_ctl_delete(HEVCRpiJobCtl * const jbc)
++{
++ HEVCRpiJobGlobal * jbg;
++
++ if (jbc == NULL)
++ return;
++
++ jbg = jbc->jbg;
++
++ if (jbc->jb1 != NULL)
++ job_delete(jbc->jb1);
++
++ pthread_mutex_destroy(&jbc->in_lock);
++ sem_destroy(&jbc->sem_out);
++ av_free(jbc);
++
++ // Deref the global job context
++ if (jbg != NULL && atomic_fetch_add(&jbg->ref_count, -1) == 1)
++ jbg_delete(jbg);
++}
++
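++// Per-stream job control: takes a ref on the global job pool and
++// pre-allocates one local job (jb1).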
++static HEVCRpiJobCtl * rpi_job_ctl_new(HEVCRpiJobGlobal *const jbg)
++{
++ HEVCRpiJobCtl * const jbc = av_mallocz(sizeof(HEVCRpiJobCtl));
++
++ if (jbc == NULL)
++ return NULL;
++
++ jbc->jbg = jbg;
++ atomic_fetch_add(&jbg->ref_count, 1);
++
++ sem_init(&jbc->sem_out, 0, RPI_MAX_JOBS);
++ pthread_mutex_init(&jbc->in_lock, NULL);
++
++ if ((jbc->jb1 = job_new()) == NULL)
++ goto fail;
++ jbc->jb1->jbc_local = jbc;
++
++ return jbc;
++
++fail:
++ rpi_job_ctl_delete(jbc);
++ return NULL;
++}
++
++
++
++static av_cold void hevc_init_worker(HEVCRpiContext * const s)
++{
++#if RPI_PASSES == 2
++ pass_queue_init(s->passq + 1, s, worker_core2, &s->jbc->sem_out, 1);
++#elif RPI_PASSES == 3
++ pass_queue_init(s->passq + 2, s, rpi_execute_dblk_cmds, &s->jbc->sem_out, 2);
++ pass_queue_init(s->passq + 1, s, rpi_execute_pred_cmds, &s->passq[2].sem_in, 1);
++#else
++#error Passes confused
++#endif
++ pass_queue_init(s->passq + 0, s, worker_core, &s->passq[1].sem_in, 0);
++
++ pass_queues_start_all(s);
++}
++
++static av_cold void hevc_exit_worker(HEVCRpiContext *s)
++{
++ pass_queues_term_all(s);
++
++ pass_queues_kill_all(s);
++
++ rpi_job_ctl_delete(s->jbc);
++ s->jbc = NULL;
++}
++
++
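++// Validate the slice segment start (dependent segment / entry point / tile
++// alignment checks) and set up the local context for decode.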
++static int slice_start(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc)
++{
++ const int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
++ const int tiles = s->ps.pps->num_tile_rows * s->ps.pps->num_tile_columns;
++
++ // Check for obvious disasters
++ if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
++ av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ if (s->sh.dependent_slice_segment_flag) {
++ int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1];
++ if (s->tab_slice_address[prev_rs] != s->sh.slice_addr) {
++ av_log(s->avctx, AV_LOG_ERROR, "Previous slice segment missing\n");
++ return AVERROR_INVALIDDATA;
++ }
++ }
++
++ if (!s->ps.pps->entropy_coding_sync_enabled_flag &&
++ s->ps.pps->tile_id[ctb_addr_ts] + s->sh.num_entry_point_offsets >= tiles)
++ {
++ av_log(s->avctx, AV_LOG_ERROR, "Entry points exceed tiles\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ // Tiled stuff must start at start of tile if it has multiple entry points
++ if (!s->ps.pps->entropy_coding_sync_enabled_flag &&
++ s->sh.num_entry_point_offsets != 0 &&
++ s->sh.slice_ctb_addr_rs != s->ps.pps->tile_pos_rs[s->ps.pps->tile_id[ctb_addr_ts]])
++ {
++ av_log(s->avctx, AV_LOG_ERROR, "Multiple tiles in slice; slice start != tile start\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ // Setup any required decode vars
++ if (!s->sh.dependent_slice_segment_flag)
++ lc->qPy_pred = s->sh.slice_qp;
++
++ lc->qp_y = s->sh.slice_qp;
++
++ // General setup
++ lc->wpp_init = 0;
++ lc->bt_line_no = 0;
++ lc->ts = ctb_addr_ts;
++ return 0;
++}
++
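++// Turn the slice header entry point offsets, which count bytes of the raw
++// (unstripped) stream, into offset/size pairs into the stripped NAL data by
++// discounting the emulation prevention bytes recorded in skipped_bytes_pos.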
++static int gen_entry_points(HEVCRpiContext * const s, const H2645NAL * const nal)
++{
++ const GetBitContext * const gb = &s->HEVClc->gb;
++ int i, j;
++
++ const unsigned int length = nal->size;
++ unsigned int offset = ((gb->index) >> 3) + 1; // We have a bit & align still to come = +1 byte
++ unsigned int cmpt;
++ unsigned int startheader;
++
++ if (s->sh.num_entry_point_offsets == 0) {
++ return 0;
++ }
++
++ for (j = 0, cmpt = 0, startheader = offset + s->sh.entry_point_offset[0]; j < nal->skipped_bytes; j++) {
++ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
++ startheader--;
++ cmpt++;
++ }
++ }
++
++ for (i = 1; i < s->sh.num_entry_point_offsets; i++) {
++ offset += (s->sh.entry_point_offset[i - 1] - cmpt);
++ for (j = 0, cmpt = 0, startheader = offset
++ + s->sh.entry_point_offset[i]; j < nal->skipped_bytes; j++) {
++ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
++ startheader--;
++ cmpt++;
++ }
++ }
++ s->sh.size[i - 1] = s->sh.entry_point_offset[i] - cmpt;
++ s->sh.offset[i - 1] = offset;
++ }
++ if (s->sh.num_entry_point_offsets != 0) {
++ offset += s->sh.entry_point_offset[s->sh.num_entry_point_offsets - 1] - cmpt;
++ if (length < offset) {
++ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n");
++ return AVERROR_INVALIDDATA;
++ }
++ s->sh.size[s->sh.num_entry_point_offsets - 1] = length - offset;
++ s->sh.offset[s->sh.num_entry_point_offsets - 1] = offset;
++ }
++ s->data = nal->data;
++ return 0;
++}
++
++
++// Return
++// < 0 Error
++// 0 OK
++//
++// jb->ctu_ts_last < 0 Job still filling
++// jb->ctu_ts_last >= 0 Job ready
++
++static int fill_job(HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, unsigned int max_blocks)
++{
++ const int ctb_size = (1 << s->ps.sps->log2_ctb_size);
++ HEVCRpiJob * const jb = lc->jb0;
++ int more_data = 1;
++ int ctb_addr_ts = lc->ts;
++
++ lc->unit_done = 0;
++ while (more_data && ctb_addr_ts < s->ps.sps->ctb_size)
++ {
++ const int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++ const int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size;
++ const int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size;
++ int q_full;
++
++ hls_decode_neighbour(s, lc, x_ctb, y_ctb, ctb_addr_ts);
++
++ ff_hevc_rpi_cabac_init(s, lc, ctb_addr_ts);
++
++ hls_sao_param(s, lc, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size);
++
++ s->deblock[ctb_addr_rs].beta_offset = s->sh.beta_offset;
++ s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset;
++ s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag;
++
++ more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
++
++ if (more_data < 0) {
++ s->tab_slice_address[ctb_addr_rs] = -1;
++ return more_data;
++ }
++
++ // Inc TS to next.
++ // N.B. None of the other position vars have changed
++ ctb_addr_ts++;
++ ff_hevc_rpi_save_states(s, lc, ctb_addr_ts);
++
++ // Report progress so we can use our MVs in other frames
++ if (s->threads_type == FF_THREAD_FRAME && x_ctb + ctb_size >= s->ps.sps->width) {
++ ff_hevc_rpi_progress_signal_mv(s, y_ctb + ctb_size - 1);
++ }
++
++ // End of line || End of tile line || End of tile
++ // (EoL covers end of frame for our purposes here)
++ q_full = x_ctb + ctb_size >= s->ps.sps->width ||
++ s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts] != ctb_addr_rs + 1 ||
++ s->ps.pps->tile_id[ctb_addr_ts - 1] != s->ps.pps->tile_id[ctb_addr_ts];
++
++ // Allocate QPU chunks on fixed-size 64 pel boundaries rather than
++ // whatever ctb_size is today.
++ // * We might quite like to continue to 64 pel vertical too but that
++ // currently confuses WPP
++ if (((x_ctb + ctb_size) & 63) == 0 || q_full)
++ {
++ int overflow = 0;
++ if (rpi_inter_pred_next_ctu(&jb->luma_ip) != 0)
++ overflow = 1;
++ if (rpi_inter_pred_next_ctu(&jb->chroma_ip) != 0)
++ overflow = 1;
++ if (overflow)
++ {
++ // * This is very annoying (and slow) to cope with in WPP so
++ // we treat it as an error there (no known stream triggers this
++ // with the current buffer sizes). Non-wpp should cope fine.
++ av_log(s, AV_LOG_WARNING, "%s: Q full before EoL\n", __func__);
++ q_full = 1;
++ }
++ }
++
++ if (q_full)
++ {
++ // Do job
++ // Prep for submission
++ jb->ctu_ts_last = ctb_addr_ts - 1; // Was pre-incremented above
++ job_gen_bounds(s, jb);
++ break;
++ }
++
++ // If max_blocks started as 0 then this will never be true
++ if (--max_blocks == 0)
++ break;
++ }
++
++ lc->unit_done = (more_data <= 0);
++ lc->ts = ctb_addr_ts;
++ return 0;
++}
++
++static void bt_lc_init(HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const unsigned int n)
++{
++ lc->context = s;
++ lc->jb0 = NULL;
++ lc->lc_n = n;
++ lc->bt_terminate = 0;
++ lc->bt_psem_out = NULL;
++ sem_init(&lc->bt_sem_in, 0, 0);
++}
++
++#define TRACE_WPP 0
++#if RPI_EXTRA_BIT_THREADS > 0
++static inline unsigned int line_ts_width(const HEVCRpiContext * const s, unsigned int ts)
++{
++ unsigned int rs = s->ps.pps->ctb_addr_ts_to_rs[ts];
++ return s->ps.pps->column_width[s->ps.pps->col_idxX[rs % s->ps.sps->ctb_width]];
++}
++
++// Move local context parameters from an aux bit thread back to the main
++// thread at the end of a slice as processing is going to continue there.
++static void movlc(HEVCRpiLocalContext *const dst_lc, HEVCRpiLocalContext *const src_lc, const int is_dep)
++{
++ if (src_lc == dst_lc) {
++ return;
++ }
++
++ // Move the job
++ // We will still have an active job if the final line terminates early
++ // Dest should always be null by now
++ av_assert1(dst_lc->jb0 == NULL);
++ dst_lc->jb0 = src_lc->jb0;
++ src_lc->jb0 = NULL;
++
++ // Always need to store where we are in the bitstream
++ dst_lc->ts = src_lc->ts;
++ dst_lc->gb = src_lc->gb;
++ // Need to store context if we might have a dependent seg
++ if (is_dep)
++ {
++ dst_lc->qPy_pred = src_lc->qPy_pred;
++ memcpy(dst_lc->cabac_state, src_lc->cabac_state, sizeof(src_lc->cabac_state));
++ memcpy(dst_lc->stat_coeff, src_lc->stat_coeff, sizeof(src_lc->stat_coeff));
++ }
++}
++
++static inline int wait_bt_sem_in(HEVCRpiLocalContext * const lc)
++{
++ rpi_sem_wait(&lc->bt_sem_in);
++ return lc->bt_terminate;
++}
++
++// Do one WPP line
++// Will not work correctly over horizontal tile boundaries - vertical should be OK
++static int rpi_run_one_line(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc, const int is_first)
++{
++ const int is_tile = lc->bt_is_tile;
++ const unsigned int tile_id = s->ps.pps->tile_id[lc->ts];
++ const unsigned int line = lc->bt_line_no;
++ const unsigned int line_inc = lc->bt_line_inc;
++ const int is_last = (line >= lc->bt_last_line);
++
++ const unsigned int ts_eol = lc->ts + (is_tile ? s->ps.pps->tile_size[tile_id] : lc->bt_line_width);
++ const unsigned int ts_next =
++ line + line_inc > (unsigned int)s->sh.num_entry_point_offsets ?
++ INT_MAX :
++ is_tile ?
++ s->ps.pps->ctb_addr_rs_to_ts[s->ps.pps->tile_pos_rs[tile_id + line_inc]] :
++ lc->ts + lc->bt_line_width * line_inc;
++ // Tile wants line, WPP a few CTUs (must be >= 2 for cabac context to work)
++ const unsigned int partial_size = is_tile ? line_ts_width(s, lc->ts) : 2;
++ unsigned int ts_prev;
++ int loop_n = 0;
++ int err = 0;
++
++ av_assert1(line <= s->sh.num_entry_point_offsets);
++
++#if TRACE_WPP
++ printf("%s[%d]: Start %s: tile=%d, line=%d/%d/%d, ts=%d/%d/%d, width=%d, jb=%p\n", __func__,
++ lc->lc_n, is_tile ? "Tile" : "WPP", tile_id,
++ line, lc->bt_last_line, s->sh.num_entry_point_offsets,
++ lc->ts, ts_eol, ts_next, partial_size, lc->jb0);
++#endif
++ if (line != 0)
++ {
++ const uint8_t * const data = s->data + s->sh.offset[line - 1];
++ const unsigned int len = s->sh.size[line - 1];
++ if ((err = init_get_bits8(&lc->gb, data, len)) < 0)
++ return err;
++
++ ff_init_cabac_decoder(&lc->cc, data, len);
++
++ lc->wpp_init = 1; // Stop ff_hevc_rpi_cabac_init trying to read non-existent termination bits
++ }
++
++ // We should never be processing a dependent slice here so reset is good
++ // ?? These probably shouldn't be needed (as they should be set by later
++ // logic) but do seem to be required
++ lc->qPy_pred = s->sh.slice_qp;
++ lc->qp_y = s->sh.slice_qp;
++
++ do
++ {
++ if (!is_last && loop_n > 1) {
++#if TRACE_WPP
++ printf("%s[%d]: %sPoke %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", lc->bt_psem_out);
++#endif
++ sem_post(lc->bt_psem_out);
++ }
++ if (!is_first && loop_n != 0)
++ {
++#if TRACE_WPP
++ printf("%s[%d]: %sWait %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", &lc->bt_sem_in);
++#endif
++ if (wait_bt_sem_in(lc) != 0)
++ return AVERROR_EXIT;
++ }
++
++#if TRACE_WPP
++ {
++ int n;
++ sem_getvalue(&lc->bt_sem_in, &n);
++ printf("%s[%d]: ts=%d, sem=%d %p\n", __func__, lc->lc_n, lc->ts, n, &lc->bt_sem_in);
++ }
++#endif
++
++ ts_prev = lc->ts;
++
++ // If we have had an error - do no further decode but do continue
++ // moving signals around so the other threads continue to operate
++ // correctly (or at least as correctly as they can with this line missing)
++ //
++ // Errors in WPP/Tile are less fatal than normal as we have a good idea
++ // of how to restart on the next line so there is no need to give up totally
++ if (err != 0)
++ {
++ lc->unit_done = 0;
++ lc->ts += partial_size;
++ }
++ else
++ {
++ worker_pass0_ready(s, lc);
++
++ if ((err = fill_job(s, lc, partial_size)) < 0 ||
++ (lc->ts < ts_eol && !is_last && (lc->ts != ts_prev + partial_size || lc->unit_done)))
++ {
++ if (err == 0) {
++ av_log(s, AV_LOG_ERROR, "Unexpected end of tile/wpp section\n");
++ err = AVERROR_INVALIDDATA;
++ }
++ worker_free(s, lc);
++ lc->ts = ts_prev + partial_size; // Pretend we did all that
++ lc->unit_done = 0;
++ }
++ else if (is_tile)
++ {
++ worker_submit_job(s, lc);
++ }
++ }
++
++ ++loop_n;
++ } while (lc->ts < ts_eol && !lc->unit_done);
++
++ // If we are on the last line & we didn't get a whole line we must wait for
++ // and sink the sem_posts from the line above / tile to the left.
++ while ((ts_prev += partial_size) < ts_eol)
++ {
++#if TRACE_WPP
++ printf("%s[%d]: EOL Wait: ts=%d %p\n", __func__, lc->lc_n, ts_prev, &lc->bt_sem_in);
++#endif
++ if (wait_bt_sem_in(lc) != 0)
++ return AVERROR_EXIT;
++ }
++
++ lc->bt_line_no += line_inc;
++
++ if (!is_tile && err == 0)
++ worker_submit_job(s, lc);
++
++ if (!is_last) {
++ lc->ts = ts_next;
++
++#if TRACE_WPP
++ printf("%s[%d]: Poke post submit %p\n", __func__, lc->lc_n, lc->bt_psem_out);
++#endif
++ sem_post(lc->bt_psem_out);
++ if (loop_n > 1) {
++#if TRACE_WPP
++ printf("%s[%d]: Poke post submit2 %p\n", __func__, lc->lc_n, lc->bt_psem_out);
++#endif
++ sem_post(lc->bt_psem_out);
++ }
++ }
++ else
++ {
++ movlc(s->HEVClcList[0], lc, s->ps.pps->dependent_slice_segments_enabled_flag);
++
++ // When all done poke the thread 0 sem_in one final time
++#if TRACE_WPP
++ printf("%s[%d]: Poke final %p\n", __func__, lc->lc_n, &s->HEVClcList[0]->bt_sem_in);
++#endif
++ sem_post(&s->HEVClcList[0]->bt_sem_in);
++ }
++
++#if TRACE_WPP
++ printf("%s[%d]: End. dep=%d\n", __func__, lc->lc_n, s->ps.pps->dependent_slice_segments_enabled_flag);
++#endif
++ return err;
++}
++
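++// Hand one WPP CTB line to each bit thread local context, interleaved with
++// stride RPI_BIT_THREADS, starting at the slice segment address.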
++static void wpp_setup_lcs(HEVCRpiContext * const s)
++{
++ unsigned int ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
++ const unsigned int line_width = line_ts_width(s, ts);
++
++ for (int i = 0; i <= s->sh.num_entry_point_offsets && i < RPI_BIT_THREADS; ++i)
++ {
++ HEVCRpiLocalContext * const lc = s->HEVClcList[i];
++ lc->ts = ts;
++ lc->bt_is_tile = 0;
++ lc->bt_line_no = i;
++ lc->bt_line_width = line_width;
++ lc->bt_last_line = s->sh.num_entry_point_offsets;
++ lc->bt_line_inc = RPI_BIT_THREADS;
++ ts += line_width;
++ }
++}
++
++
++// Can only process tile single row at once
++static void tile_one_row_setup_lcs(HEVCRpiContext * const s, unsigned int slice_row)
++{
++ const HEVCRpiPPS * const pps = s->ps.pps;
++ const unsigned int ts0 = pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
++ const unsigned int tile0 = pps->tile_id[ts0];
++ const unsigned int col0 = tile0 % pps->num_tile_columns;
++
++ const unsigned int col = (slice_row == 0) ? col0 : 0;
++ unsigned int line = slice_row * pps->num_tile_columns - col0 + col;
++ const unsigned int last_line = FFMIN(
++ line + pps->num_tile_columns - 1 - col, s->sh.num_entry_point_offsets);
++
++ const unsigned int par =
++ FFMIN(RPI_BIT_THREADS, last_line + 1 - line);
++#if TRACE_WPP
++ printf("ts0=%d, ents=%d, row=%d, tiles=%dx%d, col=%d, par=%d, line=%d/%d\n", ts0, s->sh.num_entry_point_offsets, slice_row,
++ pps->num_tile_columns, pps->num_tile_rows, col, par, line, last_line);
++#endif
++ for (unsigned int i = 0; i != par; ++i, ++line)
++ {
++ HEVCRpiLocalContext * const lc = s->HEVClcList[i];
++ const unsigned int tile = tile0 + line;
++
++ lc->ts = pps->ctb_addr_rs_to_ts[pps->tile_pos_rs[tile]];
++ lc->bt_line_no = line;
++ lc->bt_is_tile = 1;
++ lc->bt_line_width = line_ts_width(s, lc->ts);
++ lc->bt_last_line = last_line;
++ lc->bt_line_inc = par;
++ }
++}
++
++
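++// Aux bit thread main loop: sleep on bt_sem_in until poked, decode one
++// WPP/tile line, repeat until told to terminate.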
++static void * bit_thread(void * v)
++{
++ HEVCRpiLocalContext * const lc = v;
++ HEVCRpiContext *const s = lc->context;
++
++ while (wait_bt_sem_in(lc) == 0)
++ {
++ int err;
++
++ if ((err = rpi_run_one_line(s, lc, 0)) < 0) { // Never first tile/wpp
++ if (lc->bt_terminate) {
++ av_log(s, AV_LOG_ERROR, "%s: Unexpected termination\n", __func__);
++ break;
++ }
++ av_log(s, AV_LOG_WARNING, "%s: Decode failure: %d\n", __func__, err);
++ }
++ }
++
++ return NULL;
++}
++
++static int bit_threads_start(HEVCRpiContext * const s)
++{
++ if (s->bt_started)
++ return 0;
++
++ for (int i = 1; i < RPI_BIT_THREADS; ++i)
++ {
++ // lc[0] belongs to the main thread - this sets up lc[1..RPI_BIT_THREADS]
++ if (s->HEVClcList[i] == NULL) {
++ if ((s->HEVClcList[i] = av_mallocz(sizeof(*s->HEVClcList[0]))) == NULL)
++ return -1;
++ }
++
++ bt_lc_init(s, s->HEVClcList[i], i);
++ job_lc_init(s->HEVClcList[i]);
++ }
++
++ // Link the sems in a circle
++ for (int i = 0; i < RPI_BIT_THREADS - 1; ++i)
++ s->HEVClcList[i]->bt_psem_out = &s->HEVClcList[i + 1]->bt_sem_in;
++ s->HEVClcList[RPI_BIT_THREADS - 1]->bt_psem_out = &s->HEVClcList[0]->bt_sem_in;
++
++ // Init all lc before starting any threads
++ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i)
++ {
++ if (pthread_create(s->bit_threads + i, NULL, bit_thread, s->HEVClcList[i + 1]) < 0)
++ return -1;
++ }
++
++ s->bt_started = 1;
++ return 0;
++}
++
++static int bit_threads_kill(HEVCRpiContext * const s)
++{
++ if (!s->bt_started)
++ return 0;
++ s->bt_started = 0;
++
++ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i)
++ {
++ HEVCRpiLocalContext *const lc = s->HEVClcList[i + 1];
++ if (lc == NULL)
++ break;
++
++ lc->bt_terminate = 1;
++ sem_post(&lc->bt_sem_in);
++ pthread_join(s->bit_threads[i], NULL);
++
++ sem_destroy(&lc->bt_sem_in);
++ job_lc_kill(lc);
++ }
++ return 0;
++}
++#endif
++
++
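++// Slice decode entry point (run on the main decode thread). Chooses between
++// the multi-row tile path, the WPP path and the plain single-threaded path,
++// then waits for the worker to drain at end of frame. Returns the final CTB
++// TS address on success or < 0 on error.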
++static int rpi_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++{
++ HEVCRpiContext * const s = avctxt->priv_data;
++ HEVCRpiLocalContext * const lc = s->HEVClc;
++ int err;
++
++ // Start of slice
++ if ((err = slice_start(s, lc)) != 0)
++ return err;
++
++#if RPI_EXTRA_BIT_THREADS > 0
++
++ if (s->sh.num_entry_point_offsets != 0 &&
++ s->ps.pps->num_tile_columns > 1)
++ {
++ unsigned int slice_row = 0;
++
++#if TRACE_WPP
++ printf("%s: Do Tiles\n", __func__);
++#endif
++ // Generate & start extra bit threads if they aren't already running
++ bit_threads_start(s);
++
++ do
++ {
++ // Reset lc lines etc.
++ tile_one_row_setup_lcs(s, slice_row);
++
++#if TRACE_WPP
++ printf("%s: Row %d: Do 1st: line=%d/%d/%d\n",
++ __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
++#endif
++
++ rpi_run_one_line(s, lc, 1); // Kicks off the other threads
++#if TRACE_WPP
++ printf("%s: Row %d: Done 1st: line=%d/%d/%d\n",
++ __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
++#endif
++
++ while (lc->bt_line_no <= lc->bt_last_line) {
++ rpi_sem_wait(&lc->bt_sem_in);
++ rpi_run_one_line(s, lc, 0);
++ }
++#if TRACE_WPP
++ printf("%s: Done body\n", __func__);
++#endif
++
++ // Wait for everything else to finish
++ rpi_sem_wait(&lc->bt_sem_in);
++
++ ++slice_row;
++ } while (lc->bt_last_line < s->sh.num_entry_point_offsets);
++
++
++#if TRACE_WPP
++ printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
++#endif
++ }
++ else
++
++ // * We only cope with WPP in a single column
++ // Probably want to deal with that case as tiles rather than WPP anyway
++ // ?? Not actually sure that the main code deals with WPP + multi-col correctly
++ if (s->ps.pps->entropy_coding_sync_enabled_flag &&
++ s->ps.pps->num_tile_columns == 1 &&
++ s->sh.num_entry_point_offsets != 0)
++ {
++#if TRACE_WPP
++ printf("%s: Do WPP\n", __func__);
++#endif
++ // Generate & start extra bit threads if they aren't already running
++ bit_threads_start(s);
++
++ // Reset lc lines etc.
++ wpp_setup_lcs(s);
++
++ rpi_run_one_line(s, lc, 1); // Kicks off the other threads
++#if TRACE_WPP
++ printf("%s: Done 1st\n", __func__);
++#endif
++
++ while (lc->bt_line_no <= s->sh.num_entry_point_offsets) {
++ rpi_sem_wait(&lc->bt_sem_in);
++ rpi_run_one_line(s, lc, 0);
++ }
++#if TRACE_WPP
++ printf("%s: Done body\n", __func__);
++#endif
++
++ // Wait for everything else to finish
++ rpi_sem_wait(&lc->bt_sem_in);
++
++#if TRACE_WPP
++ printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
++#endif
++ }
++ else
++#endif
++ {
++#if TRACE_WPP
++ printf("%s: Single start: ts=%d\n", __func__, lc->ts);
++#endif
++ // Single bit thread
++ do {
++ // Make sure we have space to prepare the next job
++ worker_pass0_ready(s, lc);
++
++ if ((err = fill_job(s, lc, 0)) < 0)
++ goto fail;
++
++ worker_submit_job(s, lc);
++ } while (!lc->unit_done);
++
++#if TRACE_WPP
++ printf("%s: Single end: ts=%d\n", __func__, lc->ts);
++#endif
++ }
++
++ // If we have reached the end of the frame then wait for the worker to finish all its jobs
++ if (lc->ts >= s->ps.sps->ctb_size) {
++ worker_wait(s, lc);
++ }
++
++#if RPI_TSTATS
++ {
++ HEVCRpiStats *const ts = &s->tstats;
++
++ printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n",
++ ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0,
++ ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge,
++ ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0,
++ ts->y_pred2_hgt16, ts->y_pred2_hle16);
++ memset(ts, 0, sizeof(*ts));
++ }
++#endif
++
++ return lc->ts;
++
++fail:
++ // Cleanup
++ av_log(s, AV_LOG_ERROR, "%s failed: err=%d\n", __func__, err);
++ // Free our job & wait for termination
++ worker_free(s, lc);
++ worker_wait(s, lc);
++ return err;
++}
++
++
++static int hls_slice_data(HEVCRpiContext * const s, const H2645NAL * const nal)
++{
++ int err;
++ if ((err = gen_entry_points(s, nal)) < 0)
++ return err;
++
++ return rpi_decode_entry(s->avctx, NULL);
++}
++
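++// Export SEI-derived side data to the output frame: stereo 3D frame packing,
++// display orientation, mastering display & content light level metadata,
++// A53 closed captions and alternative transfer characteristics.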
++static int set_side_data(HEVCRpiContext *s)
++{
++ AVFrame *out = s->ref->frame;
++
++ if (s->sei.frame_packing.present &&
++ s->sei.frame_packing.arrangement_type >= 3 &&
++ s->sei.frame_packing.arrangement_type <= 5 &&
++ s->sei.frame_packing.content_interpretation_type > 0 &&
++ s->sei.frame_packing.content_interpretation_type < 3) {
++ AVStereo3D *stereo = av_stereo3d_create_side_data(out);
++ if (!stereo)
++ return AVERROR(ENOMEM);
++
++ switch (s->sei.frame_packing.arrangement_type) {
++ case 3:
++ if (s->sei.frame_packing.quincunx_subsampling)
++ stereo->type = AV_STEREO3D_SIDEBYSIDE_QUINCUNX;
++ else
++ stereo->type = AV_STEREO3D_SIDEBYSIDE;
++ break;
++ case 4:
++ stereo->type = AV_STEREO3D_TOPBOTTOM;
++ break;
++ case 5:
++ stereo->type = AV_STEREO3D_FRAMESEQUENCE;
++ break;
++ }
++
++ if (s->sei.frame_packing.content_interpretation_type == 2)
++ stereo->flags = AV_STEREO3D_FLAG_INVERT;
++ }
++
++ if (s->sei.display_orientation.present &&
++ (s->sei.display_orientation.anticlockwise_rotation ||
++ s->sei.display_orientation.hflip || s->sei.display_orientation.vflip)) {
++ double angle = s->sei.display_orientation.anticlockwise_rotation * 360 / (double) (1 << 16);
++ AVFrameSideData *rotation = av_frame_new_side_data(out,
++ AV_FRAME_DATA_DISPLAYMATRIX,
++ sizeof(int32_t) * 9);
++ if (!rotation)
++ return AVERROR(ENOMEM);
++
++ av_display_rotation_set((int32_t *)rotation->data, angle);
++ av_display_matrix_flip((int32_t *)rotation->data,
++ s->sei.display_orientation.hflip,
++ s->sei.display_orientation.vflip);
++ }
++
++ // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1
++ // so the side data persists for the entire coded video sequence.
++ if (s->sei.mastering_display.present > 0 &&
++ IS_IRAP(s) && s->no_rasl_output_flag) {
++ s->sei.mastering_display.present--;
++ }
++ if (s->sei.mastering_display.present) {
++ // HEVC uses a g,b,r ordering, which we convert to a more natural r,g,b
++ const int mapping[3] = {2, 0, 1};
++ const int chroma_den = 50000;
++ const int luma_den = 10000;
++ int i;
++ AVMasteringDisplayMetadata *metadata =
++ av_mastering_display_metadata_create_side_data(out);
++ if (!metadata)
++ return AVERROR(ENOMEM);
++
++ for (i = 0; i < 3; i++) {
++ const int j = mapping[i];
++ metadata->display_primaries[i][0].num = s->sei.mastering_display.display_primaries[j][0];
++ metadata->display_primaries[i][0].den = chroma_den;
++ metadata->display_primaries[i][1].num = s->sei.mastering_display.display_primaries[j][1];
++ metadata->display_primaries[i][1].den = chroma_den;
++ }
++ metadata->white_point[0].num = s->sei.mastering_display.white_point[0];
++ metadata->white_point[0].den = chroma_den;
++ metadata->white_point[1].num = s->sei.mastering_display.white_point[1];
++ metadata->white_point[1].den = chroma_den;
++
++ metadata->max_luminance.num = s->sei.mastering_display.max_luminance;
++ metadata->max_luminance.den = luma_den;
++ metadata->min_luminance.num = s->sei.mastering_display.min_luminance;
++ metadata->min_luminance.den = luma_den;
++ metadata->has_luminance = 1;
++ metadata->has_primaries = 1;
++
++ av_log(s->avctx, AV_LOG_DEBUG, "Mastering Display Metadata:\n");
++ av_log(s->avctx, AV_LOG_DEBUG,
++ "r(%5.4f,%5.4f) g(%5.4f,%5.4f) b(%5.4f %5.4f) wp(%5.4f, %5.4f)\n",
++ av_q2d(metadata->display_primaries[0][0]),
++ av_q2d(metadata->display_primaries[0][1]),
++ av_q2d(metadata->display_primaries[1][0]),
++ av_q2d(metadata->display_primaries[1][1]),
++ av_q2d(metadata->display_primaries[2][0]),
++ av_q2d(metadata->display_primaries[2][1]),
++ av_q2d(metadata->white_point[0]), av_q2d(metadata->white_point[1]));
++ av_log(s->avctx, AV_LOG_DEBUG,
++ "min_luminance=%f, max_luminance=%f\n",
++ av_q2d(metadata->min_luminance), av_q2d(metadata->max_luminance));
++ }
++ // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1
++ // so the side data persists for the entire coded video sequence.
++ if (s->sei.content_light.present > 0 &&
++ IS_IRAP(s) && s->no_rasl_output_flag) {
++ s->sei.content_light.present--;
++ }
++ if (s->sei.content_light.present) {
++ AVContentLightMetadata *metadata =
++ av_content_light_metadata_create_side_data(out);
++ if (!metadata)
++ return AVERROR(ENOMEM);
++ metadata->MaxCLL = s->sei.content_light.max_content_light_level;
++ metadata->MaxFALL = s->sei.content_light.max_pic_average_light_level;
++
++ av_log(s->avctx, AV_LOG_DEBUG, "Content Light Level Metadata:\n");
++ av_log(s->avctx, AV_LOG_DEBUG, "MaxCLL=%d, MaxFALL=%d\n",
++ metadata->MaxCLL, metadata->MaxFALL);
++ }
++
++ if (s->sei.a53_caption.a53_caption) {
++ AVFrameSideData* sd = av_frame_new_side_data(out,
++ AV_FRAME_DATA_A53_CC,
++ s->sei.a53_caption.a53_caption_size);
++ if (sd)
++ memcpy(sd->data, s->sei.a53_caption.a53_caption, s->sei.a53_caption.a53_caption_size);
++ av_freep(&s->sei.a53_caption.a53_caption);
++ s->sei.a53_caption.a53_caption_size = 0;
++ s->avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
++ }
++
++ if (s->sei.alternative_transfer.present &&
++ av_color_transfer_name(s->sei.alternative_transfer.preferred_transfer_characteristics) &&
++ s->sei.alternative_transfer.preferred_transfer_characteristics != AVCOL_TRC_UNSPECIFIED) {
++ s->avctx->color_trc = out->color_trc = s->sei.alternative_transfer.preferred_transfer_characteristics;
++ }
++
++ return 0;
++}
++
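++// Start-of-frame setup: clear the per-picture maps, allocate the new ref
++// frame, build the RPS, attach side data, bump the DPB and fetch any frame
++// ready for output before signalling setup complete to the frame threads.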
++static int hevc_frame_start(HEVCRpiContext * const s)
++{
++ int pic_size_in_ctb = ((s->ps.sps->width >> s->ps.sps->log2_min_cb_size) + 1) *
++ ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1);
++ int ret;
++
++ memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
++ memset(s->vertical_bs, 0, s->bs_width * s->bs_height);
++ memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
++ memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
++ memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
++
++ s->is_decoded = 0;
++ s->first_nal_type = s->nal_unit_type;
++
++ s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos);
++
++ ret = ff_hevc_rpi_set_new_ref(s, &s->frame, s->poc);
++ if (ret < 0)
++ goto fail;
++
++ ret = ff_hevc_rpi_frame_rps(s);
++ if (ret < 0) {
++ av_log(s->avctx, AV_LOG_ERROR, "Error constructing the frame RPS.\n");
++ goto fail;
++ }
++
++ s->ref->frame->key_frame = IS_IRAP(s);
++
++ ret = set_side_data(s);
++ if (ret < 0)
++ goto fail;
++
++ s->frame->pict_type = 3 - s->sh.slice_type;
++
++ if (!IS_IRAP(s))
++ ff_hevc_rpi_bump_frame(s);
++
++ av_frame_unref(s->output_frame);
++ ret = ff_hevc_rpi_output_frame(s, s->output_frame, 0);
++ if (ret < 0)
++ goto fail;
++
++ ff_thread_finish_setup(s->avctx);
++
++ return 0;
++
++fail:
++ if (s->ref)
++ ff_hevc_rpi_unref_frame(s, s->ref, ~0);
++ s->ref = NULL;
++ return ret;
++}
++
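++// Dispatch a single NAL unit: VPS/SPS/PPS/SEI are parsed in place; VCL NALs
++// run the slice header, frame start (for the first slice) and slice data
++// decode. Most errors are suppressed unless AV_EF_EXPLODE is set.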
++static int decode_nal_unit(HEVCRpiContext *s, const H2645NAL *nal)
++{
++ GetBitContext * const gb = &s->HEVClc->gb;
++ int ctb_addr_ts, ret;
++
++ *gb = nal->gb;
++ s->nal_unit_type = nal->type;
++ s->temporal_id = nal->temporal_id;
++
++ switch (s->nal_unit_type) {
++ case HEVC_NAL_VPS:
++ ret = ff_hevc_rpi_decode_nal_vps(gb, s->avctx, &s->ps);
++ if (ret < 0)
++ goto fail;
++ break;
++ case HEVC_NAL_SPS:
++ ret = ff_hevc_rpi_decode_nal_sps(gb, s->avctx, &s->ps,
++ s->apply_defdispwin);
++ if (ret < 0)
++ goto fail;
++ break;
++ case HEVC_NAL_PPS:
++ ret = ff_hevc_rpi_decode_nal_pps(gb, s->avctx, &s->ps);
++ if (ret < 0)
++ goto fail;
++ break;
++ case HEVC_NAL_SEI_PREFIX:
++ case HEVC_NAL_SEI_SUFFIX:
++ ret = ff_hevc_rpi_decode_nal_sei(gb, s->avctx, &s->sei, &s->ps, s->nal_unit_type);
++ if (ret < 0)
++ goto fail;
++ break;
++ case HEVC_NAL_TRAIL_R:
++ case HEVC_NAL_TRAIL_N:
++ case HEVC_NAL_TSA_N:
++ case HEVC_NAL_TSA_R:
++ case HEVC_NAL_STSA_N:
++ case HEVC_NAL_STSA_R:
++ case HEVC_NAL_BLA_W_LP:
++ case HEVC_NAL_BLA_W_RADL:
++ case HEVC_NAL_BLA_N_LP:
++ case HEVC_NAL_IDR_W_RADL:
++ case HEVC_NAL_IDR_N_LP:
++ case HEVC_NAL_CRA_NUT:
++ case HEVC_NAL_RADL_N:
++ case HEVC_NAL_RADL_R:
++ case HEVC_NAL_RASL_N:
++ case HEVC_NAL_RASL_R:
++ ret = hls_slice_header(s);
++ if (ret < 0)
++ return ret;
++
++ // The definition of _N unit types is "non-reference for other frames
++ // with the same temporal_id" so they may/will be ref frames for pics
++ // with a higher temporal_id.
++ s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 ||
++ !(s->nal_unit_type == HEVC_NAL_TRAIL_N ||
++ s->nal_unit_type == HEVC_NAL_TSA_N ||
++ s->nal_unit_type == HEVC_NAL_STSA_N ||
++ s->nal_unit_type == HEVC_NAL_RADL_N ||
++ s->nal_unit_type == HEVC_NAL_RASL_N);
++ s->offload_recon = s->used_for_ref;
++// s->offload_recon = 0;
++
++#if DEBUG_DECODE_N
++ {
++ static int z = 0;
++ if (IS_IDR(s)) {
++ z = 1;
++ }
++ if (z != 0 && z++ > DEBUG_DECODE_N) {
++ s->is_decoded = 0;
++ break;
++ }
++ }
++#endif
++ if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) {
++ s->is_decoded = 0;
++ break;
++ }
++
++ if (s->sh.first_slice_in_pic_flag) {
++ if (s->max_ra == INT_MAX) {
++ if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) {
++ s->max_ra = s->poc;
++ } else {
++ if (IS_IDR(s))
++ s->max_ra = INT_MIN;
++ }
++ }
++
++ if ((s->nal_unit_type == HEVC_NAL_RASL_R || s->nal_unit_type == HEVC_NAL_RASL_N) &&
++ s->poc <= s->max_ra) {
++ s->is_decoded = 0;
++ break;
++ } else {
++ if (s->nal_unit_type == HEVC_NAL_RASL_R && s->poc > s->max_ra)
++ s->max_ra = INT_MIN;
++ }
++
++ ret = hevc_frame_start(s);
++ if (ret < 0)
++ return ret;
++ } else if (!s->ref) {
++ av_log(s->avctx, AV_LOG_ERROR, "First slice in a frame missing.\n");
++ goto fail;
++ }
++
++ if (s->nal_unit_type != s->first_nal_type) {
++ av_log(s->avctx, AV_LOG_ERROR,
++ "Non-matching NAL types of the VCL NALUs: %d %d\n",
++ s->first_nal_type, s->nal_unit_type);
++ return AVERROR_INVALIDDATA;
++ }
++
++ if (!s->sh.dependent_slice_segment_flag &&
++ s->sh.slice_type != HEVC_SLICE_I) {
++ ret = ff_hevc_rpi_slice_rpl(s);
++ if (ret < 0) {
++ av_log(s->avctx, AV_LOG_WARNING,
++ "Error constructing the reference lists for the current slice.\n");
++ goto fail;
++ }
++ }
++
++ ctb_addr_ts = hls_slice_data(s, nal);
++ if (ctb_addr_ts >= (s->ps.sps->ctb_width * s->ps.sps->ctb_height)) {
++ s->is_decoded = 1;
++ }
++
++ if (ctb_addr_ts < 0) {
++ ret = ctb_addr_ts;
++ goto fail;
++ }
++ break;
++ case HEVC_NAL_EOS_NUT:
++ case HEVC_NAL_EOB_NUT:
++ s->seq_decode = (s->seq_decode + 1) & 0xff;
++ s->max_ra = INT_MAX;
++ break;
++ case HEVC_NAL_AUD:
++ case HEVC_NAL_FD_NUT:
++ break;
++ default:
++ av_log(s->avctx, AV_LOG_INFO,
++ "Skipping NAL unit %d\n", s->nal_unit_type);
++ }
++
++ return 0;
++fail:
++ if (s->avctx->err_recognition & AV_EF_EXPLODE)
++ return ret;
++ return 0;
++}
++
++static int decode_nal_units(HEVCRpiContext *s, const uint8_t *buf, int length)
++{
++ int i, ret = 0;
++ int eos_at_start = 1;
++
++ s->ref = NULL;
++ s->last_eos = s->eos;
++ s->eos = 0;
++
++ /* split the input packet into NAL units, so we know the upper bound on the
++ * number of slices in the frame */
++ ret = ff_h2645_packet_split(&s->pkt, buf, length, s->avctx, s->is_nalff,
++ s->nal_length_size, s->avctx->codec_id, 1);
++ if (ret < 0) {
++ av_log(s->avctx, AV_LOG_ERROR,
++ "Error splitting the input into NAL units.\n");
++ return ret;
++ }
++
++ for (i = 0; i < s->pkt.nb_nals; i++) {
++ if (s->pkt.nals[i].type == HEVC_NAL_EOB_NUT ||
++ s->pkt.nals[i].type == HEVC_NAL_EOS_NUT) {
++ if (eos_at_start) {
++ s->last_eos = 1;
++ } else {
++ s->eos = 1;
++ }
++ } else {
++ eos_at_start = 0;
++ }
++ }
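++ // An EOS/EOB seen before any VCL NAL terminates the *previous*
++ // sequence (last_eos); one seen later in the packet ends this one (eos).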
++
++ /* decode the NAL units */
++ for (i = 0; i < s->pkt.nb_nals; i++) {
++ ret = decode_nal_unit(s, &s->pkt.nals[i]);
++ if (ret < 0) {
++ av_log(s->avctx, AV_LOG_WARNING,
++ "Error parsing NAL unit #%d.\n", i);
++ goto fail;
++ }
++ }
++
++fail: // Also success path
++ if (s->ref != NULL) {
++ if (s->used_for_ref && s->threads_type == FF_THREAD_FRAME) {
++ ff_hevc_rpi_progress_signal_all_done(s);
++ }
++ else {
++ // Flush frame to real memory as we expect to be able to pass
++ // it straight on to mmal
++ flush_frame(s, s->frame);
++ }
++ }
++ return ret;
++}
++
++static void print_md5(void *log_ctx, int level, uint8_t md5[16])
++{
++ int i;
++ for (i = 0; i < 16; i++)
++ av_log(log_ctx, level, "%02"PRIx8, md5[i]);
++}
++
++static int verify_md5(HEVCRpiContext *s, AVFrame *frame)
++{
++ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
++ int pixel_shift;
++ int i, j;
++
++ if (!desc)
++ return AVERROR(EINVAL);
++
++ pixel_shift = desc->comp[0].depth > 8;
++
++ av_log(s->avctx, AV_LOG_DEBUG, "Verifying checksum for frame with POC %d: ",
++ s->poc);
++
++ /* the checksums are LE, so we have to byteswap for >8bpp formats
++ * on BE arches */
++#if HAVE_BIGENDIAN
++ if (pixel_shift && !s->checksum_buf) {
++ av_fast_malloc(&s->checksum_buf, &s->checksum_buf_size,
++ FFMAX3(frame->linesize[0], frame->linesize[1],
++ frame->linesize[2]));
++ if (!s->checksum_buf)
++ return AVERROR(ENOMEM);
++ }
++#endif
++
++ for (i = 0; frame->data[i]; i++) {
++ int width = s->avctx->coded_width;
++ int height = s->avctx->coded_height;
++ int w = (i == 1 || i == 2) ? (width >> desc->log2_chroma_w) : width;
++ int h = (i == 1 || i == 2) ? (height >> desc->log2_chroma_h) : height;
++ uint8_t md5[16];
++
++ av_md5_init(s->sei.picture_hash.md5_ctx);
++ for (j = 0; j < h; j++) {
++ const uint8_t *src = frame->data[i] + j * frame_stride1(frame, 1);
++#if HAVE_BIGENDIAN
++ if (pixel_shift) {
++ s->bdsp.bswap16_buf((uint16_t *) s->checksum_buf,
++ (const uint16_t *) src, w);
++ src = s->checksum_buf;
++ }
++#endif
++ av_md5_update(s->sei.picture_hash.md5_ctx, src, w << pixel_shift);
++ }
++ av_md5_final(s->sei.picture_hash.md5_ctx, md5);
++
++ if (!memcmp(md5, s->sei.picture_hash.md5[i], 16)) {
++ av_log (s->avctx, AV_LOG_DEBUG, "plane %d - correct ", i);
++ print_md5(s->avctx, AV_LOG_DEBUG, md5);
++ av_log (s->avctx, AV_LOG_DEBUG, "; ");
++ } else {
++ av_log (s->avctx, AV_LOG_ERROR, "mismatching checksum of plane %d - ", i);
++ print_md5(s->avctx, AV_LOG_ERROR, md5);
++ av_log (s->avctx, AV_LOG_ERROR, " != ");
++ print_md5(s->avctx, AV_LOG_ERROR, s->sei.picture_hash.md5[i]);
++ av_log (s->avctx, AV_LOG_ERROR, "\n");
++ return AVERROR_INVALIDDATA;
++ }
++ }
++
++ av_log(s->avctx, AV_LOG_DEBUG, "\n");
++
++ return 0;
++}
++
++static int all_sps_supported(const HEVCRpiContext * const s)
++{
++ for (unsigned int i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
++ if (s->ps.sps_list[i] != NULL)
++ {
++ const HEVCRpiSPS * const sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data;
++ if (!is_sps_supported(sps))
++ return 0;
++ }
++ }
++ return 1;
++}
++
++static int hevc_rpi_decode_extradata(HEVCRpiContext *s, uint8_t *buf, int length, int first)
++{
++ int ret, i;
++
++ ret = ff_hevc_rpi_decode_extradata(buf, length, &s->ps, &s->sei, &s->is_nalff,
++ &s->nal_length_size, s->avctx->err_recognition,
++ s->apply_defdispwin, s->avctx);
++ if (ret < 0)
++ return ret;
++
++ /* export stream parameters from the first SPS */
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
++ if (first && s->ps.sps_list[i]) {
++ const HEVCRpiSPS *sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data;
++ export_stream_params(s->avctx, &s->ps, sps);
++ break;
++ }
++ }
++
++ return 0;
++}
++
++static int hevc_rpi_decode_frame(AVCodecContext *avctx, void *data, int *got_output,
++ AVPacket *avpkt)
++{
++ int ret;
++ int new_extradata_size;
++ uint8_t *new_extradata;
++ HEVCRpiContext *s = avctx->priv_data;
++
++ if (!avpkt->size) {
++ ret = ff_hevc_rpi_output_frame(s, data, 1);
++ if (ret < 0)
++ return ret;
++
++ *got_output = ret;
++ return 0;
++ }
++
++ new_extradata = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA,
++ &new_extradata_size);
++ if (new_extradata && new_extradata_size > 0) {
++ ret = hevc_rpi_decode_extradata(s, new_extradata, new_extradata_size, 0);
++ if (ret < 0)
++ return ret;
++ }
++
++ s->ref = NULL;
++ ret = decode_nal_units(s, avpkt->data, avpkt->size);
++ if (ret < 0)
++ return ret;
++
++ /* verify the SEI checksum */
++ if (avctx->err_recognition & AV_EF_CRCCHECK && s->is_decoded &&
++ s->sei.picture_hash.is_md5) {
++ ret = verify_md5(s, s->ref->frame);
++ if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE) {
++ ff_hevc_rpi_unref_frame(s, s->ref, ~0);
++ return ret;
++ }
++ }
++ s->sei.picture_hash.is_md5 = 0;
++
++ if (s->is_decoded) {
++ av_log(avctx, AV_LOG_DEBUG, "Decoded frame with POC %d.\n", s->poc);
++ s->is_decoded = 0;
++ }
++
++ if (s->output_frame->buf[0]) {
++ av_frame_move_ref(data, s->output_frame);
++ *got_output = 1;
++ }
++
++ return avpkt->size;
++}
++
++static int hevc_ref_frame(HEVCRpiContext *s, HEVCFrame *dst, HEVCFrame *src)
++{
++ int ret;
++
++ ret = ff_thread_ref_frame(&dst->tf, &src->tf);
++ if (ret < 0)
++ return ret;
++
++ dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
++ if (!dst->tab_mvf_buf)
++ goto fail;
++ dst->tab_mvf = src->tab_mvf;
++
++ dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
++ if (!dst->rpl_tab_buf)
++ goto fail;
++ dst->rpl_tab = src->rpl_tab;
++
++ dst->rpl_buf = av_buffer_ref(src->rpl_buf);
++ if (!dst->rpl_buf)
++ goto fail;
++
++ dst->poc = src->poc;
++ dst->ctb_count = src->ctb_count;
++ dst->flags = src->flags;
++ dst->sequence = src->sequence;
++ return 0;
++
++fail:
++ ff_hevc_rpi_unref_frame(s, dst, ~0);
++ return AVERROR(ENOMEM);
++}
++
++
++static av_cold int hevc_decode_free(AVCodecContext *avctx)
++{
++ HEVCRpiContext * const s = avctx->priv_data;
++ int i;
++
++ pic_arrays_free(s);
++
++ av_freep(&s->sei.picture_hash.md5_ctx);
++
++ av_freep(&s->cabac_state);
++
++#if RPI_EXTRA_BIT_THREADS
++ bit_threads_kill(s);
++#endif
++
++ hevc_exit_worker(s);
++ vpu_qpu_term();
++ for (i = 0; i != 2; ++i) {
++ ff_hevc_rpi_progress_kill_state(s->progress_states + i);
++ }
++ job_lc_kill(s->HEVClc);
++ av_rpi_zc_uninit(avctx);
++
++ av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0]
++ av_freep(&s->sao_pixel_buffer_v[0]);
++ av_frame_free(&s->output_frame);
++
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
++ av_frame_free(&s->DPB[i].frame);
++ }
++
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++)
++ av_buffer_unref(&s->ps.vps_list[i]);
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++)
++ av_buffer_unref(&s->ps.sps_list[i]);
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++)
++ av_buffer_unref(&s->ps.pps_list[i]);
++ s->ps.sps = NULL;
++ s->ps.pps = NULL;
++ s->ps.vps = NULL;
++
++ av_freep(&s->sh.entry_point_offset);
++ av_freep(&s->sh.offset);
++ av_freep(&s->sh.size);
++
++ for (i = 1; i < s->threads_number; i++) {
++ if (s->sList[i] != NULL) {
++ av_freep(&s->sList[i]);
++ }
++ }
++
++ // Free HEVClcList entries separately from sList as RPI WPP uses them that way
++ for (i = 0; i < MAX_NB_THREADS && s->HEVClcList[i] != NULL; ++i) {
++ av_freep(s->HEVClcList + i);
++ }
++ s->HEVClc = NULL; // Allocated as part of HEVClcList
++
++ ff_h2645_packet_uninit(&s->pkt);
++
++ return 0;
++}
++
++
++static av_cold int hevc_init_context(AVCodecContext *avctx)
++{
++ HEVCRpiContext *s = avctx->priv_data;
++ int i;
++
++ s->avctx = avctx;
++
++ s->HEVClc = av_mallocz(sizeof(HEVCRpiLocalContext));
++ if (!s->HEVClc)
++ goto fail;
++ s->HEVClcList[0] = s->HEVClc;
++ s->sList[0] = s;
++
++ // Whilst FFmpeg's init fn is only called once, the close fn is called as
++ // many times as we have threads (init_thread_copy is called for the
++ // threads). So to match init & term, put the init here where it will be
++ // called by both init & copy.
++ av_rpi_zc_init(avctx);
++
++ if (vpu_qpu_init() != 0)
++ goto fail;
++
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++ {
++ static const uint32_t dframe[1] = {0x80808080};
++ s->qpu_dummy_frame_emu = (const uint8_t *)dframe;
++ }
++#endif
++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
++ s->qpu_dummy_frame_qpu = qpu_fn(mc_start); // Use our code as a dummy frame
++#endif
++
++ bt_lc_init(s, s->HEVClc, 0);
++ job_lc_init(s->HEVClc);
++
++ for (i = 0; i != 2; ++i) {
++ ff_hevc_rpi_progress_init_state(s->progress_states + i);
++ }
++
++ s->cabac_state = av_malloc(HEVC_CONTEXTS);
++ if (!s->cabac_state)
++ goto fail;
++
++ s->output_frame = av_frame_alloc();
++ if (!s->output_frame)
++ goto fail;
++
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++ s->DPB[i].frame = av_frame_alloc();
++ if (!s->DPB[i].frame)
++ goto fail;
++ s->DPB[i].tf.f = s->DPB[i].frame;
++ s->DPB[i].dpb_no = i;
++ }
++
++ s->max_ra = INT_MAX;
++
++ s->sei.picture_hash.md5_ctx = av_md5_alloc();
++ if (!s->sei.picture_hash.md5_ctx)
++ goto fail;
++
++ ff_bswapdsp_init(&s->bdsp);
++
++ s->context_initialized = 1;
++ s->eos = 0;
++
++ ff_hevc_rpi_reset_sei(&s->sei);
++
++ return 0;
++
++fail:
++ av_log(s, AV_LOG_ERROR, "%s: Failed\n", __func__);
++ hevc_decode_free(avctx);
++ return AVERROR(ENOMEM);
++}
++
++static int hevc_update_thread_context(AVCodecContext *dst,
++ const AVCodecContext *src)
++{
++ HEVCRpiContext *s = dst->priv_data;
++ HEVCRpiContext *s0 = src->priv_data;
++ int i, ret;
++
++ if (!s->context_initialized) {
++ ret = hevc_init_context(dst);
++ if (ret < 0)
++ return ret;
++ }
++
++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
++ if (s0->DPB[i].frame->buf[0]) {
++ ret = hevc_ref_frame(s, &s->DPB[i], &s0->DPB[i]);
++ if (ret < 0)
++ return ret;
++ }
++ }
++
++ if (s->ps.sps != s0->ps.sps)
++ s->ps.sps = NULL;
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) {
++ av_buffer_unref(&s->ps.vps_list[i]);
++ if (s0->ps.vps_list[i]) {
++ s->ps.vps_list[i] = av_buffer_ref(s0->ps.vps_list[i]);
++ if (!s->ps.vps_list[i])
++ return AVERROR(ENOMEM);
++ }
++ }
++
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
++ av_buffer_unref(&s->ps.sps_list[i]);
++ if (s0->ps.sps_list[i]) {
++ s->ps.sps_list[i] = av_buffer_ref(s0->ps.sps_list[i]);
++ if (!s->ps.sps_list[i])
++ return AVERROR(ENOMEM);
++ }
++ }
++
++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) {
++ av_buffer_unref(&s->ps.pps_list[i]);
++ if (s0->ps.pps_list[i]) {
++ s->ps.pps_list[i] = av_buffer_ref(s0->ps.pps_list[i]);
++ if (!s->ps.pps_list[i])
++ return AVERROR(ENOMEM);
++ }
++ }
++
++ if (s->ps.sps != s0->ps.sps)
++ if ((ret = set_sps(s, s0->ps.sps, src->pix_fmt)) < 0)
++ return ret;
++
++ s->seq_decode = s0->seq_decode;
++ s->seq_output = s0->seq_output;
++ s->pocTid0 = s0->pocTid0;
++ s->max_ra = s0->max_ra;
++ s->eos = s0->eos;
++ s->no_rasl_output_flag = s0->no_rasl_output_flag;
++
++ s->is_nalff = s0->is_nalff;
++ s->nal_length_size = s0->nal_length_size;
++
++ s->threads_number = s0->threads_number;
++ s->threads_type = s0->threads_type;
++
++ if (s0->eos) {
++ s->seq_decode = (s->seq_decode + 1) & 0xff;
++ s->max_ra = INT_MAX;
++ }
++
++ s->sei.frame_packing = s0->sei.frame_packing;
++ s->sei.display_orientation = s0->sei.display_orientation;
++ s->sei.mastering_display = s0->sei.mastering_display;
++ s->sei.content_light = s0->sei.content_light;
++ s->sei.alternative_transfer = s0->sei.alternative_transfer;
++
++ // * We do this here as it allows us to easily locate our parent's
++ // global job pool, but there really should be a less nasty way
++ if (s->jbc == NULL)
++ {
++ av_assert0((s->jbc = rpi_job_ctl_new(s0->jbc->jbg)) != NULL);
++ hevc_init_worker(s);
++ }
++
++ return 0;
++}
++
++static av_cold int hevc_decode_init(AVCodecContext *avctx)
++{
++ HEVCRpiContext *s = avctx->priv_data;
++ int ret;
++
++ avctx->internal->allocate_progress = 1;
++
++ {
++ HEVCRpiJobGlobal * const jbg = jbg_new(FFMAX(avctx->thread_count * 3, 5));
++ if (jbg == NULL)
++ {
++ av_log(s, AV_LOG_ERROR, "%s: Job global init failed\n", __func__);
++ return -1;
++ }
++
++ if ((s->jbc = rpi_job_ctl_new(jbg)) == NULL)
++ {
++ av_log(s, AV_LOG_ERROR, "%s: Job ctl init failed\n", __func__);
++ return -1;
++ }
++ }
++
++ ret = hevc_init_context(avctx);
++ if (ret < 0)
++ return ret;
++
++ hevc_init_worker(s);
++
++ s->enable_parallel_tiles = 0;
++ s->sei.picture_timing.picture_struct = 0;
++ s->eos = 1;
++
++ atomic_init(&s->wpp_err, 0);
++
++ if(avctx->active_thread_type & FF_THREAD_SLICE)
++ s->threads_number = avctx->thread_count;
++ else
++ s->threads_number = 1;
++
++ if (avctx->extradata_size > 0 && avctx->extradata) {
++ ret = hevc_rpi_decode_extradata(s, avctx->extradata, avctx->extradata_size, 1);
++
++ if (ret == 0 && !all_sps_supported(s))
++ ret = AVERROR_DECODER_NOT_FOUND;
++
++ if (ret < 0)
++ {
++ hevc_decode_free(avctx);
++ return ret;
++ }
++ }
++
++ if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1)
++ s->threads_type = FF_THREAD_FRAME;
++ else
++ s->threads_type = FF_THREAD_SLICE;
++
++ return 0;
++}
++
++static av_cold int hevc_init_thread_copy(AVCodecContext *avctx)
++{
++ HEVCRpiContext *s = avctx->priv_data;
++ int ret;
++
++ memset(s, 0, sizeof(*s));
++
++ ret = hevc_init_context(avctx);
++ if (ret < 0)
++ return ret;
++
++ return 0;
++}
++
++static void hevc_decode_flush(AVCodecContext *avctx)
++{
++ HEVCRpiContext *s = avctx->priv_data;
++ ff_hevc_rpi_flush_dpb(s);
++ s->max_ra = INT_MAX;
++ s->eos = 1;
++}
++
++#define OFFSET(x) offsetof(HEVCRpiContext, x)
++#define PAR (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
++
++
++static const AVOption options[] = {
++ { "apply_defdispwin", "Apply default display window from VUI", OFFSET(apply_defdispwin),
++ AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
++ { "strict-displaywin", "stricly apply default display window size", OFFSET(apply_defdispwin),
++ AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
++ { NULL },
++};
++
++static const AVClass hevc_rpi_decoder_class = {
++ .class_name = "HEVC RPI decoder",
++ .item_name = av_default_item_name,
++ .option = options,
++ .version = LIBAVUTIL_VERSION_INT,
++};
++
++static const enum AVPixelFormat hevc_rpi_pix_fmts[] = {
++ AV_PIX_FMT_SAND128,
++ AV_PIX_FMT_SAND64_10,
++ AV_PIX_FMT_NONE
++};
++
++AVCodec ff_hevc_rpi_decoder = {
++ .name = "hevc_rpi",
++ .long_name = NULL_IF_CONFIG_SMALL("HEVC (rpi)"),
++ .type = AVMEDIA_TYPE_VIDEO,
++ .id = AV_CODEC_ID_HEVC,
++ .priv_data_size = sizeof(HEVCRpiContext),
++ .priv_class = &hevc_rpi_decoder_class,
++ .init = hevc_decode_init,
++ .close = hevc_decode_free,
++ .decode = hevc_rpi_decode_frame,
++ .flush = hevc_decode_flush,
++ .update_thread_context = hevc_update_thread_context,
++ .init_thread_copy = hevc_init_thread_copy,
++ .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
++#if 0
++ // Debugging is often easier without threads getting in the way
++ 0,
++#warning H265 threading turned off
++#else
++ // We only have decent optimisation for frame - so only admit to that
++ AV_CODEC_CAP_FRAME_THREADS,
++#endif
++ .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE | FF_CODEC_CAP_EXPORTS_CROPPING,
++ .pix_fmts = hevc_rpi_pix_fmts,
++ .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles),
++};
++
+diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h
+new file mode 100644
+index 0000000000..f61b29e669
+--- /dev/null
++++ b/libavcodec/rpi_hevcdec.h
+@@ -0,0 +1,1054 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVCDEC_H
++#define AVCODEC_RPI_HEVCDEC_H
++
++#include "config.h"
++
++#include <stdatomic.h>
++
++#include "libavutil/buffer.h"
++
++#include "avcodec.h"
++#include "bswapdsp.h"
++#include "cabac.h"
++#include "get_bits.h"
++#include "rpi_hevcpred.h"
++#include "h2645_parse.h"
++#include "hevc.h"
++#include "rpi_hevc_ps.h"
++#include "rpi_hevc_sei.h"
++#include "rpi_hevcdsp.h"
++#include "internal.h"
++#include "thread.h"
++#include "videodsp.h"
++
++#define MAX_NB_THREADS 16
++#define SHIFT_CTB_WPP 2
++
++//TODO: check if this is really the maximum
++#define MAX_TRANSFORM_DEPTH 5
++
++#define MAX_TB_SIZE 32
++#define MAX_QP 51
++#define DEFAULT_INTRA_TC_OFFSET 2
++
++#define HEVC_CONTEXTS 199
++
++#define MRG_MAX_NUM_CANDS 5
++
++#define HEVC_MAX_CTB_SIZE (1 << HEVC_MAX_LOG2_CTB_SIZE) // 64
++
++// Size of DPB array
++#define HEVC_DPB_ELS 32
++
++#define L0 0
++#define L1 1
++
++#define EPEL_EXTRA_BEFORE 1
++#define EPEL_EXTRA_AFTER 2
++#define EPEL_EXTRA 3
++#define QPEL_EXTRA_BEFORE 3
++#define QPEL_EXTRA_AFTER 4
++#define QPEL_EXTRA 7
++
++#define EDGE_EMU_BUFFER_STRIDE 80
++
++#include <semaphore.h>
++#include "rpi_qpu.h"
++
++// Max jobs per frame thread. Actual usage will be limited by the size
++// of the global job pool
++// ?? Limits
++#define RPI_MAX_JOBS 8
++
++// This is the number of _extra_ bit threads - we will have
++// RPI_EXTRA_BIT_THREADS+1 threads actually doing the processing
++//
++// 0 is legitimate and will disable our WPP processing
++//#define RPI_EXTRA_BIT_THREADS 0
++#define RPI_EXTRA_BIT_THREADS 2
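++// e.g. 2 extra bit threads gives RPI_BIT_THREADS == 3 decode threads in
++// total (see the RPI_BIT_THREADS derivation further down this header)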
++
++// Number of separate threads/passes in worker
++// 2 and 3 are the currently valid numbers
++// At the moment 3 seems fractionally faster
++//#define RPI_PASSES 2
++#define RPI_PASSES 3
++
++// Print out various usage stats
++#define RPI_TSTATS 0
++
++// Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs
++// (currently slower than deblocking on the ARM)
++// #define RPI_DEBLOCK_VPU
++
++#define RPI_VPU_DEBLOCK_CACHED 0
++
++// Use ARM emulation of QPU pred
++// These are for debug only as the emulation makes only limited
++// effort to be fast
++#define RPI_QPU_EMU_Y 0
++#define RPI_QPU_EMU_C 0
++
++// Max width & height we are prepared to consider
++// Sand frame shape calc becomes confused with large frames
++// Some buffer alloc also depends on this
++#define HEVC_RPI_MAX_WIDTH 2048
++#define HEVC_RPI_MAX_HEIGHT 1088
++
++
++/**
++ * Value of the luma sample at position (x, y) in the 2D array tab.
++ */
++#define SAMPLE(tab, x, y) ((tab)[(y) * s->sps->width + (x)])
++#define SAMPLE_CTB(tab, x, y) ((tab)[(y) * min_cb_width + (x)])
++
++#define IS_IDR(s) ((s)->nal_unit_type == HEVC_NAL_IDR_W_RADL || (s)->nal_unit_type == HEVC_NAL_IDR_N_LP)
++#define IS_BLA(s) ((s)->nal_unit_type == HEVC_NAL_BLA_W_RADL || (s)->nal_unit_type == HEVC_NAL_BLA_W_LP || \
++ (s)->nal_unit_type == HEVC_NAL_BLA_N_LP)
++#define IS_IRAP(s) ((s)->nal_unit_type >= 16 && (s)->nal_unit_type <= 23)
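++// (16..23 covers BLA_W_LP through RSV_IRAP_VCL23, i.e. all the IRAP types)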
++
++enum RPSType {
++ ST_CURR_BEF = 0,
++ ST_CURR_AFT,
++ ST_FOLL,
++ LT_CURR,
++ LT_FOLL,
++ NB_RPS_TYPE,
++};
++
++enum SyntaxElement {
++ SAO_MERGE_FLAG = 0,
++ SAO_TYPE_IDX,
++ SAO_EO_CLASS,
++ SAO_BAND_POSITION,
++ SAO_OFFSET_ABS,
++ SAO_OFFSET_SIGN,
++ END_OF_SLICE_FLAG,
++ SPLIT_CODING_UNIT_FLAG,
++ CU_TRANSQUANT_BYPASS_FLAG,
++ SKIP_FLAG,
++ CU_QP_DELTA,
++ PRED_MODE_FLAG,
++ PART_MODE,
++ PCM_FLAG,
++ PREV_INTRA_LUMA_PRED_FLAG,
++ MPM_IDX,
++ REM_INTRA_LUMA_PRED_MODE,
++ INTRA_CHROMA_PRED_MODE,
++ MERGE_FLAG,
++ MERGE_IDX,
++ INTER_PRED_IDC,
++ REF_IDX_L0,
++ REF_IDX_L1,
++ ABS_MVD_GREATER0_FLAG,
++ ABS_MVD_GREATER1_FLAG,
++ ABS_MVD_MINUS2,
++ MVD_SIGN_FLAG,
++ MVP_LX_FLAG,
++ NO_RESIDUAL_DATA_FLAG,
++ SPLIT_TRANSFORM_FLAG,
++ CBF_LUMA,
++ CBF_CB_CR,
++ TRANSFORM_SKIP_FLAG,
++ EXPLICIT_RDPCM_FLAG,
++ EXPLICIT_RDPCM_DIR_FLAG,
++ LAST_SIGNIFICANT_COEFF_X_PREFIX,
++ LAST_SIGNIFICANT_COEFF_Y_PREFIX,
++ LAST_SIGNIFICANT_COEFF_X_SUFFIX,
++ LAST_SIGNIFICANT_COEFF_Y_SUFFIX,
++ SIGNIFICANT_COEFF_GROUP_FLAG,
++ SIGNIFICANT_COEFF_FLAG,
++ COEFF_ABS_LEVEL_GREATER1_FLAG,
++ COEFF_ABS_LEVEL_GREATER2_FLAG,
++ COEFF_ABS_LEVEL_REMAINING,
++ COEFF_SIGN_FLAG,
++ LOG2_RES_SCALE_ABS,
++ RES_SCALE_SIGN_FLAG,
++ CU_CHROMA_QP_OFFSET_FLAG,
++ CU_CHROMA_QP_OFFSET_IDX,
++};
++
++enum PartMode {
++ PART_2Nx2N = 0,
++ PART_2NxN = 1,
++ PART_Nx2N = 2,
++ PART_NxN = 3,
++ PART_2NxnU = 4,
++ PART_2NxnD = 5,
++ PART_nLx2N = 6,
++ PART_nRx2N = 7,
++};
++
++enum PredMode {
++ MODE_INTER = 0,
++ MODE_INTRA,
++ MODE_SKIP,
++};
++
++enum InterPredIdc {
++ PRED_L0 = 0,
++ PRED_L1,
++ PRED_BI,
++};
++
++enum PredFlag {
++ PF_INTRA = 0,
++ PF_L0,
++ PF_L1,
++ PF_BI,
++};
++
++enum IntraPredMode {
++ INTRA_PLANAR = 0,
++ INTRA_DC,
++ INTRA_ANGULAR_2,
++ INTRA_ANGULAR_3,
++ INTRA_ANGULAR_4,
++ INTRA_ANGULAR_5,
++ INTRA_ANGULAR_6,
++ INTRA_ANGULAR_7,
++ INTRA_ANGULAR_8,
++ INTRA_ANGULAR_9,
++ INTRA_ANGULAR_10,
++ INTRA_ANGULAR_11,
++ INTRA_ANGULAR_12,
++ INTRA_ANGULAR_13,
++ INTRA_ANGULAR_14,
++ INTRA_ANGULAR_15,
++ INTRA_ANGULAR_16,
++ INTRA_ANGULAR_17,
++ INTRA_ANGULAR_18,
++ INTRA_ANGULAR_19,
++ INTRA_ANGULAR_20,
++ INTRA_ANGULAR_21,
++ INTRA_ANGULAR_22,
++ INTRA_ANGULAR_23,
++ INTRA_ANGULAR_24,
++ INTRA_ANGULAR_25,
++ INTRA_ANGULAR_26,
++ INTRA_ANGULAR_27,
++ INTRA_ANGULAR_28,
++ INTRA_ANGULAR_29,
++ INTRA_ANGULAR_30,
++ INTRA_ANGULAR_31,
++ INTRA_ANGULAR_32,
++ INTRA_ANGULAR_33,
++ INTRA_ANGULAR_34,
++};
++
++enum SAOType {
++ SAO_NOT_APPLIED = 0,
++ SAO_BAND,
++ SAO_EDGE,
++ SAO_APPLIED
++};
++
++enum SAOEOClass {
++ SAO_EO_HORIZ = 0,
++ SAO_EO_VERT,
++ SAO_EO_135D,
++ SAO_EO_45D,
++};
++
++enum ScanType {
++ SCAN_DIAG = 0,
++ SCAN_HORIZ,
++ SCAN_VERT,
++};
++
++typedef struct RefPicList {
++ struct HEVCFrame *ref[HEVC_MAX_REFS];
++ int list[HEVC_MAX_REFS];
++ int isLongTerm[HEVC_MAX_REFS];
++ int nb_refs;
++} RefPicList;
++
++typedef struct RefPicListTab {
++ RefPicList refPicList[2];
++} RefPicListTab;
++
++typedef struct CodingUnit {
++ int x;
++ int y;
++
++ enum PredMode pred_mode; ///< PredMode
++ enum PartMode part_mode; ///< PartMode
++
++ // Inferred parameters
++ uint8_t intra_split_flag; ///< IntraSplitFlag
++ uint8_t max_trafo_depth; ///< MaxTrafoDepth
++ uint8_t cu_transquant_bypass_flag;
++} CodingUnit;
++
++typedef struct NeighbourAvailable {
++ int cand_bottom_left;
++ int cand_left;
++ int cand_up;
++ int cand_up_left;
++ int cand_up_right;
++ int cand_up_right_sap;
++} NeighbourAvailable;
++
++typedef struct PredictionUnit {
++ int mpm_idx;
++ int rem_intra_luma_pred_mode;
++ uint8_t intra_pred_mode[4];
++ Mv mvd;
++ uint8_t merge_flag;
++ uint8_t intra_pred_mode_c[4];
++ uint8_t chroma_mode_c[4];
++} PredictionUnit;
++
++typedef struct TransformUnit {
++ int cu_qp_delta;
++
++ int res_scale_val;
++
++ // Inferred parameters;
++ int intra_pred_mode;
++ int intra_pred_mode_c;
++ int chroma_mode_c;
++ uint8_t is_cu_qp_delta_coded;
++ uint8_t is_cu_chroma_qp_offset_coded;
++ int8_t cu_qp_offset_cb;
++ int8_t cu_qp_offset_cr;
++ uint8_t cross_pf;
++} TransformUnit;
++
++typedef struct DBParams {
++ int8_t beta_offset; // -12 to +12
++ int8_t tc_offset; // -12 to +12
++} DBParams;
++
++#define HEVC_FRAME_FLAG_OUTPUT (1 << 0)
++#define HEVC_FRAME_FLAG_SHORT_REF (1 << 1)
++#define HEVC_FRAME_FLAG_LONG_REF (1 << 2)
++#define HEVC_FRAME_FLAG_BUMPING (1 << 3)
++
++struct HEVCRpiJob;
++
++typedef struct HEVCFrame {
++ AVFrame *frame;
++ ThreadFrame tf;
++ MvField *tab_mvf;
++ RefPicList *refPicList;
++ RefPicListTab **rpl_tab;
++ int ctb_count;
++ int poc;
++ struct HEVCFrame *collocated_ref;
++
++ AVBufferRef *tab_mvf_buf;
++ AVBufferRef *rpl_tab_buf;
++ AVBufferRef *rpl_buf;
++
++ /**
++ * A sequence counter, so that old frames are output first
++ * after a POC reset
++ */
++ uint16_t sequence;
++
++ /**
++ * A combination of HEVC_FRAME_FLAG_*
++ */
++ uint8_t flags;
++
++ // Entry no in DPB - can be used as a small unique
++ // frame identifier (within the current thread)
++ uint8_t dpb_no;
++} HEVCFrame;
++
++typedef struct HEVCRpiLocalContextIntra {
++ TransformUnit tu;
++ NeighbourAvailable na;
++} HEVCRpiLocalContextIntra;
++
++typedef struct HEVCRpiLocalContext {
++ TransformUnit tu; // Moved to start to match HEVCRpiLocalContextIntra (yuk!)
++ NeighbourAvailable na;
++
++ // Vars that allow us to locate everything from just an lc
++ struct HEVCRpiContext * context; // ??? make const ???
++ unsigned int lc_n; // lc list el no
++
++ // Job wait links
++ struct HEVCRpiLocalContext * jw_next;
++ struct HEVCRpiLocalContext * jw_prev;
++ struct HEVCRpiLocalContext * ljw_next;
++ struct HEVCRpiLocalContext * ljw_prev;
++ struct HEVCRpiJob * volatile jw_job;
++ sem_t jw_sem;
++
++ // ?? Wrap in structure ??
++ sem_t bt_sem_in;
++ sem_t * bt_psem_out;
++ volatile int bt_terminate;
++ unsigned int ts;
++ unsigned int bt_last_line; // Last line in this bit_thread chunk
++ unsigned int bt_line_no;
++ unsigned int bt_line_width;
++ unsigned int bt_line_inc;
++
++ struct HEVCRpiJob * jb0;
++ char unit_done; // Set once we have dealt with this slice
++// char max_done;
++ char bt_is_tile;
++ char last_progress_good;
++
++ char wpp_init; // WPP/Tile bitstream init has happened
++
++ uint8_t cabac_state[HEVC_CONTEXTS];
++
++ uint8_t stat_coeff[4];
++
++// uint8_t first_qp_group;
++
++ GetBitContext gb;
++ CABACContext cc;
++
++ int8_t qp_y;
++ int8_t curr_qp_y;
++
++ int qPy_pred;
++
++ uint8_t ctb_left_flag;
++ uint8_t ctb_up_flag;
++ uint8_t ctb_up_right_flag;
++ uint8_t ctb_up_left_flag;
++ int end_of_tiles_x;
++ int end_of_tiles_y;
++ /* +7 is for subpixel interpolation, *2 for high bit depths */
++ DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
++ /* The extended size of this second edge emu buffer is abused by SAO */
++ DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
++ DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]);
++
++ int ct_depth;
++ CodingUnit cu;
++ PredictionUnit pu;
++
++#define BOUNDARY_LEFT_SLICE (1 << 0)
++#define BOUNDARY_LEFT_TILE (1 << 1)
++#define BOUNDARY_UPPER_SLICE (1 << 2)
++#define BOUNDARY_UPPER_TILE (1 << 3)
++ /* properties of the boundary of the current CTB for the purposes
++ * of the deblocking filter */
++ int boundary_flags;
++} HEVCRpiLocalContext;
++
++
++// Each block can have an intra prediction and an add_residual command
++// noof-cmds(2) * max-ctu height(64) / min-transform(4) * planes(3) * MAX_WIDTH
++
++// Sand only has 2 planes (Y/C)
++#define RPI_MAX_PRED_CMDS (2*(HEVC_MAX_CTB_SIZE/4)*2*(HEVC_RPI_MAX_WIDTH/4))
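++// With HEVC_MAX_CTB_SIZE 64 and HEVC_RPI_MAX_WIDTH 2048 this works out as
++// 2 * (64/4) * 2 * (2048/4) = 32768 commands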
++
++#ifdef RPI_DEBLOCK_VPU
++// Worst case is 16x16 CTUs
++#define RPI_MAX_DEBLOCK_CMDS (HEVC_RPI_MAX_WIDTH*4/16)
++#endif
++
++// Command for intra prediction and transform_add of predictions to coefficients
++enum rpi_pred_cmd_e
++{
++ RPI_PRED_ADD_RESIDUAL,
++ RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_ADD_RESIDUAL + c_idx
++ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_ADD_RESIDUAL + c_idx
++ RPI_PRED_ADD_RESIDUAL_C, // Merged U+V
++ RPI_PRED_ADD_DC,
++ RPI_PRED_ADD_DC_U, // Both U & V are effectively C
++ RPI_PRED_ADD_DC_V,
++ RPI_PRED_INTRA,
++ RPI_PRED_I_PCM,
++ RPI_PRED_CMD_MAX
++};
++
++typedef struct HEVCPredCmd {
++ uint8_t type;
++ uint8_t size; // log2 "size" used by all variants
++ uint8_t na; // i_pred - but left here as they pack well
++ uint8_t c_idx; // i_pred
++ union {
++ struct { // TRANSFORM_ADD
++ uint8_t * dst;
++ const int16_t * buf;
++ uint16_t stride; // Should be good enough for all pic fmts we use
++ int16_t dc;
++ } ta;
++ struct {
++ uint8_t * dst;
++ uint32_t stride;
++ int dc;
++ } dc;
++ struct { // INTRA
++ uint16_t x;
++ uint16_t y;
++ enum IntraPredMode mode;
++ } i_pred;
++ struct { // I_PCM
++ uint16_t x;
++ uint16_t y;
++ const void * src;
++ uint32_t src_len;
++ } i_pcm;
++ };
++} HEVCPredCmd;
++
++union qpu_mc_pred_cmd_s;
++struct qpu_mc_pred_y_p_s;
++struct qpu_mc_src_s;
++
++typedef struct HEVCRpiInterPredQ
++{
++ union qpu_mc_pred_cmd_u *qpu_mc_base;
++ union qpu_mc_pred_cmd_u *qpu_mc_curr;
++ struct qpu_mc_src_s *last_l0;
++ struct qpu_mc_src_s *last_l1;
++ unsigned int load;
++ uint32_t code_setup;
++ uint32_t code_sync;
++ uint32_t code_exit;
++} HEVCRpiInterPredQ;
++
++typedef struct HEVCRpiInterPredEnv
++{
++ HEVCRpiInterPredQ * q;
++ uint8_t n; // Number of Qs
++ uint8_t n_grp; // Number of Q in a group
++ uint8_t curr; // Current Q number (0..n-1)
++ uint8_t used; // 0 if nothing in any Q, 1 otherwise
++ uint8_t used_grp; // 0 if nothing in any Q in the current group
++ unsigned int max_fill;
++ unsigned int min_gap;
++ GPU_MEM_PTR_T gptr;
++} HEVCRpiInterPredEnv;
++
++typedef struct HEVCRpiIntraPredEnv {
++ unsigned int n; // Number of commands
++ HEVCPredCmd * cmds;
++} HEVCRpiIntraPredEnv;
++
++typedef struct HEVCRpiCoeffEnv {
++ unsigned int n;
++ int16_t * buf;
++} HEVCRpiCoeffEnv;
++
++typedef struct HEVCRpiCoeffsEnv {
++ HEVCRpiCoeffEnv s[4];
++ GPU_MEM_PTR_T gptr;
++ void * mptr;
++} HEVCRpiCoeffsEnv;
++
++typedef struct HEVCRpiFrameProgressWait {
++ int req;
++ struct HEVCRpiFrameProgressWait * next;
++ sem_t sem;
++} HEVCRpiFrameProgressWait;
++
++typedef struct HEVCRpiFrameProgressState {
++ struct HEVCRpiFrameProgressWait * first;
++ struct HEVCRpiFrameProgressWait * last;
++ pthread_mutex_t lock;
++} HEVCRpiFrameProgressState;
++
++typedef struct RpiBlk
++{
++ unsigned int x;
++ unsigned int y;
++ unsigned int w;
++ unsigned int h;
++} RpiBlk;
++
++typedef struct HEVCRpiJob {
++ struct HEVCRpiJob * next; // Free chain
++ struct HEVCRpiJobCtl * jbc_local;
++ const HEVCRpiSPS * sps; // sps used to set up this job
++
++ int waited;
++ int ctu_ts_first;
++ int ctu_ts_last;
++ RpiBlk bounds; // Bounding box of job
++
++ struct qpu_mc_pred_y_p_s * last_y8_p;
++ struct qpu_mc_src_s * last_y8_l1;
++
++ HEVCRpiInterPredEnv chroma_ip;
++ HEVCRpiInterPredEnv luma_ip;
++ int16_t progress_req[HEVC_DPB_ELS]; // index by dpb_no
++ HEVCRpiIntraPredEnv intra;
++ HEVCRpiCoeffsEnv coeffs;
++ HEVCRpiFrameProgressWait progress_wait;
++} HEVCRpiJob;
++
++struct HEVCRpiContext;
++
++typedef void HEVCRpiWorkerFn(struct HEVCRpiContext * const s, HEVCRpiJob * const jb);
++
++typedef struct HEVCRpiPassQueue
++{
++// int pending;
++ volatile int terminate;
++ sem_t sem_in;
++ sem_t * psem_out;
++ unsigned int job_n;
++ struct HEVCRpiContext * context; // Context pointer as we get to pass a single "void * this" to the thread
++ HEVCRpiWorkerFn * worker;
++ pthread_t thread;
++ uint8_t pass_n; // Pass number - debug
++ uint8_t started;
++} HEVCRpiPassQueue;
++
++
++struct HEVCRpiJobGlobal;
++
++typedef struct HEVCRpiJobCtl
++{
++ sem_t sem_out;
++
++ HEVCRpiJob * volatile jb1; // The job associated with this frame if unallocated - NULL if allocated
++ struct HEVCRpiJobGlobal * jbg;
++
++ HEVCRpiLocalContext * lcw_head;
++ HEVCRpiLocalContext * lcw_tail;
++
++ pthread_mutex_t in_lock;
++ int offload_in;
++
++ HEVCRpiJob *offloadq[RPI_MAX_JOBS];
++} HEVCRpiJobCtl;
++
++
++typedef struct HEVCRpiJobGlobal
++{
++ intptr_t ref_count;
++ pthread_mutex_t lock;
++ HEVCRpiJob * free1; // Singly linked list of free jobs
++ HEVCRpiLocalContext * wait_head; // Double linked list of lcs waiting for a job
++ HEVCRpiLocalContext * wait_good; // Last good tail
++ HEVCRpiLocalContext * wait_tail;
++
++} HEVCRpiJobGlobal;
++
++#define RPI_BIT_THREADS (RPI_EXTRA_BIT_THREADS + 1)
++
++#if RPI_TSTATS
++typedef struct HEVCRpiStats {
++ int y_pred1_y8_merge;
++ int y_pred1_xy;
++ int y_pred1_x0;
++ int y_pred1_y0;
++ int y_pred1_x0y0;
++ int y_pred1_wle8;
++ int y_pred1_wgt8;
++ int y_pred1_hle16;
++ int y_pred1_hgt16;
++ int y_pred2_xy;
++ int y_pred2_x0;
++ int y_pred2_y0;
++ int y_pred2_x0y0;
++ int y_pred2_hle16;
++ int y_pred2_hgt16;
++} HEVCRpiStats;
++#endif
++
++
++typedef struct HEVCRpiContext {
++ const AVClass *c; // needed by private avoptions
++ AVCodecContext *avctx;
++
++ struct HEVCRpiContext *sList[MAX_NB_THREADS];
++
++ HEVCRpiLocalContext *HEVClcList[MAX_NB_THREADS];
++ HEVCRpiLocalContext *HEVClc;
++
++ uint8_t threads_type;
++ uint8_t threads_number;
++
++ int width;
++ int height;
++
++ char used_for_ref; // rpi
++ char offload_recon;
++
++ HEVCRpiJobCtl * jbc;
++
++ // Function pointers
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++ const uint8_t * qpu_dummy_frame_emu;
++#endif
++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
++ uint32_t qpu_dummy_frame_qpu; // Not a frame - just a bit of memory
++#endif
++ HEVCRpiQpu qpu;
++
++ HEVCRpiFrameProgressState progress_states[2];
++
++#ifdef RPI_DEBLOCK_VPU
++// With the new scheme of rpi_execute_dblk_cmds
++// it looks like ff_hevc_rpi_hls_filter is no longer called in raster order.
++// This causes trouble if RPI_DEBLOCK_VPU_Q_COUNT > 1 because we prepare setup
++// data for more than one row at a time before triggering the deblocker for one row.
++// This means that the deblock of the final row can use the wrong setup buffer.
++//
++// There is also a concern that thread progress and waiting for job completion
++// are not done correctly with RPI_DEBLOCK_VPU at the end of the frame, or for small CTU sizes.
++#define RPI_DEBLOCK_VPU_Q_COUNT 1
++
++ int enable_rpi_deblock;
++
++ int uv_setup_width;
++ int uv_setup_height;
++ int setup_width; // Number of 16x16 blocks across the image
++ int setup_height; // Number of 16x16 blocks down the image
++
++ struct dblk_vpu_q_s
++ {
++ GPU_MEM_PTR_T deblock_vpu_gmem;
++
++ uint8_t (*y_setup_arm)[2][2][2][4];
++ uint8_t (*y_setup_vc)[2][2][2][4];
++
++ uint8_t (*uv_setup_arm)[2][2][2][4];
++ uint8_t (*uv_setup_vc)[2][2][2][4];
++
++ int (*vpu_cmds_arm)[6]; // r0-r5 for each command
++ int vpu_cmds_vc;
++
++ vpu_qpu_wait_h cmd_id;
++ } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT];
++
++ struct dblk_vpu_q_s * dvq;
++ unsigned int dvq_n;
++#endif
++
++ uint8_t *cabac_state;
++
++ /** 1 if the independent slice segment header was successfully parsed */
++ uint8_t slice_initialized;
++
++ AVFrame *frame;
++ AVFrame *output_frame;
++ uint8_t *sao_pixel_buffer_h[3];
++ uint8_t *sao_pixel_buffer_v[3];
++
++ HEVCRpiParamSets ps;
++
++ AVBufferPool *tab_mvf_pool;
++ AVBufferPool *rpl_tab_pool;
++
++ ///< candidate references for the current frame
++ RefPicList rps[5];
++
++ SliceHeader sh;
++ SAOParams *sao;
++ DBParams *deblock;
++ enum HEVCNALUnitType nal_unit_type;
++ int temporal_id; ///< temporal_id_plus1 - 1
++ HEVCFrame *ref;
++ HEVCFrame DPB[HEVC_DPB_ELS];
++ int poc;
++ int pocTid0;
++ int slice_idx; ///< number of the slice being currently decoded
++ int eos; ///< current packet contains an EOS/EOB NAL
++ int last_eos; ///< last packet contains an EOS/EOB NAL
++ int max_ra;
++ int bs_width;
++ int bs_height;
++
++ int is_decoded;
++ int no_rasl_output_flag;
++
++ HEVCPredContext hpc;
++ HEVCDSPContext hevcdsp;
++ VideoDSPContext vdsp;
++ BswapDSPContext bdsp;
++ int8_t *qp_y_tab;
++ uint8_t *horizontal_bs;
++ uint8_t *vertical_bs;
++
++ int32_t *tab_slice_address;
++
++ // CU
++ uint8_t *skip_flag;
++ uint8_t *tab_ct_depth;
++ // PU
++ uint8_t *tab_ipm;
++
++ uint8_t *cbf_luma; // cbf_luma of colocated TU
++ uint8_t *is_pcm;
++
++ // CTB-level flags affecting loop filter operation
++ uint8_t *filter_slice_edges;
++
++ /** used on BE to byteswap the lines for checksumming */
++ uint8_t *checksum_buf;
++ int checksum_buf_size;
++
++ /**
++ * Sequence counters for decoded and output frames, so that old
++ * frames are output first after a POC reset
++ */
++ uint16_t seq_decode;
++ uint16_t seq_output;
++
++ int enable_parallel_tiles;
++ atomic_int wpp_err;
++
++ const uint8_t *data;
++
++ H2645Packet pkt;
++ // type of the first VCL NAL of the current frame
++ enum HEVCNALUnitType first_nal_type;
++
++ uint8_t context_initialized;
++ int is_nalff; ///< this flag is != 0 if bitstream is encapsulated
++ ///< as a format defined in 14496-15
++ int apply_defdispwin;
++
++ int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
++ int nuh_layer_id;
++
++ HEVCSEIContext sei;
++
++ // Put structures that allocate non-trivial storage at the end
++ // These are mostly used indirectly so position in the structure doesn't matter
++ HEVCRpiLocalContextIntra HEVClcIntra;
++ HEVCRpiPassQueue passq[RPI_PASSES];
++#if RPI_EXTRA_BIT_THREADS > 0
++ int bt_started;
++ // This simply contains thread descriptors - task setup is held elsewhere
++ pthread_t bit_threads[RPI_EXTRA_BIT_THREADS];
++#endif
++#if RPI_TSTATS
++ HEVCRpiStats tstats;
++#endif
++} HEVCRpiContext;
++
++/**
++ * Mark all frames in DPB as unused for reference.
++ */
++void ff_hevc_rpi_clear_refs(HEVCRpiContext *s);
++
++/**
++ * Drop all frames currently in DPB.
++ */
++void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s);
++
++const RefPicList *ff_hevc_rpi_get_ref_list(const HEVCRpiContext * const s, const HEVCFrame * const ref,
++ int x0, int y0);
++
++/**
++ * Construct the reference picture sets for the current frame.
++ */
++int ff_hevc_rpi_frame_rps(HEVCRpiContext *s);
++
++/**
++ * Construct the reference picture list(s) for the current slice.
++ */
++int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s);
++
++void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc, int ctb_addr_ts);
++int ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, int ctb_addr_ts);
++int ff_hevc_rpi_sao_merge_flag_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_end_of_slice_flag_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_cu_transquant_bypass_flag_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_skip_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const int x0, const int y0, const int x_cb, const int y_cb);
++int ff_hevc_rpi_pred_mode_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_split_coding_unit_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int ct_depth,
++ const int x0, const int y0);
++int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size);
++int ff_hevc_rpi_pcm_flag_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_prev_intra_luma_pred_flag_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_merge_flag_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH);
++int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx);
++int ff_hevc_rpi_mvp_lx_flag_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_no_residual_syntax_flag_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_split_transform_flag_decode(HEVCRpiLocalContext * const lc, const int log2_trafo_size);
++int ff_hevc_rpi_cbf_cb_cr_decode(HEVCRpiLocalContext * const lc, const int trafo_depth);
++int ff_hevc_rpi_cbf_luma_decode(HEVCRpiLocalContext * const lc, const int trafo_depth);
++int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx);
++int ff_hevc_rpi_res_scale_sign_flag(HEVCRpiLocalContext *const lc, const int idx);
++
++/**
++ * Get the number of candidate references for the current frame.
++ */
++int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s);
++
++int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc);
++
++/**
++ * Find next frame in output order and put a reference to it in frame.
++ * @return 1 if a frame was output, 0 otherwise
++ */
++int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *frame, int flush);
++
++void ff_hevc_rpi_bump_frame(HEVCRpiContext *s);
++
++void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCFrame *frame, int flags);
++
++void ff_hevc_rpi_set_neighbour_available(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0,
++ const int nPbW, const int nPbH);
++void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW,
++ int nPbH, int log2_cb_size, int part_idx,
++ int merge_idx, MvField * const mv);
++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext *lc, int x0, int y0, int nPbW,
++ int nPbH, int log2_cb_size, int part_idx,
++ int merge_idx, MvField * const mv,
++ int mvp_lx_flag, int LX);
++void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase, int log2_cb_size);
++void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0,
++ int log2_trafo_size);
++int ff_hevc_rpi_cu_qp_delta_sign_flag(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_cu_qp_delta_abs(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_cu_chroma_qp_offset_flag(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
++void ff_hevc_rpi_hls_filter(HEVCRpiContext * const s, const int x, const int y, const int ctb_size);
++void ff_hevc_rpi_hls_filters(HEVCRpiContext *s, int x_ctb, int y_ctb, int ctb_size);
++void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++ const int x0, const int y0,
++ const int log2_trafo_size, const enum ScanType scan_idx,
++ const int c_idx);
++
++void ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc);
++
++
++extern const uint8_t ff_hevc_rpi_qpel_extra_before[4];
++extern const uint8_t ff_hevc_rpi_qpel_extra_after[4];
++extern const uint8_t ff_hevc_rpi_qpel_extra[4];
++
++int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n);
++
++// arm/hevc_misc_neon.S
++// Neon coeff zap fn
++#if HAVE_NEON
++extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2);
++#endif
++
++void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++ const HEVCFrame * const ref, const int val, const int field);
++
++void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field);
++
++// All of these expect that s->threads_type == FF_THREAD_FRAME
++
++static inline void ff_hevc_rpi_progress_wait_mv(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++ const HEVCFrame * const ref, const int y)
++{
++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1);
++}
++
++static inline void ff_hevc_rpi_progress_signal_mv(HEVCRpiContext * const s, const int y)
++{
++ if (s->used_for_ref)
++ ff_hevc_rpi_progress_signal_field(s, y, 1);
++}
++
++static inline void ff_hevc_rpi_progress_wait_recon(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++ const HEVCFrame * const ref, const int y)
++{
++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0);
++}
++
++static inline void ff_hevc_rpi_progress_signal_recon(HEVCRpiContext * const s, const int y)
++{
++ if (s->used_for_ref)
++ {
++ ff_hevc_rpi_progress_signal_field(s, y, 0);
++ }
++}
++
++static inline void ff_hevc_rpi_progress_signal_all_done(HEVCRpiContext * const s)
++{
++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0);
++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1);
++}
++
++
++// Set all done - signal nothing (used in missing refs)
++// Works for both rpi & non-rpi
++static inline void ff_hevc_rpi_progress_set_all_done(HEVCFrame * const ref)
++{
++ if (ref->tf.progress != NULL)
++ {
++ int * const p = (int *)&ref->tf.progress->data;
++ p[0] = INT_MAX;
++ p[1] = INT_MAX;
++ }
++}
++
++#define HEVC_RPI_420_ONLY 1
++#define HEVC_RPI_SAND128_ONLY 1
++
++static inline unsigned int ctx_hshift(const HEVCRpiContext * const s, const int cidx)
++{
++#if HEVC_RPI_420_ONLY
++ return cidx == 0 ? 0 : 1;
++#else
++ return s->ps.sps->hshift[cidx];
++#endif
++}
++
++static inline unsigned int ctx_vshift(const HEVCRpiContext * const s, const int cidx)
++{
++#if HEVC_RPI_420_ONLY
++ return cidx == 0 ? 0 : 1;
++#else
++ return s->ps.sps->vshift[cidx];
++#endif
++}
++
++static inline int ctx_cfmt(const HEVCRpiContext * const s)
++{
++#if HEVC_RPI_420_ONLY
++ return 1;
++#else
++ return s->ps.sps->chroma_format_idc;
++#endif
++}
++
++static inline int frame_stride1(const AVFrame * const frame, const int c_idx)
++{
++#if HEVC_RPI_SAND128_ONLY
++ return 128;
++#else
++ return frame->linesize[c_idx];
++#endif
++}
++
++#if HEVC_RPI_SAND128_ONLY
++// Propagate this decision to later zc includes
++#define RPI_ZC_SAND128_ONLY 1
++#endif
++
++#endif /* AVCODEC_RPI_HEVCDEC_H */
+diff --git a/libavcodec/rpi_hevcdsp.c b/libavcodec/rpi_hevcdsp.c
+new file mode 100644
+index 0000000000..3e4cfe8d46
+--- /dev/null
++++ b/libavcodec/rpi_hevcdsp.c
+@@ -0,0 +1,415 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
++ *
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "rpi_hevcdsp.h"
++
++static const int8_t transform[32][32] = {
++ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
++ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
++ { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
++ -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 },
++ { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90,
++ -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 },
++ { 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
++ 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 },
++ { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89,
++ 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 },
++ { 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
++ -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 },
++ { 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87,
++ -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 },
++ { 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31,
++ 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 },
++ { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83,
++ 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 },
++ { 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38,
++ -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 },
++ { 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80,
++ -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 },
++ { 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46,
++ 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 },
++ { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75,
++ 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 },
++ { 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54,
++ -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 },
++ { 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70,
++ -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 },
++ { 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61,
++ 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 },
++ { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64,
++ 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 },
++ { 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67,
++ -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 },
++ { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57,
++ -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 },
++ { 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73,
++ 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 },
++ { 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50,
++ 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 },
++ { 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78,
++ -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 },
++ { 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43,
++ -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43 },
++ { 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82,
++ 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38 },
++ { 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36,
++ 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 },
++ { 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85,
++ -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 },
++ { 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25,
++ -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 },
++ { 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88,
++ 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 },
++ { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18,
++ 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 },
++ { 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90,
++ -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 },
++ { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9,
++ -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 },
++ { 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90,
++ 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 },
++};
++
++DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_epel_filters[7][4]) = {
++ { -2, 58, 10, -2},
++ { -4, 54, 16, -2},
++ { -6, 46, 28, -4},
++ { -4, 36, 36, -4},
++ { -4, 28, 46, -6},
++ { -2, 16, 54, -4},
++ { -2, 10, 58, -2},
++};
++
++DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_qpel_filters[3][16]) = {
++ { -1, 4,-10, 58, 17, -5, 1, 0, -1, 4,-10, 58, 17, -5, 1, 0},
++ { -1, 4,-11, 40, 40,-11, 4, -1, -1, 4,-11, 40, 40,-11, 4, -1},
++ { 0, 1, -5, 17, 58,-10, 4, -1, 0, 1, -5, 17, 58,-10, 4, -1}
++};
++
++#define BIT_DEPTH 8
++#include "rpi_hevcdsp_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 9
++#include "rpi_hevcdsp_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 10
++#include "rpi_hevcdsp_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 12
++#include "rpi_hevcdsp_template.c"
++#undef BIT_DEPTH
++
++static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc,
++ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
++ const MvField *curr, const MvField *neigh, uint8_t *bs)
++{
++ for (; pus > 0; pus--) {
++ int strength, out;
++ int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
++ int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
++ int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]];
++ int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]];
++
++#if 1 // This more directly matches the original implementation
++ if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) {
++ // same L0 and L1
++ if (curr_refL0 == neigh_refL0 &&
++ curr_refL0 == curr_refL1 &&
++ neigh_refL0 == neigh_refL1) {
++ if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
++ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
++ (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
++ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
++ strength = 1;
++ else
++ strength = 0;
++ } else if (neigh_refL0 == curr_refL0 &&
++ neigh_refL1 == curr_refL1) {
++ if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
++ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
++ strength = 1;
++ else
++ strength = 0;
++ } else if (neigh_refL1 == curr_refL0 &&
++ neigh_refL0 == curr_refL1) {
++ if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
++ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
++ strength = 1;
++ else
++ strength = 0;
++ } else {
++ strength = 1;
++ }
++ } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
++ Mv curr_mv0, neigh_mv0;
++
++ if (curr->pred_flag & 1) {
++ curr_mv0 = curr->mv[0];
++ } else {
++ curr_mv0 = curr->mv[1];
++ curr_refL0 = curr_refL1;
++ }
++
++ if (neigh->pred_flag & 1) {
++ neigh_mv0 = neigh->mv[0];
++ } else {
++ neigh_mv0 = neigh->mv[1];
++ neigh_refL0 = neigh_refL1;
++ }
++
++ if (curr_refL0 == neigh_refL0) {
++ if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4)
++ strength = 1;
++ else
++ strength = 0;
++ } else
++ strength = 1;
++ } else
++ strength = 1;
++#else // This has exactly the same effect, but is more suitable for vectorisation
++ Mv curr_mv[2];
++ Mv neigh_mv[2];
++ memcpy(curr_mv, curr->mv, sizeof curr_mv);
++ memcpy(neigh_mv, neigh->mv, sizeof neigh_mv);
++
++ if (!(curr->pred_flag & 2)) {
++ curr_mv[1] = curr_mv[0];
++ curr_refL1 = curr_refL0;
++ }
++ if (!(neigh->pred_flag & 2)) {
++ neigh_mv[1] = neigh_mv[0];
++ neigh_refL1 = neigh_refL0;
++ }
++ if (!(curr->pred_flag & 1)) {
++ curr_mv[0] = curr_mv[1];
++ curr_refL0 = curr_refL1;
++ }
++ if (!(neigh->pred_flag & 1)) {
++ neigh_mv[0] = neigh_mv[1];
++ neigh_refL0 = neigh_refL1;
++ }
++
++ strength = 1;
++
++ strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
++ (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) |
++ (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4);
++
++ strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
++ (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) |
++ (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4);
++
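++ // pred_flag is an L0/L1 bit mask (PF_L0=1, PF_L1=2, PF_BI=3), so
++ // ((pf + 1) ^ (npf + 1)) >> 2 is non-zero exactly when one side is
++ // bi-pred and the other is not, forcing strength to 1 in that case
++ // just as the branchy version above does.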
++ strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
++#endif
++
++ curr += in_inc / sizeof (MvField);
++ neigh += in_inc / sizeof (MvField);
++
++ for (out = dup; out > 0; out--)
++ {
++ *bs = strength;
++ bs += out_inc;
++ }
++ }
++}
++
++void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
++{
++#undef FUNC
++#define FUNC(a, depth) a ## _ ## depth
++
++#undef PEL_FUNC
++#define PEL_FUNC(dst1, idx1, idx2, a, depth) \
++ for(i = 0 ; i < 10 ; i++) \
++{ \
++ hevcdsp->dst1[i][idx1][idx2] = a ## _ ## depth; \
++}
++
++#undef EPEL_FUNCS
++#define EPEL_FUNCS(depth) \
++ PEL_FUNC(put_hevc_epel, 0, 0, put_hevc_pel_pixels, depth); \
++ PEL_FUNC(put_hevc_epel, 0, 1, put_hevc_epel_h, depth); \
++ PEL_FUNC(put_hevc_epel, 1, 0, put_hevc_epel_v, depth); \
++ PEL_FUNC(put_hevc_epel, 1, 1, put_hevc_epel_hv, depth)
++
++#undef EPEL_UNI_FUNCS
++#define EPEL_UNI_FUNCS(depth) \
++ PEL_FUNC(put_hevc_epel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \
++ PEL_FUNC(put_hevc_epel_uni, 0, 1, put_hevc_epel_uni_h, depth); \
++ PEL_FUNC(put_hevc_epel_uni, 1, 0, put_hevc_epel_uni_v, depth); \
++ PEL_FUNC(put_hevc_epel_uni, 1, 1, put_hevc_epel_uni_hv, depth); \
++ PEL_FUNC(put_hevc_epel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \
++ PEL_FUNC(put_hevc_epel_uni_w, 0, 1, put_hevc_epel_uni_w_h, depth); \
++ PEL_FUNC(put_hevc_epel_uni_w, 1, 0, put_hevc_epel_uni_w_v, depth); \
++ PEL_FUNC(put_hevc_epel_uni_w, 1, 1, put_hevc_epel_uni_w_hv, depth)
++
++#undef EPEL_BI_FUNCS
++#define EPEL_BI_FUNCS(depth) \
++ PEL_FUNC(put_hevc_epel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \
++ PEL_FUNC(put_hevc_epel_bi, 0, 1, put_hevc_epel_bi_h, depth); \
++ PEL_FUNC(put_hevc_epel_bi, 1, 0, put_hevc_epel_bi_v, depth); \
++ PEL_FUNC(put_hevc_epel_bi, 1, 1, put_hevc_epel_bi_hv, depth); \
++ PEL_FUNC(put_hevc_epel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \
++ PEL_FUNC(put_hevc_epel_bi_w, 0, 1, put_hevc_epel_bi_w_h, depth); \
++ PEL_FUNC(put_hevc_epel_bi_w, 1, 0, put_hevc_epel_bi_w_v, depth); \
++ PEL_FUNC(put_hevc_epel_bi_w, 1, 1, put_hevc_epel_bi_w_hv, depth)
++
++#undef QPEL_FUNCS
++#define QPEL_FUNCS(depth) \
++ PEL_FUNC(put_hevc_qpel, 0, 0, put_hevc_pel_pixels, depth); \
++ PEL_FUNC(put_hevc_qpel, 0, 1, put_hevc_qpel_h, depth); \
++ PEL_FUNC(put_hevc_qpel, 1, 0, put_hevc_qpel_v, depth); \
++ PEL_FUNC(put_hevc_qpel, 1, 1, put_hevc_qpel_hv, depth)
++
++#undef QPEL_UNI_FUNCS
++#define QPEL_UNI_FUNCS(depth) \
++ PEL_FUNC(put_hevc_qpel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \
++ PEL_FUNC(put_hevc_qpel_uni, 0, 1, put_hevc_qpel_uni_h, depth); \
++ PEL_FUNC(put_hevc_qpel_uni, 1, 0, put_hevc_qpel_uni_v, depth); \
++ PEL_FUNC(put_hevc_qpel_uni, 1, 1, put_hevc_qpel_uni_hv, depth); \
++ PEL_FUNC(put_hevc_qpel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \
++ PEL_FUNC(put_hevc_qpel_uni_w, 0, 1, put_hevc_qpel_uni_w_h, depth); \
++ PEL_FUNC(put_hevc_qpel_uni_w, 1, 0, put_hevc_qpel_uni_w_v, depth); \
++ PEL_FUNC(put_hevc_qpel_uni_w, 1, 1, put_hevc_qpel_uni_w_hv, depth)
++
++#undef QPEL_BI_FUNCS
++#define QPEL_BI_FUNCS(depth) \
++ PEL_FUNC(put_hevc_qpel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \
++ PEL_FUNC(put_hevc_qpel_bi, 0, 1, put_hevc_qpel_bi_h, depth); \
++ PEL_FUNC(put_hevc_qpel_bi, 1, 0, put_hevc_qpel_bi_v, depth); \
++ PEL_FUNC(put_hevc_qpel_bi, 1, 1, put_hevc_qpel_bi_hv, depth); \
++ PEL_FUNC(put_hevc_qpel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \
++ PEL_FUNC(put_hevc_qpel_bi_w, 0, 1, put_hevc_qpel_bi_w_h, depth); \
++ PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \
++ PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth)
++
++#define SLICED_ADD_RESIDUAL(depth)\
++ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \
++ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \
++ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \
++ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \
++ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \
++ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \
++ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \
++ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \
++ hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \
++ hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \
++ hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \
++ hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \
++ hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \
++ hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \
++ hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth); \
++ hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \
++ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth)
++#define SLICED_LOOP_FILTERS(depth)\
++ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \
++ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \
++ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth)
++#define SLICED_SAO(depth)\
++ for (i = 0; i != SAO_FILTER_N; ++i) { \
++ hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \
++ hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth); \
++ } \
++ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \
++ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth)
++
++#define HEVC_DSP(depth) \
++ hevcdsp->put_pcm = FUNC(put_pcm, depth); \
++ hevcdsp->add_residual[0] = FUNC(add_residual4x4, depth); \
++ hevcdsp->add_residual[1] = FUNC(add_residual8x8, depth); \
++ hevcdsp->add_residual[2] = FUNC(add_residual16x16, depth); \
++ hevcdsp->add_residual[3] = FUNC(add_residual32x32, depth); \
++ hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \
++ hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \
++ hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \
++ hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \
++ SLICED_ADD_RESIDUAL(depth); \
++ hevcdsp->dequant = FUNC(dequant, depth); \
++ hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \
++ hevcdsp->transform_4x4_luma = FUNC(transform_4x4_luma, depth); \
++ hevcdsp->idct[0] = FUNC(idct_4x4, depth); \
++ hevcdsp->idct[1] = FUNC(idct_8x8, depth); \
++ hevcdsp->idct[2] = FUNC(idct_16x16, depth); \
++ hevcdsp->idct[3] = FUNC(idct_32x32, depth); \
++ \
++ hevcdsp->idct_dc[0] = FUNC(idct_4x4_dc, depth); \
++ hevcdsp->idct_dc[1] = FUNC(idct_8x8_dc, depth); \
++ hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \
++ hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \
++ \
++ for (i = 0; i != SAO_FILTER_N; ++i) { \
++ hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \
++ hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \
++ } \
++ hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \
++ hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \
++ SLICED_SAO(depth); \
++ \
++ QPEL_FUNCS(depth); \
++ QPEL_UNI_FUNCS(depth); \
++ QPEL_BI_FUNCS(depth); \
++ EPEL_FUNCS(depth); \
++ EPEL_UNI_FUNCS(depth); \
++ EPEL_BI_FUNCS(depth); \
++ \
++ SLICED_LOOP_FILTERS(depth); \
++ hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \
++ hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \
++ hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \
++ hevcdsp->hevc_v_loop_filter_chroma = FUNC(hevc_v_loop_filter_chroma, depth); \
++ hevcdsp->hevc_h_loop_filter_luma_c = FUNC(hevc_h_loop_filter_luma, depth); \
++ hevcdsp->hevc_v_loop_filter_luma_c = FUNC(hevc_v_loop_filter_luma, depth); \
++ hevcdsp->hevc_h_loop_filter_chroma_c = FUNC(hevc_h_loop_filter_chroma, depth); \
++ hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth)
++ int i = 0;
++
++ switch (bit_depth) {
++ case 9:
++ HEVC_DSP(9);
++ break;
++ case 10:
++ HEVC_DSP(10);
++ break;
++ case 12:
++ HEVC_DSP(12);
++ break;
++ default:
++ HEVC_DSP(8);
++ break;
++ }
++
++ hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
++
++ if (ARCH_PPC)
++ ff_hevc_rpi_dsp_init_ppc(hevcdsp, bit_depth);
++ if (ARCH_X86)
++ ff_hevc_rpi_dsp_init_x86(hevcdsp, bit_depth);
++ if (ARCH_ARM)
++ ff_hevcdsp_rpi_init_arm(hevcdsp, bit_depth);
++ if (ARCH_MIPS)
++ ff_hevc_rpi_dsp_init_mips(hevcdsp, bit_depth);
++}
+diff --git a/libavcodec/rpi_hevcdsp.h b/libavcodec/rpi_hevcdsp.h
+new file mode 100644
+index 0000000000..c974baa820
+--- /dev/null
++++ b/libavcodec/rpi_hevcdsp.h
+@@ -0,0 +1,182 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
++ *
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVCDSP_H
++#define AVCODEC_RPI_HEVCDSP_H
++
++#include "hevc.h"
++#include "get_bits.h"
++
++#define MAX_PB_SIZE 64
++
++typedef struct SAOParams {
++// int offset_abs[3][4]; ///< sao_offset_abs
++// int offset_sign[3][4]; ///< sao_offset_sign
++
++ uint8_t band_position[3]; ///< sao_band_position
++ uint8_t eo_class[3]; ///< sao_eo_class
++ uint8_t type_idx[3]; ///< sao_type_idx
++
++ int16_t offset_val[3][5]; ///< sao_offset_val
++ const int dc_v = dc >> 16;
++ const int dc_u = (dc << 16) >> 16;
++
++ stride /= sizeof(pixel);
++
++ for (y = 0; y < size; y++) {
++ for (x = 0; x < size * 2; x += 2) {
++ dst[x] = av_clip_pixel(dst[x] + dc_u);
++ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
++ }
++ dst += stride;
++ }
++}
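++
++// Worked example of the dc packing used above: for the interleaved
++// ("plaited") chroma layout a single 32-bit dc carries both planes,
++// U in the low half and V in the high half. dc = 0xFFFB0003 unpacks
++// as dc_v = dc >> 16 = -5 and dc_u = (dc << 16) >> 16 = 3, applied to
++// the even (U) and odd (V) sample lanes respectively.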
++
++
++static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res,
++ ptrdiff_t stride)
++{
++ FUNC(add_residual)(_dst, res, stride, 4);
++}
++
++static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res,
++ ptrdiff_t stride)
++{
++ FUNC(add_residual)(_dst, res, stride, 8);
++}
++
++static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res,
++ ptrdiff_t stride)
++{
++ FUNC(add_residual)(_dst, res, stride, 16);
++}
++
++static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
++ ptrdiff_t stride)
++{
++ FUNC(add_residual)(_dst, res, stride, 32);
++}
++
++static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++ FUNC(add_residual_dc)(_dst, stride, dc, 4);
++}
++
++static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++ FUNC(add_residual_dc)(_dst, stride, dc, 8);
++}
++
++static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++ FUNC(add_residual_dc)(_dst, stride, dc, 16);
++}
++
++static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++ FUNC(add_residual_dc)(_dst, stride, dc, 32);
++}
++
++// -- U -- (plaited)
++
++static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride, int dc_u)
++{
++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 4);
++}
++
++static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride, int dc_u)
++{
++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 8);
++}
++
++static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride, int dc_u)
++{
++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 16);
++}
++
++static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride, int dc_u)
++{
++ // Should never occur for 420, which is all that sand supports
++ av_assert0(0);
++}
++
++// -- V -- (plaited)
++
++static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride, int dc_v)
++{
++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 4);
++}
++
++static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride, int dc_v)
++{
++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 8);
++}
++
++static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride, int dc_v)
++{
++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 16);
++}
++
++static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride, int dc_v)
++{
++ // Should never occur for 420, which is all that sand supports
++ av_assert0(0);
++}
++
++// -- C -- (plaited - both U & V)
++
++static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride)
++{
++ FUNC(add_residual_c)(_dst, res, stride, 4);
++}
++
++static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride)
++{
++ FUNC(add_residual_c)(_dst, res, stride, 8);
++}
++
++static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride)
++{
++ FUNC(add_residual_c)(_dst, res, stride, 16);
++}
++
++static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res,
++ ptrdiff_t stride)
++{
++ // Should never occur for 420, which is all that sand supports
++ av_assert0(0);
++}
++
++static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++ FUNC(add_residual_dc_c)(_dst, stride, dc, 4);
++}
++
++static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++ FUNC(add_residual_dc_c)(_dst, stride, dc, 8);
++}
++
++static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++ FUNC(add_residual_dc_c)(_dst, stride, dc, 16);
++}
++
++static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++ // Should never occur for 420, which is all that sand supports
++ av_assert0(0);
++}
++
++
++static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
++{
++ int16_t *coeffs = (int16_t *) _coeffs;
++ int x, y;
++ int size = 1 << log2_size;
++
++ if (mode) {
++ coeffs += size;
++ for (y = 0; y < size - 1; y++) {
++ for (x = 0; x < size; x++)
++ coeffs[x] += coeffs[x - size];
++ coeffs += size;
++ }
++ } else {
++ for (y = 0; y < size; y++) {
++ for (x = 1; x < size; x++)
++ coeffs[x] += coeffs[x - 1];
++ coeffs += size;
++ }
++ }
++}
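++
++// RDPCM undoes the DPCM applied to transform-skipped residuals: each
++// sample is reconstructed by adding its predecessor in the scan
++// direction. mode != 0 accumulates down columns, mode == 0 along
++// rows, so a row of residuals {1, 2, 3, 4} becomes {1, 3, 6, 10}.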
++
++static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size)
++{
++ int shift = 15 - BIT_DEPTH - log2_size;
++ int x, y;
++ int size = 1 << log2_size;
++
++ if (shift > 0) {
++ int offset = 1 << (shift - 1);
++ for (y = 0; y < size; y++) {
++ for (x = 0; x < size; x++) {
++ *coeffs = (*coeffs + offset) >> shift;
++ coeffs++;
++ }
++ }
++ } else {
++ for (y = 0; y < size; y++) {
++ for (x = 0; x < size; x++) {
++ *coeffs = *coeffs << -shift;
++ coeffs++;
++ }
++ }
++ }
++}
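++
++// Example: at 8-bit depth with log2_size 2 (4x4), shift = 15 - 8 - 2
++// = 5, so each level is rescaled as (level + 16) >> 5; for 32x32 at
++// 12-bit, shift = 15 - 12 - 5 = -2 and levels are scaled up by << 2
++// instead.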
++
++#define SET(dst, x) (dst) = (x)
++#define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift)
++
++#define TR_4x4_LUMA(dst, src, step, assign) \
++ do { \
++ int c0 = src[0 * step] + src[2 * step]; \
++ int c1 = src[2 * step] + src[3 * step]; \
++ int c2 = src[0 * step] - src[3 * step]; \
++ int c3 = 74 * src[1 * step]; \
++ \
++ assign(dst[2 * step], 74 * (src[0 * step] - \
++ src[2 * step] + \
++ src[3 * step])); \
++ assign(dst[0 * step], 29 * c0 + 55 * c1 + c3); \
++ assign(dst[1 * step], 55 * c2 - 29 * c1 + c3); \
++ assign(dst[3 * step], 55 * c0 + 29 * c2 - c3); \
++ } while (0)
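++
++// This is the 4-point inverse DST used for 4x4 intra luma. Expanding
++// the first assign: dst[0] = 29*(s0 + s2) + 55*(s2 + s3) + 74*s1
++// = 29*s0 + 74*s1 + 84*s2 + 55*s3,
++// which matches the HEVC DST basis row {29, 74, 84, 55}.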
++
++static void FUNC(transform_4x4_luma)(int16_t *coeffs)
++{
++ int i;
++ int shift = 7;
++ int add = 1 << (shift - 1);
++ int16_t *src = coeffs;
++
++ for (i = 0; i < 4; i++) {
++ TR_4x4_LUMA(src, src, 4, SCALE);
++ src++;
++ }
++
++ shift = 20 - BIT_DEPTH;
++ add = 1 << (shift - 1);
++ for (i = 0; i < 4; i++) {
++ TR_4x4_LUMA(coeffs, coeffs, 1, SCALE);
++ coeffs += 4;
++ }
++}
++
++#undef TR_4x4_LUMA
++
++#define TR_4(dst, src, dstep, sstep, assign, end) \
++ do { \
++ const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep]; \
++ const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep]; \
++ const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep]; \
++ const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep]; \
++ \
++ assign(dst[0 * dstep], e0 + o0); \
++ assign(dst[1 * dstep], e1 + o1); \
++ assign(dst[2 * dstep], e1 - o1); \
++ assign(dst[3 * dstep], e0 - o0); \
++ } while (0)
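++
++// Partial butterfly for the 4-point inverse DCT: the even part e0/e1
++// uses the {64, 64} rows and the odd part o0/o1 the {83, 36} rows, so
++// e.g. dst[0] = 64*s0 + 83*s1 + 64*s2 + 36*s3. The larger TR_N
++// macros below build each size recursively from the half-size even
++// part plus an explicitly summed odd part.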
++
++#define TR_8(dst, src, dstep, sstep, assign, end) \
++ do { \
++ int i, j; \
++ int e_8[4]; \
++ int o_8[4] = { 0 }; \
++ for (i = 0; i < 4; i++) \
++ for (j = 1; j < end; j += 2) \
++ o_8[i] += transform[4 * j][i] * src[j * sstep]; \
++ TR_4(e_8, src, 1, 2 * sstep, SET, 4); \
++ \
++ for (i = 0; i < 4; i++) { \
++ assign(dst[i * dstep], e_8[i] + o_8[i]); \
++ assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]); \
++ } \
++ } while (0)
++
++#define TR_16(dst, src, dstep, sstep, assign, end) \
++ do { \
++ int i, j; \
++ int e_16[8]; \
++ int o_16[8] = { 0 }; \
++ for (i = 0; i < 8; i++) \
++ for (j = 1; j < end; j += 2) \
++ o_16[i] += transform[2 * j][i] * src[j * sstep]; \
++ TR_8(e_16, src, 1, 2 * sstep, SET, 8); \
++ \
++ for (i = 0; i < 8; i++) { \
++ assign(dst[i * dstep], e_16[i] + o_16[i]); \
++ assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]); \
++ } \
++ } while (0)
++
++#define TR_32(dst, src, dstep, sstep, assign, end) \
++ do { \
++ int i, j; \
++ int e_32[16]; \
++ int o_32[16] = { 0 }; \
++ for (i = 0; i < 16; i++) \
++ for (j = 1; j < end; j += 2) \
++ o_32[i] += transform[j][i] * src[j * sstep]; \
++ TR_16(e_32, src, 1, 2 * sstep, SET, end / 2); \
++ \
++ for (i = 0; i < 16; i++) { \
++ assign(dst[i * dstep], e_32[i] + o_32[i]); \
++ assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]); \
++ } \
++ } while (0)
++
++#define IDCT_VAR4(H) \
++ int limit2 = FFMIN(col_limit + 4, H)
++#define IDCT_VAR8(H) \
++ int limit = FFMIN(col_limit, H); \
++ int limit2 = FFMIN(col_limit + 4, H)
++#define IDCT_VAR16(H) IDCT_VAR8(H)
++#define IDCT_VAR32(H) IDCT_VAR8(H)
++
++#define IDCT(H) \
++static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs, \
++ int col_limit) \
++{ \
++ int i; \
++ int shift = 7; \
++ int add = 1 << (shift - 1); \
++ int16_t *src = coeffs; \
++ IDCT_VAR ## H(H); \
++ \
++ for (i = 0; i < H; i++) { \
++ TR_ ## H(src, src, H, H, SCALE, limit2); \
++ if (limit2 < H && i%4 == 0 && !!i) \
++ limit2 -= 4; \
++ src++; \
++ } \
++ \
++ shift = 20 - BIT_DEPTH; \
++ add = 1 << (shift - 1); \
++ for (i = 0; i < H; i++) { \
++ TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit); \
++ coeffs += H; \
++ } \
++}
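++
++// col_limit bounds the region of potentially non-zero coefficients so
++// the TR_N odd-part loops (via their "end" argument) can skip
++// multiplies by known-zero values. The first (vertical) pass also
++// shrinks limit2 every 4 columns once past the start, as significant
++// coefficients cluster towards the top-left of the block.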
++
++#define IDCT_DC(H) \
++static void FUNC(idct_ ## H ## x ## H ## _dc)(int16_t *coeffs) \
++{ \
++ int i, j; \
++ int shift = 14 - BIT_DEPTH; \
++ int add = 1 << (shift - 1); \
++ int coeff = (((coeffs[0] + 1) >> 1) + add) >> shift; \
++ \
++ for (j = 0; j < H; j++) { \
++ for (i = 0; i < H; i++) { \
++ coeffs[i + j * H] = coeff; \
++ } \
++ } \
++}
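++
++// DC-only inverse transform: every output sample takes the same
++// value. E.g. 8-bit with coeffs[0] = 64: shift = 14 - 8 = 6, so
++// coeff = (((64 + 1) >> 1) + 32) >> 6 = 1 fills the whole block.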
++
++IDCT( 4)
++IDCT( 8)
++IDCT(16)
++IDCT(32)
++
++IDCT_DC( 4)
++IDCT_DC( 8)
++IDCT_DC(16)
++IDCT_DC(32)
++
++#undef TR_4
++#undef TR_8
++#undef TR_16
++#undef TR_32
++
++#undef SET
++#undef SCALE
++
++static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ int16_t *sao_offset_val, int sao_left_class,
++ int width, int height)
++{
++ pixel *dst = (pixel *)_dst;
++ pixel *src = (pixel *)_src;
++ int offset_table[32] = { 0 };
++ int k, y, x;
++ int shift = BIT_DEPTH - 5;
++
++ stride_dst /= sizeof(pixel);
++ stride_src /= sizeof(pixel);
++
++ for (k = 0; k < 4; k++)
++ offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
++ dst += stride_dst;
++ src += stride_src;
++ }
++}
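++
++// Band offset: each pixel is classified into one of 32 equal bands by
++// its top 5 bits (shift = BIT_DEPTH - 5); only the 4 consecutive
++// bands starting at sao_left_class carry offsets. E.g. an 8-bit
++// pixel of 100 falls in band 100 >> 3 = 12, so with sao_left_class 10
++// it picks up sao_offset_val[3].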
++
++#define CMP(a, b) (((a) > (b)) - ((a) < (b)))
++
++static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
++ int eo, int width, int height) {
++
++ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
++ static const int8_t pos[4][2][2] = {
++ { { -1, 0 }, { 1, 0 } }, // horizontal
++ { { 0, -1 }, { 0, 1 } }, // vertical
++ { { -1, -1 }, { 1, 1 } }, // 45 degree
++ { { 1, -1 }, { -1, 1 } }, // 135 degree
++ };
++ pixel *dst = (pixel *)_dst;
++ pixel *src = (pixel *)_src;
++ int a_stride, b_stride;
++ int x, y;
++ ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel);
++ stride_dst /= sizeof(pixel);
++
++ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
++ b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++) {
++ int diff0 = CMP(src[x], src[x + a_stride]);
++ int diff1 = CMP(src[x], src[x + b_stride]);
++ int offset_val = edge_idx[2 + diff0 + diff1];
++ dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]);
++ }
++ src += stride_src;
++ dst += stride_dst;
++ }
++}
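++
++// Edge offset: each pixel is compared with its two neighbours along
++// the eo direction; diff0 + diff1 lies in [-2, 2] and edge_idx maps
++// that to a category (local minimum, edges, local maximum, or 0 =
++// flat, no offset). E.g. a pixel below both neighbours sums to -2
++// and takes edge_idx[0] = 1, the local-minimum offset.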
++
++
++#if BIT_DEPTH == 10
++// We need a 32 bit variation for the _c restores so hijack bit depth 10
++#undef pixel
++#undef BIT_DEPTH
++#define pixel uint32_t
++#define BIT_DEPTH 32
++// All 16 bit variations are the same
++#define sao_edge_restore_0_10 sao_edge_restore_0_9
++#define sao_edge_restore_1_10 sao_edge_restore_1_9
++#define sao_edge_restore_0_11 sao_edge_restore_0_9
++#define sao_edge_restore_1_11 sao_edge_restore_1_9
++#define sao_edge_restore_0_12 sao_edge_restore_0_9
++#define sao_edge_restore_1_12 sao_edge_restore_1_9
++#define sao_edge_restore_0_13 sao_edge_restore_0_9
++#define sao_edge_restore_1_13 sao_edge_restore_1_9
++#define sao_edge_restore_0_14 sao_edge_restore_0_9
++#define sao_edge_restore_1_14 sao_edge_restore_1_9
++#define sao_edge_restore_0_15 sao_edge_restore_0_9
++#define sao_edge_restore_1_15 sao_edge_restore_1_9
++#define sao_edge_restore_0_16 sao_edge_restore_0_9
++#define sao_edge_restore_1_16 sao_edge_restore_1_9
+#endif
-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
-new file mode 100644
-index 0000000000..82bf380eb4
---- /dev/null
-+++ b/libavcodec/rpi_shader.h
-@@ -0,0 +1,63 @@
-+#ifndef rpi_shader_H
-+#define rpi_shader_H
-+
-+extern unsigned int rpi_shader[];
-+
-+#define mc_setup_c_q0 (rpi_shader + 0)
-+#define mc_start (rpi_shader + 0)
-+#define mc_setup_c_qn (rpi_shader + 2)
-+#define mc_filter_c_p (rpi_shader + 142)
-+#define mc_filter_c_p_l1 (rpi_shader + 272)
-+#define mc_filter_c_b (rpi_shader + 402)
-+#define mc_sync_q0 (rpi_shader + 590)
-+#define mc_sync_q1 (rpi_shader + 608)
-+#define mc_sync_q2 (rpi_shader + 620)
-+#define mc_sync_q3 (rpi_shader + 632)
-+#define mc_sync_q4 (rpi_shader + 644)
-+#define mc_sync_q5 (rpi_shader + 662)
-+#define mc_sync_q6 (rpi_shader + 674)
-+#define mc_sync_q7 (rpi_shader + 686)
-+#define mc_sync_q8 (rpi_shader + 698)
-+#define mc_sync_q9 (rpi_shader + 716)
-+#define mc_sync_q10 (rpi_shader + 728)
-+#define mc_sync_q11 (rpi_shader + 740)
-+#define mc_exit_c_qn (rpi_shader + 752)
-+#define mc_exit_y_qn (rpi_shader + 752)
-+#define mc_exit_c_q0 (rpi_shader + 770)
-+#define mc_exit_y_q0 (rpi_shader + 770)
-+#define mc_setup_y_q0 (rpi_shader + 790)
-+#define mc_setup_y_qn (rpi_shader + 792)
-+#define mc_filter_y_pxx (rpi_shader + 1032)
-+#define mc_filter_y_bxx (rpi_shader + 1162)
-+#define mc_filter_y_p00 (rpi_shader + 1292)
-+#define mc_filter_y_b00 (rpi_shader + 1382)
-+#define mc_setup_c10_q0 (rpi_shader + 1462)
-+#define mc_setup_c10_qn (rpi_shader + 1464)
-+#define mc_filter_c10_p (rpi_shader + 1600)
-+#define mc_filter_c10_p_l1 (rpi_shader + 1728)
-+#define mc_filter_c10_b (rpi_shader + 1856)
-+#define mc_sync10_q0 (rpi_shader + 2042)
-+#define mc_sync10_q1 (rpi_shader + 2060)
-+#define mc_sync10_q2 (rpi_shader + 2072)
-+#define mc_sync10_q3 (rpi_shader + 2084)
-+#define mc_sync10_q4 (rpi_shader + 2096)
-+#define mc_sync10_q5 (rpi_shader + 2114)
-+#define mc_sync10_q6 (rpi_shader + 2126)
-+#define mc_sync10_q7 (rpi_shader + 2138)
-+#define mc_sync10_q8 (rpi_shader + 2150)
-+#define mc_sync10_q9 (rpi_shader + 2168)
-+#define mc_sync10_q10 (rpi_shader + 2180)
-+#define mc_sync10_q11 (rpi_shader + 2192)
-+#define mc_exit_c10_q0 (rpi_shader + 2204)
-+#define mc_exit_y10_q0 (rpi_shader + 2204)
-+#define mc_exit_c10_qn (rpi_shader + 2224)
-+#define mc_exit_y10_qn (rpi_shader + 2224)
-+#define mc_setup_y10_q0 (rpi_shader + 2242)
-+#define mc_setup_y10_qn (rpi_shader + 2244)
-+#define mc_filter_y10_pxx (rpi_shader + 2494)
-+#define mc_filter_y10_p00 (rpi_shader + 2624)
-+#define mc_filter_y10_bxx (rpi_shader + 2716)
-+#define mc_filter_y10_b00 (rpi_shader + 2846)
-+#define mc_end (rpi_shader + 2926)
++#if BIT_DEPTH <= 9 || BIT_DEPTH == 32
++static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
++ int *borders, int _width, int _height,
++ int c_idx, uint8_t *vert_edge,
++ uint8_t *horiz_edge, uint8_t *diag_edge)
++{
++ int x, y;
++ pixel *dst = (pixel *)_dst;
++ pixel *src = (pixel *)_src;
++ int sao_eo_class = sao->eo_class[c_idx];
++ int init_x = 0, width = _width, height = _height;
++
++ stride_dst /= sizeof(pixel);
++ stride_src /= sizeof(pixel);
++
++ if (sao_eo_class != SAO_EO_VERT) {
++ if (borders[0]) {
++ for (y = 0; y < height; y++) {
++ dst[y * stride_dst] = src[y * stride_src];
++ }
++ init_x = 1;
++ }
++ if (borders[2]) {
++ int offset = width - 1;
++ for (x = 0; x < height; x++) {
++ dst[x * stride_dst + offset] = src[x * stride_src + offset];
++ }
++ width--;
++ }
++ }
++ if (sao_eo_class != SAO_EO_HORIZ) {
++ if (borders[1]) {
++ for (x = init_x; x < width; x++)
++ dst[x] = src[x];
++ }
++ if (borders[3]) {
++ ptrdiff_t y_stride_dst = stride_dst * (height - 1);
++ ptrdiff_t y_stride_src = stride_src * (height - 1);
++ for (x = init_x; x < width; x++)
++ dst[x + y_stride_dst] = src[x + y_stride_src];
++ height--;
++ }
++ }
++}
++
++static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao,
++ int *borders, int _width, int _height,
++ int c_idx, uint8_t *vert_edge,
++ uint8_t *horiz_edge, uint8_t *diag_edge)
++{
++ int x, y;
++ pixel *dst = (pixel *)_dst;
++ pixel *src = (pixel *)_src;
++ int sao_eo_class = sao->eo_class[c_idx];
++ int init_x = 0, init_y = 0, width = _width, height = _height;
++
++ stride_dst /= sizeof(pixel);
++ stride_src /= sizeof(pixel);
++
++ if (sao_eo_class != SAO_EO_VERT) {
++ if (borders[0]) {
++ for (y = 0; y < height; y++) {
++ dst[y * stride_dst] = src[y * stride_src];
++ }
++ init_x = 1;
++ }
++ if (borders[2]) {
++ int offset = width - 1;
++ for (x = 0; x < height; x++) {
++ dst[x * stride_dst + offset] = src[x * stride_src + offset];
++ }
++ width--;
++ }
++ }
++ if (sao_eo_class != SAO_EO_HORIZ) {
++ if (borders[1]) {
++ for (x = init_x; x < width; x++)
++ dst[x] = src[x];
++ init_y = 1;
++ }
++ if (borders[3]) {
++ ptrdiff_t y_stride_dst = stride_dst * (height - 1);
++ ptrdiff_t y_stride_src = stride_src * (height - 1);
++ for (x = init_x; x < width; x++)
++ dst[x + y_stride_dst] = src[x + y_stride_src];
++ height--;
++ }
++ }
++
++ {
++ int save_upper_left = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
++ int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D && !borders[1] && !borders[2];
++ int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3];
++ int save_lower_left = !diag_edge[3] && sao_eo_class == SAO_EO_45D && !borders[0] && !borders[3];
++
++ // Restore pixels that can't be modified
++ if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) {
++ for(y = init_y+save_upper_left; y< height-save_lower_left; y++)
++ dst[y*stride_dst] = src[y*stride_src];
++ }
++ if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) {
++ for(y = init_y+save_upper_right; y< height-save_lower_right; y++)
++ dst[y*stride_dst+width-1] = src[y*stride_src+width-1];
++ }
++
++ if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) {
++ for(x = init_x+save_upper_left; x < width-save_upper_right; x++)
++ dst[x] = src[x];
++ }
++ if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) {
++ for(x = init_x+save_lower_left; x < width-save_lower_right; x++)
++ dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x];
++ }
++ if(diag_edge[0] && sao_eo_class == SAO_EO_135D)
++ dst[0] = src[0];
++ if(diag_edge[1] && sao_eo_class == SAO_EO_45D)
++ dst[width-1] = src[width-1];
++ if(diag_edge[2] && sao_eo_class == SAO_EO_135D)
++ dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1];
++ if(diag_edge[3] && sao_eo_class == SAO_EO_45D)
++ dst[stride_dst*(height-1)] = src[stride_src*(height-1)];
++
++ }
++}
++#endif
++#if BIT_DEPTH == 32
++#undef BIT_DEPTH
++#undef pixel
++#define BIT_DEPTH 10
++#define pixel uint16_t
++#endif
++
++// --- Plaited chroma versions
++
++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src,
++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
++ const int16_t *sao_offset_val_u, int sao_left_class_u,
++ const int16_t *sao_offset_val_v, int sao_left_class_v,
++ int width, int height)
++{
++ pixel *dst = (pixel *)_dst;
++ pixel *src = (pixel *)_src;
++ int offset_table_u[32] = { 0 };
++ int offset_table_v[32] = { 0 };
++ int k, y, x;
++ int shift = BIT_DEPTH - 5;
++
++ stride_dst /= sizeof(pixel);
++ stride_src /= sizeof(pixel);
++ width *= 2;
++
++ for (k = 0; k < 4; k++)
++ {
++ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1];
++ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1];
++ }
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x += 2)
++ {
++// printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift);
++// printf("offsets=%x,%x\n", src[x + 0], src[x + 1]);
++ // *** & 31 shouldn't be wanted but just now we generate broken input that
++ // crashes us in 10-bit world
++ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]);
++ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]);
++ }
++ dst += stride_dst;
++ src += stride_src;
++ }
++}
++
++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v,
++ int eo, int width, int height) {
+
-+#endif
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
-new file mode 100644
-index 0000000000..ba6cc13a95
---- /dev/null
-+++ b/libavcodec/rpi_shader.qasm
-@@ -0,0 +1,1741 @@
++ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
++ static const int8_t pos[4][2][2] = {
++ { { -1, 0 }, { 1, 0 } }, // horizontal
++ { { 0, -1 }, { 0, 1 } }, // vertical
++ { { -1, -1 }, { 1, 1 } }, // 45 degree
++ { { 1, -1 }, { -1, 1 } }, // 135 degree
++ };
++ pixel *dst = (pixel *)_dst;
++ pixel *src = (pixel *)_src;
++ int a_stride, b_stride;
++ int x, y;
++ ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel);
+
-+# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
-+# the warning that we are using rotation & ra/rb registers. r0..3 can be
-+# rotated through all 16 elems; ra regs can only be rotated through their
-+# local 4. As it happens this is what is wanted here as we do not want the
-+# constants from the other half of the calc.
++ stride_dst /= sizeof(pixel);
++ width *= 2;
+
-+# PREREAD is the number of requests that we have sitting in the TMU request
-+# queue.
-+#
-+# There are 8 slots availible in the TMU request Q for tm0s requests, but
-+# only 4 output FIFO entries and overflow is bad (corruption or crash)
-+# (If threaded then only 2 out FIFO entries, but we aren't.)
-+# In s/w we are effectively limited to the min vertical read which is >= 4
-+# so output FIFO is the limit.
-+#
-+# However in the current world there seems to be no benefit (and a small
-+# overhead) in setting this bigger than 2.
++ av_assert0(width <= 64);
+
-+.set PREREAD, 4
++ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src;
++ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src;
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x += 2) {
++ int diff0u = CMP(src[x], src[x + a_stride]);
++ int diff1u = CMP(src[x], src[x + b_stride]);
++ int offset_valu = edge_idx[2 + diff0u + diff1u];
++ int diff0v = CMP(src[x+1], src[x+1 + a_stride]);
++ int diff1v = CMP(src[x+1], src[x+1 + b_stride]);
++ int offset_valv = edge_idx[2 + diff0v + diff1v];
++ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]);
++ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]);
++ }
++ src += stride_src;
++ dst += stride_dst;
++ }
++}
+
-+# Block heights - 8 & 16 are the only numbers we currently support
++// Do once
++#if BIT_DEPTH == 8
++// Any old 2 byte 'normal' restore will work for these
++#define sao_edge_restore_c_0_8 sao_edge_restore_0_16
++#define sao_edge_restore_c_1_8 sao_edge_restore_1_16
++// We need 32 bit for 9 bit+
++#define sao_edge_restore_c_0_9 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_9 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_10 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_10 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_11 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_11 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_12 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_12 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_13 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_13 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_14 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_14 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_15 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_15 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_16 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_16 sao_edge_restore_1_32
++#endif
+
-+.set C_BLK_HEIGHT_8, 16
-+.set C_BLK_HEIGHT_16, 8
-+.set Y_BLK_HEIGHT_8, 16
-+.set Y_BLK_HEIGHT_16, 8
++#undef CMP
+
-+# QPU counts - depend on block size
-+# If we have a 2-byte format & block_size > 8 then can only afford
-+# 8 QPUs
-+# These numbers must match the numbers in rpi_shader_cmd.h
++////////////////////////////////////////////////////////////////////////////////
++//
++////////////////////////////////////////////////////////////////////////////////
++static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
+
-+.set N_QPU_8, 12
-+.set N_QPU_16, 12
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = src[x] << (14 - BIT_DEPTH);
++ src += srcstride;
++ dst += MAX_PB_SIZE;
++ }
++}
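++
++// All the put_hevc_* helpers below produce 14-bit intermediates so a
++// common precision is kept across bit depths: an 8-bit source pixel
++// is scaled by << 6, a 10-bit one by << 4. The uni/bi variants then
++// shift back down (with rounding) when writing pixels.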
+
-+# register allocation
-+#
++static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
+
-+# ra0-3
-+# Used as temp and may be loop filter coeffs (split into .8s)
-+# or temp in loop. Check usage on an individual basis.
++ for (y = 0; y < height; y++) {
++ memcpy(dst, src, width * sizeof(pixel));
++ src += srcstride;
++ dst += dststride;
++ }
++}
+
-+# ra4-7
-+# C: L0 H filter out FIFO
-+# otherwise -- free --
++static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++
++ int shift = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# ra8-11
-+# temp in some places - check usage
-+# Y: (with rb8-11) horiz out FIFO
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift);
++ src += srcstride;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
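++
++// Bi-pred combine: the local 14-bit sample and the precomputed src2
++// plane are summed and rounded back with shift = 15 - BIT_DEPTH.
++// E.g. 8-bit: pixel 100 -> 6400, src2[x] = 6400, offset = 64, giving
++// (6400 + 6400 + 64) >> 7 = 100, as expected for equal predictions.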
+
-+# ra12-15
-+# -- free --
++static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# uniform: width:height
-+.set ra_width_height, ra16
-+.set ra_width, ra16.16b
-+.set ra_height, ra16.16a
++ ox = ox * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox);
++ src += srcstride;
++ dst += dststride;
++ }
++}
+
-+# y:y2 same layout as y_y2_next so we can update both together
-+.set ra_y_y2, ra17
-+.set ra_y2, ra17.16a
-+.set ra_y, ra17.16b
++static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, int denom, int wx0, int wx1,
++ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
+
-+# uniform: L1 weight (U on left, V on right)
-+# Only used in Y B
-+.set ra_wt_off_mul_l1, ra18
-+.set ra_wt_off_l1, ra18.16b
-+.set ra_wt_mul_l1, ra18.16a
++ int shift = 14 + 1 - BIT_DEPTH;
++ int log2Wd = denom + shift - 1;
+
-+# y_next:y2_next same layout as y_y2 so we can update both together
-+.set ra_y_y2_next, ra19
-+.set ra_y_next, ra19.16b
-+.set ra_y2_next, ra19.16a
++ ox0 = ox0 * (1 << (BIT_DEPTH - 8));
++ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++) {
++ dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1));
++ }
++ src += srcstride;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
+
-+# Setup: consts - subdivide a single register
-+.set ra_kff100100, ra20
-+.set ra_k256, ra20.16a
-+.set ra_k0, ra20.8a
-+.set ra_k1, ra20.8b
-+.set ra_k16, ra20.8c
-+.set ra_k255, ra20.8d
++////////////////////////////////////////////////////////////////////////////////
++//
++////////////////////////////////////////////////////////////////////////////////
++#define QPEL_FILTER(src, stride) \
++ (filter[0] * src[x - 3 * stride] + \
++ filter[1] * src[x - 2 * stride] + \
++ filter[2] * src[x - stride] + \
++ filter[3] * src[x ] + \
++ filter[4] * src[x + stride] + \
++ filter[5] * src[x + 2 * stride] + \
++ filter[6] * src[x + 3 * stride] + \
++ filter[7] * src[x + 4 * stride])
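++
++// 8-tap luma interpolation: the tap set is chosen by the fractional
++// position (mx or my in 1..3 for 1/4-, 1/2- and 3/4-pel) via
++// ff_hevc_rpi_qpel_filters[mx - 1]. Taps span src[x-3]..src[x+4] and
++// sum to 64, so results carry 6 extra bits of precision.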
++
++static void FUNC(put_hevc_qpel_h)(int16_t *dst,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ dst += MAX_PB_SIZE;
++ }
++}
+
-+# Loop: xshifts
-+.set ra_xshift, ra21.16a
-+.set ra_xshift_next, ra21.16b
++static void FUNC(put_hevc_qpel_v)(int16_t *dst,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ dst += MAX_PB_SIZE;
++ }
++}
+
-+# Loop var: L0 weight (U on left, V on right)
-+# _off_ is not used in loop as we want to modify it before use
-+.set ra_wt_off_mul_l0, ra22
-+.set ra_wt_mul_l0, ra22.16a
-+.set ra_wt_off_l0, ra22.16b
++static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
++ uint8_t *_src,
++ ptrdiff_t _srcstride,
++ int height, intptr_t mx,
++ intptr_t my, int width)
++{
++ int x, y;
++ const int8_t *filter;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++
++ src -= QPEL_EXTRA_BEFORE * srcstride;
++ filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ for (y = 0; y < height + QPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
+
-+# Max pel value (for 8 bit we can get away with sat ops but not 9+)
-+# * Could merge with rb_pmask. For 10 bit Logically pmask needs 0xff in the
-+# 2nd byte but as the source should never be > 3 there 0x3ff should do
-+.set ra_blk_height_pmax, ra23
-+.set ra_pmax, ra23.16a
-+.set ra_blk_height, ra23.8c
-+# -- free -- ra23.8d
++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_qpel_filters[my - 1];
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
++ tmp += MAX_PB_SIZE;
++ dst += MAX_PB_SIZE;
++ }
++}
+
-+# Loop: src frame base (L0)
-+.set ra_base, ra24
++static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ int shift = 14 - BIT_DEPTH;
++
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# Loop: src frame base (L1)
-+.set ra_base2, ra25
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
++ src += srcstride;
++ dst += dststride;
++ }
++}
+
-+# Loop: next src frame base (L0)
-+.set ra_base_next, ra26
++static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
+
-+# -- free -- ra27
-+# -- free -- ra28
-+# -- free -- ra29
++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
+
-+# Use an even numbered register as a link register to avoid corrupting flags
-+.set ra_link, ra30
++ int shift = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# -- free -- ra31
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
++ src += srcstride;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
+
-+.set rb_xshift2, rb0
-+.set rb_xshift2_next, rb1
++static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
++ int shift = 14 - BIT_DEPTH;
++
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# C: (elem & 1) == 0 ? elem * 2 : (elem + 4) * 2
-+.set rb_elem_x, rb2
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
++ src += srcstride;
++ dst += dststride;
++ }
++}
+
-+# El Flags
-+# After adding to self we have el even/odd on nc/c and lo/hi on nn/n
-+.set rb_ef, rb3
+
-+# rb4-7
-+# C-B: L1 H filter out FIFO
-+# Y: (with ra2.8x) Y vertical filter coeffs
++static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
+
-+# rb8-11
-+# C: Vertical filter coeffs
-+# Y: (with ra8-11) horiz out FIFO
++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
+
-+# Loop var: offset to add before shift (round + weighting offsets)
-+# Exact value varies by loop
-+.set rb_wt_off, rb12
++ int shift = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# Setup: denom + 6 + 9
-+.set rb_wt_den_p15, rb13
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
++ src += srcstride;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
++
++static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ const int8_t *filter;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++ int shift = 14 - BIT_DEPTH;
++
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# -- free -- rb14
-+# -- free -- rb15
++ src -= QPEL_EXTRA_BEFORE * srcstride;
++ filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ for (y = 0; y < height + QPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
+
-+# Line pitch (128 for sand128)
-+.set rb_pitch, rb16
++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_qpel_filters[my - 1];
+
-+# Loop count - 2 (set up TMU for next xfer)
-+.set rb_i_tmu, rb17
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ }
++}
+
-+# Loop count for min(height, 16)
-+# Y will reset & loop again if height > 16
-+.set rb_lcount, rb18
++static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ const int8_t *filter;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++ int shift = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# frame_base2_next
-+.set rb_base2_next, rb19
++ src -= QPEL_EXTRA_BEFORE * srcstride;
++ filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ for (y = 0; y < height + QPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
+
-+# Setup: Height of Y+C in sand, (x&mask)*xpitch will give
-+# offset to the slice
-+.set rb_xpitch, rb20
++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_qpel_filters[my - 1];
+
-+# -- free -- rb21
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
+
-+# Setup: 0xff (8-bit) / 0xffff (9+ bit)
-+.set rb_pmask, rb22
++static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, int denom, int wx, int ox,
++ intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# Loop: destination address
-+.set rb_dest, rb23
++ ox = ox * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
++ src += srcstride;
++ dst += dststride;
++ }
++}
+
-+# vdw_setup_1(dst_pitch)
-+.set rb_dma1_base, rb24
++static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, int denom, int wx0, int wx1,
++ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
+
-+# Setup: pic width - 1
-+# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc.
-+.set rb_max_x, rb25
++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
+
-+# Loop: height<<23 + width<<16 + vdw_setup_0
-+.set rb_dma0, rb26
++ int shift = 14 + 1 - BIT_DEPTH;
++ int log2Wd = denom + shift - 1;
+
-+# vdw_setup_0 (depends on QPU number)
-+.set rb_dma0_base, rb27
++ ox0 = ox0 * (1 << (BIT_DEPTH - 8));
++ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
++ src += srcstride;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
+
-+# Setup: vw_setup value to reset VPM write pointer
-+.set rb_vpm_init, rb28
++static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, int denom, int wx, int ox,
++ intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
++ int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# Loop: vdw_setup_1(dst_pitch-width) = stride
-+.set rb_dma1, rb29
++ ox = ox * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
++ src += srcstride;
++ dst += dststride;
++ }
++}
+
-+# Setup: pic_height - 1
-+.set rb_max_y, rb30
++static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, int denom, int wx0, int wx1,
++ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
+
-+# -- free -- rb31
++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
+
++ int shift = 14 + 1 - BIT_DEPTH;
++ int log2Wd = denom + shift - 1;
+
++ ox0 = ox0 * (1 << (BIT_DEPTH - 8));
++ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
++ src += srcstride;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
+
++static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, int denom, int wx, int ox,
++ intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ const int8_t *filter;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++ int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
-+.set i_shift16, -16
-+.set i_shift21, -11
-+.set i_shift23, -9
-+.set i_shift30, -2
++ src -= QPEL_EXTRA_BEFORE * srcstride;
++ filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ for (y = 0; y < height + QPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
+
-+# Much of the setup code is common between Y & C
-+# Macros that express this - obviously these can't be overlapped
-+# so are probably unsuitable for loop code
++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_qpel_filters[my - 1];
+
-+.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma
-+ mov r2, qpu_num
-+.if v_bit_depth <= 8
-+ # 8 bit version
-+ asr r1, r2, 2
-+ shl r1, r1, 6
-+ and r0, r2, 3
-+ or r0, r0, r1
++ ox = ox * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ }
++}
+
-+ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-+ add r_vpm, r0, r1 # VPM 8bit storage
++static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, int denom, int wx0, int wx1,
++ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ const int8_t *filter;
++ pixel *src = (pixel*)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++ int shift = 14 + 1 - BIT_DEPTH;
++ int log2Wd = denom + shift - 1;
++
++ src -= QPEL_EXTRA_BEFORE * srcstride;
++ filter = ff_hevc_rpi_qpel_filters[mx - 1];
++ for (y = 0; y < height + QPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
+
-+ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-+ shl r0, r0, 5
++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_qpel_filters[my - 1];
+
-+.else
-+ # 16 bit version
-+ # Limited to 8 QPUs if blk height > 8
-+ asr r1, r2, 1
-+.if v_blk_height <= 8
-+ shl r1, r1, 4
-+.else
-+ shl r1, r1, 5
-+.endif
-+ and r0, r2, 1
-+ or r0, r0, r1
++ ox0 = ox0 * (1 << (BIT_DEPTH - 8));
++ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
+
-+ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR
-+ add r_vpm, r0, r1
++////////////////////////////////////////////////////////////////////////////////
++//
++////////////////////////////////////////////////////////////////////////////////
++#define EPEL_FILTER(src, stride) \
++ (filter[0] * src[x - stride] + \
++ filter[1] * src[x] + \
++ filter[2] * src[x + stride] + \
++ filter[3] * src[x + 2 * stride])
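++
++// 4-tap chroma interpolation: chroma MVs have 1/8-pel resolution, so
++// mx/my lie in 1..7 and select one of seven tap sets from
++// ff_hevc_rpi_epel_filters. Taps span src[x-1]..src[x+2] and, as
++// with the luma filter, sum to 64.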
+
-+ # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into
-+ # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg)
-+ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later
-+ shl r0, r0, 6
-+.endif
-+ add r_dma, r0, r1 # DMA out
-+.endm
++static void FUNC(put_hevc_epel_h)(int16_t *dst,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ dst += MAX_PB_SIZE;
++ }
++}
+
++static void FUNC(put_hevc_epel_v)(int16_t *dst,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
+
-+.macro m_setup_q0
-+ srel -, 12
-+.endm
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ dst += MAX_PB_SIZE;
++ }
++}
+
-+# Code start label
-+::mc_start
++static void FUNC(put_hevc_epel_hv)(int16_t *dst,
++ uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
+
-+################################################################################
-+# mc_setup_uv(next_kernel, x, y, ref_c_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
++ src -= EPEL_EXTRA_BEFORE * srcstride;
+
-+.macro m_setup_c, v_bit_depth
++ for (y = 0; y < height + EPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
+
-+# Cannot use mul24 on x as x might be -ve, so must use shift
-+.if v_bit_depth <= 8
-+.set v_x_shift, 1
-+.set v_pmask, 0xff
-+.set v_blk_height, C_BLK_HEIGHT_8
-+.else
-+.set v_x_shift, 2
-+.set v_pmask, 0xffff
-+.set v_blk_height, C_BLK_HEIGHT_16
-+.endif
++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_epel_filters[my - 1];
+
-+ mov tmurs, 1 # No swap TMUs
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
++ tmp += MAX_PB_SIZE;
++ dst += MAX_PB_SIZE;
++ }
++}
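The hv variants are separable: one horizontal pass into tmp_array, then a vertical pass over it. The `>> (BIT_DEPTH - 8)` in pass one keeps the intermediate at 14-bit scale so it fits int16_t; e.g. for a flat 10-bit source, where the gain-64 filter acts as a pure multiply:

    int sample = 0x3ff;                     // max 10-bit sample
    int pass1 = (sample * 64) >> (10 - 8);  // 16368: fits in int16_t (tmp_array)
    int pass2 = (pass1 * 64) >> 6;          // 16368: still at 14-bit scale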
+
-+# Load first request location
-+ mov ra0, unif # next_x_y
++static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++ int shift = 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
-+ shl rb_ef, r0, i_shift30
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
++ src += srcstride;
++ dst += dststride;
++ }
++}
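In the uni paths `shift = 14 - BIT_DEPTH` undoes the whole 14-bit intermediate scale in one step, with `offset` providing round-to-nearest. Worked through at 8 bits:

    int v = 100 * 64;                 // 14-bit intermediate for grey level 100
    int shift = 14 - 8;               // 6
    int offset = 1 << (shift - 1);    // 32
    int out = (v + offset) >> shift;  // (6400 + 32) >> 6 == 100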
+
-+ mov ra_base, unif # Store frame c base
++static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++ int shift = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# Read image dimensions
-+ sub r0, unif, 1 # pic c width
-+ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes
-+ sub rb_max_y, unif, 1 # pic c height
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++) {
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
++ }
++ dst += dststride;
++ src += srcstride;
++ src2 += MAX_PB_SIZE;
++ }
++}
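The bi paths add the second prediction (src2, already at 14-bit scale) and shift by one more bit, so the result is the rounded average of the two references. At 8 bits:

    int v = 100 * 64, s2 = 104 * 64;       // the two 14-bit predictions
    int shift = 14 + 1 - 8;                // 7
    int offset = 1 << (shift - 1);         // 64
    int out = (v + s2 + offset) >> shift;  // (6400 + 6656 + 64) >> 7 == 102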
+
-+# load constants
-+ mov ra_kff100100, 0xff100100
-+ mov rb_pmask, v_pmask
-+ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
++ int shift = 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# get source pitch
-+ mov rb_xpitch, unif # stride2
-+ mov rb_pitch, unif # stride1
-+ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly
-+ add rb_dma1_base, r1, rb_pitch # vdw_setup_1
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
++ src += srcstride;
++ dst += dststride;
++ }
++}
+
-+ and r0, 1, elem_num
-+ nop ; mul24 r0, r0, 5
-+.if v_bit_depth <= 8
-+ add rb_elem_x, r0, elem_num
-+.else
-+ add r0, r0, elem_num
-+ add rb_elem_x, r0, r0
-+.endif
++static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ int shift = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# Compute base address for first and second access
-+# ra_base ends up with t0s base
-+# ra_base2 ends up with t1s base
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
++ dst += dststride;
++ src += srcstride;
++ src2 += MAX_PB_SIZE;
++ }
++}
+
-+ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay]
-+ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice
-+ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y
-+ min r0, r0, rb_max_x
++static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++ int shift = 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# Get shift
-+# Shift will always calculate as 0 for 9+ bit
-+# Ideally we can optimize the shift out of the code in these cases but for now
-+# it is tidier to leave it in
-+.if v_bit_depth <= 8
-+ shl ra_xshift_next, r0, 3
-+.else
-+ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0
-+.endif
++ src -= EPEL_EXTRA_BEFORE * srcstride;
+
-+# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to
++ for (y = 0; y < height + EPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
+
-+.if v_bit_depth <= 8
-+ and r0, r0, -4
-+.endif
-+ sub r1, ra_k0, rb_pitch
-+ and r1, r0, r1
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1
-+ add ra_base, ra_base, r0
++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_epel_filters[my - 1];
+
-+ add rb_wt_den_p15, 23 - v_bit_depth, unif # denominator
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ }
++}
+
-+# Compute part of VPM to use for DMA output
-+# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop?
-+ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
++static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++ int shift = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# And again for L1, but only worrying about frame2 stuff
++ src -= EPEL_EXTRA_BEFORE * srcstride;
+
-+# Load first request location
-+ mov ra0, unif # next_x_y
++ for (y = 0; y < height + EPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
+
-+ mov ra_base2, unif # [ra0 delay] Store frame c base
++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_epel_filters[my - 1];
+
-+# Compute base address for first and second access
-+# ra_base ends up with t0s base
-+# ra_base2 ends up with t1s base
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
+
-+ shl r0, ra0.16b, v_x_shift
-+ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset
-+ max r0, r0, 0
-+ min r0, r0, rb_max_x
++static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++ int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# Get shift (already zero if 9+ bit so ignore)
-+.if v_bit_depth <= 8
-+ shl rb_xshift2_next, r0, 3
-+.endif
++ ox = ox * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++) {
++ dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
++ }
++ dst += dststride;
++ src += srcstride;
++ }
++}
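With explicit weighted prediction, `denom` scales the weight and `ox` is a DC offset applied after the shift. A worked instance at 8 bits with wx = 3, denom = 1 (an effective weight of 1.5):

    int v = 100 * 64;                // 14-bit intermediate
    int denom = 1, wx = 3, ox = 10;
    int shift = denom + 14 - 8;      // 7
    int offset = 1 << (shift - 1);   // 64
    int out = ((v * wx + offset) >> shift) + ox;  // 100 * 1.5 + 10 == 160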
+
-+# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs
++static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, int denom, int wx0, int wx1,
++ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++ int shift = 14 + 1 - BIT_DEPTH;
++ int log2Wd = denom + shift - 1;
++
++ ox0 = ox0 * (1 << (BIT_DEPTH - 8));
++ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
++ src += srcstride;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
+
-+.if v_bit_depth <= 8
-+ and r0, r0, -4
-+.endif
-+ sub r1, ra_k0, rb_pitch
-+ and r1, r0, r1
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 ; mov r2, ra_y2
-+ add ra_base2, ra_base2, r0
++static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
++ int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+# Do preloads
-+# r0 = ra_y, r2 = ra_y2
-+ mov r3, PREREAD ; mov r0, ra_y
++ ox = ox * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++) {
++ dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox);
++ }
++ dst += dststride;
++ src += srcstride;
++ }
++}
+
-+:1
-+ sub.setf r3, r3, 1
-+ max r1, r0, 0
-+ min r1, r1, rb_max_y
-+ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
-+ add t0s, ra_base, r1 ; mov ra_y, r0
++static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, int denom, int wx0, int wx1,
++ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ int shift = 14 + 1 - BIT_DEPTH;
++ int log2Wd = denom + shift - 1;
++
++ ox0 = ox0 * (1 << (BIT_DEPTH - 8));
++ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1));
++ src += srcstride;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
+
-+ max r1, r2, 0
-+ brr.anynz -, r:1b
-+ min r1, r1, rb_max_y
-+ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
-+ add t1s, ra_base2, r1 ; mov ra_y2, r2
-+# >>> .anynz 1b
++static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++ int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++ int offset = 1 << (shift - 1);
++#else
++ int offset = 0;
++#endif
+
-+ mov ra_link, unif # link
-+# touch registers to keep simulator happy
-+ # ra/b4..7: B0 -> B stash registers
-+ mov ra4, 0 ; mov rb4, 0
-+ bra -, ra_link
-+ mov ra5, 0 ; mov rb5, 0
-+ mov ra6, 0 ; mov rb6, 0
-+ mov ra7, 0 ; mov rb7, 0
-+# >>> ra_link
-+.endm
++ src -= EPEL_EXTRA_BEFORE * srcstride;
+
-+::mc_setup_c_q0
-+ m_setup_q0
-+::mc_setup_c_qn
-+ m_setup_c 8
++ for (y = 0; y < height + EPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
+
-+################################################################################
++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_epel_filters[my - 1];
+
-+# mc_filter_uv(next_kernel, x, y, frame_c_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst)
++ ox = ox * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ }
++}
+
-+# At this point we have already issued two pairs of texture requests for the current block
-+# ra_x, ra_x16_base point to the current coordinates for this block
++static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++ int16_t *src2,
++ int height, int denom, int wx0, int wx1,
++ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++ pixel *dst = (pixel *)_dst;
++ ptrdiff_t dststride = _dststride / sizeof(pixel);
++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
++ int16_t *tmp = tmp_array;
++ int shift = 14 + 1 - BIT_DEPTH;
++ int log2Wd = denom + shift - 1;
++
++ src -= EPEL_EXTRA_BEFORE * srcstride;
++
++ for (y = 0; y < height + EPEL_EXTRA; y++) {
++ for (x = 0; x < width; x++)
++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++ src += srcstride;
++ tmp += MAX_PB_SIZE;
++ }
+
-+.macro m_filter_c_p, v_tmu, v_bit_depth
++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++ filter = ff_hevc_rpi_epel_filters[my - 1];
+
-+.if v_bit_depth <= 8
-+.set v_x_shift, 1
-+.set v_x_mul, 2
-+.set v_v_shift, 8
-+# Shifts to get width & height in the right place in rb_dma0
-+.set v_dma_h_shift, 7
-+.set v_dma_wh_shift, i_shift16
-+.else
-+.set v_x_shift, 2
-+.set v_x_mul, 4
-+.set v_v_shift, i_shift16
-+# Shifts to get width & height in the right place in rb_dma0
-+.set v_dma_h_shift, 8
-+.set v_dma_wh_shift, 15
-+.endif
++ ox0 = ox0 * (1 << (BIT_DEPTH - 8));
++ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
++ for (y = 0; y < height; y++) {
++ for (x = 0; x < width; x++)
++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
++ ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
++ tmp += MAX_PB_SIZE;
++ dst += dststride;
++ src2 += MAX_PB_SIZE;
++ }
++}
+
-+.if v_tmu == 0
-+.set vrx_xshift, rb_xshift2 # b side more convenient
-+.set vrx_xshift_next, ra_xshift_next
-+.set vra_y_next, ra_y_next
-+.set vrx_base_next, ra_base_next
-+.set vra_y, ra_y
-+.set vra_base, ra_base
-+.set vr_txs, t0s
-+.else
-+.set vrx_xshift, ra_xshift # a side more convenient
-+.set vrx_xshift_next, rb_xshift2_next
-+.set vra_y_next, ra_y2_next
-+.set vrx_base_next, rb_base2_next
-+.set vra_y, ra_y2
-+.set vra_base, ra_base2
-+.set vr_txs, t1s
-+.endif
++// line zero
++#define P3 pix[-4 * xstride]
++#define P2 pix[-3 * xstride]
++#define P1 pix[-2 * xstride]
++#define P0 pix[-1 * xstride]
++#define Q0 pix[0 * xstride]
++#define Q1 pix[1 * xstride]
++#define Q2 pix[2 * xstride]
++#define Q3 pix[3 * xstride]
+
-+# per-channel shifts were calculated on the *previous* invocation
-+# get base addresses and per-channel shifts for *next* invocation
-+ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
++// line three. used only for deblocking decision
++#define TP3 pix[-4 * xstride + 3 * ystride]
++#define TP2 pix[-3 * xstride + 3 * ystride]
++#define TP1 pix[-2 * xstride + 3 * ystride]
++#define TP0 pix[-1 * xstride + 3 * ystride]
++#define TQ0 pix[0 * xstride + 3 * ystride]
++#define TQ1 pix[1 * xstride + 3 * ystride]
++#define TQ2 pix[2 * xstride + 3 * ystride]
++#define TQ3 pix[3 * xstride + 3 * ystride]
++
++static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
++ ptrdiff_t _xstride, ptrdiff_t _ystride,
++ int beta, int *_tc,
++ uint8_t *_no_p, uint8_t *_no_q)
++{
++ int d, j;
++ pixel *pix = (pixel *)_pix;
++ ptrdiff_t xstride = _xstride / sizeof(pixel);
++ ptrdiff_t ystride = _ystride / sizeof(pixel);
+
-+ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base
++ beta <<= BIT_DEPTH - 8;
+
-+ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0
-+ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
-+ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs
-+ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
-+ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++ for (j = 0; j < 2; j++) {
++ const int dp0 = abs(P2 - 2 * P1 + P0);
++ const int dq0 = abs(Q2 - 2 * Q1 + Q0);
++ const int dp3 = abs(TP2 - 2 * TP1 + TP0);
++ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0);
++ const int d0 = dp0 + dq0;
++ const int d3 = dp3 + dq3;
++ const int tc = _tc[j] << (BIT_DEPTH - 8);
++ const int no_p = _no_p[j];
++ const int no_q = _no_q[j];
+
-+.if v_bit_depth <= 8
-+ shl vrx_xshift_next, r0, 3
-+ and r0, r0, -4
-+.endif
-+ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs); x*2 already calculated
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs
-+ add vrx_base_next, r3, r0 ; mov r1, ra_height
++ if (d0 + d3 >= beta) {
++ pix += 4 * ystride;
++ continue;
++ } else {
++ const int beta_3 = beta >> 3;
++ const int beta_2 = beta >> 2;
++ const int tc25 = ((tc * 5 + 1) >> 1);
+
-+# set up VPM write
-+ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight
-+ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
-+ add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight
++ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 &&
++ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
++ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) {
++ // strong filtering
++ const int tc2 = tc << 1;
++ for (d = 0; d < 4; d++) {
++ const int p3 = P3;
++ const int p2 = P2;
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ const int q2 = Q2;
++ const int q3 = Q3;
++ if (!no_p) {
++ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
++ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
++ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
++ }
++ if (!no_q) {
++ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
++ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
++ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
++ }
++ pix += ystride;
++ }
++ } else { // normal filtering
++ int nd_p = 1;
++ int nd_q = 1;
++ const int tc_2 = tc >> 1;
++ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
++ nd_p = 2;
++ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
++ nd_q = 2;
+
-+# ; unpack filter coefficients
++ for (d = 0; d < 4; d++) {
++ const int p2 = P2;
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ const int q2 = Q2;
++ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
++ if (abs(delta0) < 10 * tc) {
++ delta0 = av_clip(delta0, -tc, tc);
++ if (!no_p)
++ P0 = av_clip_pixel(p0 + delta0);
++ if (!no_q)
++ Q0 = av_clip_pixel(q0 - delta0);
++ if (!no_p && nd_p > 1) {
++ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
++ P1 = av_clip_pixel(p1 + deltap1);
++ }
++ if (!no_q && nd_q > 1) {
++ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
++ Q1 = av_clip_pixel(q1 + deltaq1);
++ }
++ }
++ pix += ystride;
++ }
++ }
++ }
++ }
++}
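The gate at the top of each 4-line segment is HEVC's texture test: second differences on lines 0 and 3 measure how smooth each side of the edge is, and filtering proceeds only while their sum stays under beta. Restated as a self-contained sketch (the function and parameter names are illustrative):

    #include <stdlib.h>

    // pN[k]/qN[k]: the sample k pixels away from the edge on line N (0 or 3)
    static int edge_is_filtered(const int p0[3], const int q0[3],
                                const int p3[3], const int q3[3], int beta)
    {
        const int dp0 = abs(p0[2] - 2 * p0[1] + p0[0]);
        const int dq0 = abs(q0[2] - 2 * q0[1] + q0[0]);
        const int dp3 = abs(p3[2] - 2 * p3[1] + p3[0]);
        const int dq3 = abs(q3[2] - 2 * q3[1] + q3[0]);
        // Near-linear sides give small second differences; a large sum means
        // the step at the edge is probably real detail, so leave it alone.
        return (dp0 + dq0) + (dp3 + dq3) < beta;
    }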
+
-+ shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
-+ add r0, r0, r2 ; mov rb9, ra3.8b # Combine width and height of destination area (r0=h<<8, r2=w*2)
-+ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register
-+ add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight
++static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride,
++ ptrdiff_t _ystride, int *_tc,
++ uint8_t *_no_p, uint8_t *_no_q)
++{
++ int d, j, no_p, no_q;
++ pixel *pix = (pixel *)_pix;
++ ptrdiff_t xstride = _xstride / sizeof(pixel);
++ ptrdiff_t ystride = _ystride / sizeof(pixel);
+
-+ mov rb_dest, unif ; mov ra9, rb_max_y # dst_addr ; alias rb_max_y
++ for (j = 0; j < 2; j++) {
++ const int tc = _tc[j] << (BIT_DEPTH - 8);
++ if (tc <= 0) {
++ pix += 4 * ystride;
++ continue;
++ }
++ no_p = _no_p[j];
++ no_q = _no_q[j];
+
-+ shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
++ for (d = 0; d < 4; d++) {
++ int delta0;
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
++ if (!no_p)
++ P0 = av_clip_pixel(p0 + delta0);
++ if (!no_q)
++ Q0 = av_clip_pixel(q0 - delta0);
++ pix += ystride;
++ }
++ }
++}
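The chroma filter never touches more than P0/Q0, moving each by a tc-clipped delta. A worked instance with 8-bit samples:

    #include "libavutil/common.h"   // av_clip()

    int p1 = 60, p0 = 50, q0 = 70, q1 = 72, tc = 4;
    int delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
    // delta0 == 4 (9 before clipping), so P0 -> 54 and Q0 -> 66:
    // the 20-level step across the edge shrinks by 2 * delta0 == 8.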
+
-+ asr rb_wt_off, r1, 2 ; mov ra_link, unif # ; Link
-+ sub ra3, rb_wt_den_p15, ra_k1
++static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
++ int32_t *tc, uint8_t *no_p,
++ uint8_t *no_q)
++{
++ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q);
++}
+
-+# r5 = 0 (loop counter)
-+# ra9 = alias for rb_max_y
-+# ra_wt_mul_l0 = weight L0
-+# ra3 = weight denom + 22 - bit_depth [= rb_wt_den_p15 - 1, max 19]
-+# rb_wt_off = (offset * 2 + 1) << (ra3 - 1)
++static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
++ int32_t *tc, uint8_t *no_p,
++ uint8_t *no_q)
++{
++ FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q);
++}
+
-+# We want (r0r1)
-+# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2V5 : ...
-+# We fetch (after shift)
-+# C0 : C3 : C1 : C4 : C2 : C5 : ...
++static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
++ int beta, int32_t *tc, uint8_t *no_p,
++ uint8_t *no_q)
++{
++ FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
++ beta, tc, no_p, no_q);
++}
+
-+:1
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
++static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
++ int beta, int32_t *tc, uint8_t *no_p,
++ uint8_t *no_q)
++{
++ FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,
++ beta, tc, no_p, no_q);
++}
+
-+.if v_tmu == 0
-+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment
-+ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
-+ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
-+ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
-+.else
-+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 # loop counter increment
-+ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
-+ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
-+ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
-+.endif
++#undef P3
++#undef P2
++#undef P1
++#undef P0
++#undef Q0
++#undef Q1
++#undef Q2
++#undef Q3
+
-+ add vra_y, r3, ra_k1 ; mov r0, r1 << 15
-+ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
-+ min r3, r3, ra9 ; mov.ifnc r0, r2
++#undef TP3
++#undef TP2
++#undef TP1
++#undef TP0
++#undef TQ0
++#undef TQ1
++#undef TQ2
++#undef TQ3
+
-+ mov ra4, ra5 ; mul24 r2, r3, rb_pitch
-+ add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
++// line zero
++#define P3 pix_l[0 * xstride]
++#define P2 pix_l[1 * xstride]
++#define P1 pix_l[2 * xstride]
++#define P0 pix_l[3 * xstride]
++#define Q0 pix_r[0 * xstride]
++#define Q1 pix_r[1 * xstride]
++#define Q2 pix_r[2 * xstride]
++#define Q3 pix_r[3 * xstride]
+
-+# apply horizontal filter
-+# The filter coeffs for the two halves of this are the same (unlike in the
-+# Y case) so it doesn't matter which ra0 we get them from
-+# Also as the two halves are locked together we don't need to separate the 1st
-+# r0 mul or the last r1 mul as they are valid for all QPUs
++// line three. used only for deblocking decision
++#define TP3 pix_l[0 * xstride + 3 * ystride]
++#define TP2 pix_l[1 * xstride + 3 * ystride]
++#define TP1 pix_l[2 * xstride + 3 * ystride]
++#define TP0 pix_l[3 * xstride + 3 * ystride]
++#define TQ0 pix_r[0 * xstride + 3 * ystride]
++#define TQ1 pix_r[1 * xstride + 3 * ystride]
++#define TQ2 pix_r[2 * xstride + 3 * ystride]
++#define TQ3 pix_r[3 * xstride + 3 * ystride]
+
-+ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
-+ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
-+ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+ sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
++// This is identical to hevc_loop_filter_luma except that the P/Q
++// components are on separate pointers
++static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
++ unsigned int _stride, unsigned int beta, const int32_t _tc[2],
++ const uint8_t _no_p[2], const uint8_t _no_q[2],
++ uint8_t * _pix_l)
++{
++ int d, j;
++ pixel *pix_l = (pixel *)_pix_l;
++ pixel *pix_r = (pixel *)_pix_r;
++ const ptrdiff_t xstride = 1;
++ const ptrdiff_t ystride = _stride / sizeof(pixel);
+
-+# V filter = -ra4 * rb8 + ra5 * rb9 + ra6 * rb10 - ra7 * rb11 (post FIFO shift)
-+# Have to dup block as we need to move the brr - code is more common than it
-+# looks at first glance
-+.if v_bit_depth <= 8
-+ brr.anyn -, r:1b
-+ add r2, r2, r3 ; mov ra5, ra6
-+ mov ra6, ra7 ; mul24 r1, ra7, rb10
-+ sub ra7, r2, r0 ; mul24 r0, ra4, rb8
-+.else
-+ add r2, r2, r3 ; mov ra5, ra6
-+ brr.anyn -, r:1b
-+ mov ra6, ra7 ; mul24 r1, ra7, rb10
-+ sub r2, r2, r0 ; mul24 r0, ra4, rb8
-+ asr ra7, r2, v_bit_depth - 8
-+.endif
-+# >>> .anyn 1b
++ beta <<= BIT_DEPTH - 8;
+
-+ sub r1, r1, r0 ; mul24 r0, ra5, rb9 # [ra7 delay]
-+ add r1, r1, r0 ; mul24 r0, ra7, rb11
-+ sub r1, r1, r0
-+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
-+ asr r1, r1, 14
-+ nop ; mul24 r1, r1, ra_wt_mul_l0
-+ shl r1, r1, 8 ; mov r3, ra_blk_height
-+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
-+ brr.anyn -, r:1b
-+ asr r1, r1, ra3
-+ min r1, r1, ra_pmax ; mov -, vw_wait
-+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+# >>> .anyn 1b
++ for (j = 0; j < 2; j++) {
++ const int dp0 = abs(P2 - 2 * P1 + P0);
++ const int dq0 = abs(Q2 - 2 * Q1 + Q0);
++ const int dp3 = abs(TP2 - 2 * TP1 + TP0);
++ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0);
++ const int d0 = dp0 + dq0;
++ const int d3 = dp3 + dq3;
++ const int tc = _tc[j] << (BIT_DEPTH - 8);
++ const int no_p = _no_p[j];
++ const int no_q = _no_q[j];
+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height (currently always 16)
++ if (d0 + d3 >= beta) {
++ pix_l += 4 * ystride;
++ pix_r += 4 * ystride;
++ continue;
++ } else {
++ const int beta_3 = beta >> 3;
++ const int beta_2 = beta >> 2;
++ const int tc25 = ((tc * 5 + 1) >> 1);
+
-+# If looping again then we consumed 16 height last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc rb_dma0, rb_lcount based on new segment height
++ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 &&
++ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
++ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) {
++ // strong filtering
++ const int tc2 = tc << 1;
++ for (d = 0; d < 4; d++) {
++ const int p3 = P3;
++ const int p2 = P2;
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ const int q2 = Q2;
++ const int q3 = Q3;
++ if (!no_p) {
++ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
++ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
++ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
++ }
++ if (!no_q) {
++ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
++ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
++ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
++ }
++ pix_l += ystride;
++ pix_r += ystride;
++ }
++ } else { // normal filtering
++ int nd_p = 1;
++ int nd_q = 1;
++ const int tc_2 = tc >> 1;
++ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
++ nd_p = 2;
++ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
++ nd_q = 2;
++
++ for (d = 0; d < 4; d++) {
++ const int p2 = P2;
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ const int q2 = Q2;
++ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
++ if (abs(delta0) < 10 * tc) {
++ delta0 = av_clip(delta0, -tc, tc);
++ if (!no_p)
++ P0 = av_clip_pixel(p0 + delta0);
++ if (!no_q)
++ Q0 = av_clip_pixel(q0 - delta0);
++ if (!no_p && nd_p > 1) {
++ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
++ P1 = av_clip_pixel(p1 + deltap1);
++ }
++ if (!no_q && nd_q > 1) {
++ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
++ Q1 = av_clip_pixel(q1 + deltaq1);
++ }
++ }
++ pix_l += ystride;
++ pix_r += ystride;
++ }
++ }
++ }
++ }
++}
+
-+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++#undef TP3
++#undef TP2
++#undef TP1
++#undef TP0
++#undef TQ0
++#undef TQ1
++#undef TQ2
++#undef TQ3
+
-+# DMA out
-+ bra.anyz -, ra_link
-+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
-+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
-+ shl r1, r1, i_shift23
-+# >>> .anyz ra_link
++#undef P3
++#undef P2
++#undef P1
++#undef P0
++#undef Q0
++#undef Q1
++#undef Q2
++#undef Q3
+
-+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+ add rb_lcount, rb_lcount, r0
-+ brr -, r:1b
-+ add rb_dma0, rb_dma0, r1
-+ add rb_dest, rb_dest, r2
-+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
-+# >>> 1b
-+.endm
++#define P1 pix_l[0 * xstride]
++#define P0 pix_l[1 * xstride]
++#define Q0 pix_r[0 * xstride]
++#define Q1 pix_r[1 * xstride]
+
-+# At 10 bits
-+# Worst case +ve after 1st filter = 74 * 0x3ff >> 2 = 18925 0x49ed (15 bits)
-+# Worst case -ve after 1st filter = -10 * 0x3ff >> 2 = -10230
-+# after 2nd (really we can't get this) = 74 * 18925 + 10 * 10230 >> 6 = 23480 = 0x5bb8 (15 bits)
-+# (P)
-+# * weight (255) = 5987400 = 0x5b5c48 (23 bits)
-+# + 0x3ff << (13 - bit_depth + 7) = 0x6b5848 (23 bits)
-+# ... should be OK
-+#
-+# (B)
-+# *2 (L0+L1) = 5963920 = 0x5b0090 (23 bits)
-+# + (offset * 2 + 1) << (15 - bit_depth + 7) = 5963920 + (0x3ff << 12) = 5963920 + 4190208 = 10154128 = 0x9af090 (24 bits)
-+# So signed overflow if we sign extend here :-(
-+#
-+# In practice this doesn't happen (we need a maximal offset and a very unlucky
-+# filter).
-+#
-+# This could be fixed by offsetting the filters s.t. they are unsigned until
-+# weight mul and then removing the offset with the weighting offset (I think
-+# this should work) or splitting the rounding & offsetting
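The worst-case arithmetic in the comment above can be checked directly; this scalar sketch mirrors its numbers (all constants are taken from the comment, with 254 = 2 * 127 standing in for the doubled L0+L1 weight):

    #include <assert.h>

    static void check_10bit_ranges(void)
    {
        const int s1 = (74 * 0x3ff) >> 2;                   // 18925 == 0x49ed
        const int s2 = (74 * s1 + 10 * 10230) >> 6;         // 23480 == 0x5bb8
        const int p  = s2 * 255 + (0x3ff << (13 - 10 + 7)); // 0x6b5848
        const int b  = s2 * 254 + (0x3ff << (15 - 10 + 7)); // 0x9af090
        assert(s1 == 18925 && s2 == 23480);
        assert(p <  (1 << 23));  // (P) stays inside 23 bits
        assert(b >= (1 << 23));  // (B) spills into bit 24 - the overflow noted
    }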
++static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride,
++ ptrdiff_t _ystride, const int32_t *_tc,
++ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r)
++{
++ int d, j, no_p, no_q;
++ pixel *pix_l = (pixel *)_pix_l;
++ pixel *pix_r = (pixel *)_pix_r;
++ ptrdiff_t xstride = _xstride / sizeof(pixel);
++ ptrdiff_t ystride = _ystride / sizeof(pixel);
+
-+::mc_filter_c_p
-+ m_filter_c_p 0, 8
++ for (j = 0; j < 2; j++) {
++ const int tc = _tc[j] << (BIT_DEPTH - 8);
++ if (tc <= 0) {
++ pix_l += 4 * ystride;
++ pix_r += 4 * ystride;
++ continue;
++ }
++ no_p = _no_p[j];
++ no_q = _no_q[j];
+
-+::mc_filter_c_p_l1
-+ m_filter_c_p 1, 8
++ for (d = 0; d < 4; d++) {
++ int delta0;
++ const int p1 = P1;
++ const int p0 = P0;
++ const int q0 = Q0;
++ const int q1 = Q1;
++ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
++ if (!no_p)
++ P0 = av_clip_pixel(p0 + delta0);
++ if (!no_q)
++ Q0 = av_clip_pixel(q0 - delta0);
++ pix_l += ystride;
++ pix_r += ystride;
++ }
++ }
++}
+
-+################################################################################
++static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4,
++ unsigned int no_f)
++{
++ uint8_t no_p[2] = {no_f & 1, no_f & 2};
++ uint8_t no_q[2] = {no_f & 4, no_f & 8};
++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24};
++ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q);
++ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q);
++}
+
-+# mc_filter_c_b
++static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
++ uint8_t * src_l,
++ unsigned int no_f)
++{
++ uint8_t no_p[2] = {no_f & 1, no_f & 2};
++ uint8_t no_q[2] = {no_f & 4, no_f & 8};
++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24};
++ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r);
++ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel));
++}
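Both wrappers unpack four per-edge tc values from one uint32_t and the four skip flags from bits 0..3 of no_f. The caller-side packing they imply looks like this (a sketch; pack_tc4 is illustrative, not part of the patch):

    #include <stdint.h>

    static uint32_t pack_tc4(const int32_t tc[4])
    {
        return  (uint32_t)(tc[0] & 0xff)        |
               ((uint32_t)(tc[1] & 0xff) <<  8) |
               ((uint32_t)(tc[2] & 0xff) << 16) |
               ((uint32_t)(tc[3] & 0xff) << 24);
    }
    // no_f: bit 0 = no_p[0], bit 1 = no_p[1], bit 2 = no_q[0], bit 3 = no_q[1]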
+
-+# At this point we have already issued two pairs of texture requests for the current block
-+# ra_x, ra_x16_base point to the current coordinates for this block
++#undef P1
++#undef P0
++#undef Q0
++#undef Q1
+
-+.macro m_filter_c_b, v_bit_depth
+diff --git a/libavcodec/rpi_hevcpred.c b/libavcodec/rpi_hevcpred.c
+new file mode 100644
+index 0000000000..f6db76482d
+--- /dev/null
++++ b/libavcodec/rpi_hevcpred.c
+@@ -0,0 +1,122 @@
++/*
++ * HEVC video Decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
+
-+.if v_bit_depth <= 8
-+.set v_x_shift, 1
-+.set v_v_shift, 8
-+# Shifts to get width & height in the right place in rb_dma0
-+.set v_dma_h_shift, 7
-+.set v_dma_wh_shift, i_shift16
-+.else
-+.set v_x_shift, 2
-+.set v_v_shift, i_shift16
-+# Shifts to get width & height in the right place in rb_dma0
-+.set v_dma_h_shift, 8
-+.set v_dma_wh_shift, 15
-+.endif
-+.set v_x_mul, (1 << v_x_shift)
++#include "rpi_hevcdec.h"
+
-+# per-channel shifts were calculated on the *previous* invocation
++#include "rpi_hevcpred.h"
+
-+# get base addresses and per-channel shifts for *next* invocation
-+ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
++#define PRED_C 0
++#define BIT_DEPTH 8
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
+
-+ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base
++#define BIT_DEPTH 9
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
+
-+ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0
-+ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
-+ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
-+ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
-+ min r0, r0, rb_max_x ; mov ra0, unif # L0 H filter coeffs
++#define BIT_DEPTH 10
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
+
-+.if v_bit_depth <= 8
-+ shl ra_xshift_next, r0, 3
-+.endif
++#define BIT_DEPTH 12
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++#undef PRED_C
+
-+ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs
-+ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs)
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height
-+ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B
++#define PRED_C 1
++#define BIT_DEPTH 8
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
+
-+# set up VPM write
++#define BIT_DEPTH 9
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
+
-+ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight
-+ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
-+ add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif # ; V weight
++#define BIT_DEPTH 10
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
+
-+ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2
-+ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base
-+ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register
-+ add rb_dma0, r0, rb_dma0_base ; mov ra1, unif # ; H filter coeffs
++#define BIT_DEPTH 12
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++#undef PRED_C
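Each inclusion of rpi_hevcpred_template.c stamps out one full set of predictors whose names carry the bit depth (plus a _c suffix when PRED_C is set). The mechanism, reduced to a self-contained sketch with illustrative names:

    #define PASTE2(a, d) a ## _ ## d
    #define PASTE1(a, d) PASTE2(a, d)           // extra level so BIT_DEPTH expands
    #define DEPTH_FUNC(a) PASTE1(a, BIT_DEPTH)

    #define BIT_DEPTH 8
    static int DEPTH_FUNC(twice)(int x) { return 2 * x; }  // defines twice_8
    #undef BIT_DEPTH
    #define BIT_DEPTH 10
    static int DEPTH_FUNC(twice)(int x) { return 2 * x; }  // defines twice_10
    #undef BIT_DEPTH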
+
-+# L1 - uniform layout could possibly be optimized
++void ff_hevc_rpi_pred_init(HEVCPredContext *hpc, int bit_depth)
++{
++#undef FUNC
++#define FUNC(a, depth) a ## _ ## depth
+
-+ shl r0, ra3.16b, v_x_shift # r0=x*2
-+ add r0, r0, rb_elem_x ; mov ra3, unif # ; V filter coeffs
-+ sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif # [ra3 delay] r1=pitch2 mask ; U offset/weight
-+ max r0, r0, r5 ; mov rb8, ra3.8a # ; start unpacking filter coeffs
-+ min r0, r0, rb_max_x ; mov rb9, ra3.8b
++#undef FUNCC
++#define FUNCC(a, depth) a ## _ ## depth ## _c
+
-+.if v_bit_depth <= 8
-+ shl rb_xshift2_next, r0, 3
-+.endif
++#define HEVC_PRED_Y(depth) \
++ hpc->intra_pred[0] = FUNC(intra_pred_2, depth); \
++ hpc->intra_pred[1] = FUNC(intra_pred_3, depth); \
++ hpc->intra_pred[2] = FUNC(intra_pred_4, depth); \
++ hpc->intra_pred[3] = FUNC(intra_pred_5, depth); \
++ hpc->pred_planar[0] = FUNC(pred_planar_0, depth); \
++ hpc->pred_planar[1] = FUNC(pred_planar_1, depth); \
++ hpc->pred_planar[2] = FUNC(pred_planar_2, depth); \
++ hpc->pred_planar[3] = FUNC(pred_planar_3, depth); \
++ hpc->pred_dc = FUNC(pred_dc, depth); \
++ hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \
++ hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \
++ hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \
++ hpc->pred_angular[3] = FUNC(pred_angular_3, depth);
+
-+ and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif # ; V offset/weight
-+ and r1, r0, r1 ; mov rb10, ra3.8c
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 ; mov rb_dest, unif # Add stripe offsets ; dst_addr
-+ add rb_base2_next, r3, r0
++#define HEVC_PRED_C(depth) \
++ hpc->intra_pred_c[0] = FUNCC(intra_pred_2, depth); \
++ hpc->intra_pred_c[1] = FUNCC(intra_pred_3, depth); \
++ hpc->intra_pred_c[2] = FUNCC(intra_pred_4, depth); \
++ hpc->intra_pred_c[3] = FUNCC(intra_pred_5, depth); \
++ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \
++ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \
++ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \
++ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \
++ hpc->pred_dc_c = FUNCC(pred_dc, depth); \
++ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \
++ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \
++ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \
++ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth);
+
-+ mov ra9, rb_max_y ; mov rb11, ra3.8d
-+ shl r1, ra_wt_off_l1, rb_wt_den_p15
-+ asr rb_wt_off, r1, 9 ; mov ra_link, unif # link
++#define HEVC_PRED(depth) \
++ HEVC_PRED_Y(depth); \
++ HEVC_PRED_C(depth);
+
-+# r5 loop counter
-+# ra0 H coeffs L0
-+# ra1 H coeffs L1
-+# ra2 V coeffs L0
-+# ra3 temp
-+# ra4-7 L0 H FIFO
-+# rb4-7 L1 H FIFO
-+# rb8-rb11 V coeffs L1
-+# ra9 rb_max_y alias
++ switch (bit_depth) {
++ case 9:
++ HEVC_PRED(9);
++ break;
++ case 10:
++ HEVC_PRED(10);
++ break;
++ case 12:
++ HEVC_PRED(12);
++ break;
++ default:
++ HEVC_PRED(8);
++ break;
++ }
+
-+:1
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment
-+ shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next
-+ shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
-+ add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
-+ add ra_y, 1, ra_y ; mov r3, ra_y
++ if (ARCH_MIPS)
++ ff_hevc_rpi_pred_init_mips(hpc, bit_depth);
++}
+diff --git a/libavcodec/rpi_hevcpred.h b/libavcodec/rpi_hevcpred.h
+new file mode 100644
+index 0000000000..03c6eb3295
+--- /dev/null
++++ b/libavcodec/rpi_hevcpred.h
+@@ -0,0 +1,57 @@
++/*
++ * HEVC video Decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
+
-+ max r3, r3, ra_k0 ; mov r0, r1 << 15
-+ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++#ifndef AVCODEC_RPI_HEVCPRED_H
++#define AVCODEC_RPI_HEVCPRED_H
+
-+ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
-+ add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
++#include <stddef.h>
++#include <stdint.h>
++#include "config.h"
+
-+# L0 H-filter
-+# H FIFO scrolls are spread all over this loop
-+ mov rb4, rb5 ; mov ra4, ra5 # ? Just moves
++struct HEVCRpiContext;
++struct HEVCRpiLocalContext;
+
-+ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
-+ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
-+ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra0.8d, r1
-+.if v_bit_depth <= 8
-+ sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1
-+.else
-+ sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1
-+ asr ra3, r2, (v_bit_depth - 8)
-+.endif
++typedef struct HEVCPredContext {
++ void (*intra_pred[4])(const struct HEVCRpiContext * const s, struct HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx);
+
-+ shr r2, r4, rb_xshift2 ; mov ra5, ra6
-+ shr r1, r2, v_v_shift ; mov r3, ra_y2
-+ add ra_y2, r3, ra_k1 ; mov rb6, rb7
++ void (*pred_planar[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride);
++ void (*pred_dc)(uint8_t *src, const uint8_t *top, const uint8_t *left,
++ ptrdiff_t stride, int log2_size, int c_idx);
++ void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int c_idx, int mode);
++ void (*intra_pred_c[4])(const struct HEVCRpiContext * const s, struct HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx);
+
-+ max r3, r3, ra_k0 ; mov r0, r1 << 15
-+ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride);
++ void (*pred_dc_c)(uint8_t *src, const uint8_t *top, const uint8_t *left,
++ ptrdiff_t stride, int log2_size, int c_idx);
++ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top,
++ const uint8_t *left, ptrdiff_t stride,
++ int c_idx, int mode);
++} HEVCPredContext;
+
-+ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
-+ add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
++void ff_hevc_rpi_pred_init(HEVCPredContext *hpc, int bit_depth);
++void ff_hevc_rpi_pred_init_mips(HEVCPredContext *hpc, int bit_depth);
+
-+# L1 H-filter
++#endif /* AVCODEC_RPI_HEVCPRED_H */
+diff --git a/libavcodec/rpi_hevcpred_template.c b/libavcodec/rpi_hevcpred_template.c
+new file mode 100644
+index 0000000000..4ee776f955
+--- /dev/null
++++ b/libavcodec/rpi_hevcpred_template.c
+@@ -0,0 +1,850 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
+
-+ and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0
-+ nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0
-+ nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
-+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+ sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1
-+# V filters - start in branch delay slots of H
-+# Final asr not needed for 8-bit but we can't (currently) save a whole instruction
-+ add r2, r2, r3 ; mul24 r1, rb5, ra2.8b
-+ brr.anyn -, r:1b
-+ mov ra6, ra7 ; mul24 r3, ra7, rb10
-+ sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a
-+ asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3
-+# >>> .anyn 1b
++#include "config.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/rpi_sand_fns.h"
++#include "bit_depth_template.c"
+
-+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c # [rb7 delay]
-+ add r1, r1, r0 ; mul24 r0, rb7, ra2.8d
-+ sub r2, r1, r0 ; mul24 r0, ra4, rb8
-+ sub r1, r3, r0 ; mul24 r0, ra5, rb9
-+ add r1, r1, r0 ; mul24 r0, ra7, rb11
-+ sub r1, r1, r0 ; mul24 r2, r2, ra_k256
++#include "rpi_hevcdec.h"
++#include "rpi_hevcpred.h"
+
-+ asr r2, r2, 14 ; mul24 r1, r1, ra_k256
-+ asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0
+
-+ add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 # rb_wt_off = (offsetL0 + offsetL1 + 1) << (rb_wt_den_p15 - 9)
-+ add r1, r1, r2 ; mov r3, ra_blk_height
++#define DUMP_PRED 0
+
-+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend
++#define POS(x, y) src[(x) + stride * (y)]
+
-+ brr.anyn -, r:1b
-+ asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3
-+ min r1, r1, ra_pmax ; mov -, vw_wait
-+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+# >>> .anyn 1b
++// INCLUDED_ONCE defined at EOF
++#ifndef INCLUDED_ONCE
++typedef uint8_t (* c8_dst_ptr_t)[2];
++typedef const uint8_t (* c8_src_ptr_t)[2];
++typedef uint16_t (* c16_dst_ptr_t)[2];
++typedef const uint16_t (* c16_src_ptr_t)[2];
+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height (currently always 16)
++// *** On ARM make these NEON registers
++typedef struct pixel4_16 {
++ uint16_t x[4];
++} pixel4_16;
++typedef struct pixel4_32 {
++ uint32_t x[4];
++} pixel4_32;
++static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x)
++{
++ pixel4_16 t = {{x, x, x, x}};
++ return t;
++}
++static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x)
++{
++ pixel4_32 t = {{x, x, x, x}};
++ return t;
++}
++#endif
+
-+# If looping again then we consumed 16 height last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc rb_dma0, rb_lcount based on new segment height
++#if PRED_C
++// For chroma we double pixel size so we copy pairs
++#undef pixel
++#undef pixel2
++#undef pixel4
++#undef dctcoef
++#undef INIT_CLIP
++#undef no_rnd_avg_pixel4
++#undef rnd_avg_pixel4
++#undef AV_RN2P
++#undef AV_RN4P
++#undef AV_RN4PA
++#undef AV_WN2P
++#undef AV_WN4P
++#undef AV_WN4PA
++#undef CLIP
++#undef FUNC
++#undef FUNCC
++#undef av_clip_pixel
++#undef PIXEL_SPLAT_X4
+
-+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++#if BIT_DEPTH == 8
++#define pixel uint16_t
++#define pixel4 pixel4_16
++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16
++#define cpel uint8_t
++#define c_src_ptr_t c8_src_ptr_t
++#define c_dst_ptr_t c8_dst_ptr_t
++#else
++#define pixel uint32_t
++#define pixel4 pixel4_32
++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32
++#define cpel uint16_t
++#define c_src_ptr_t c16_src_ptr_t
++#define c_dst_ptr_t c16_dst_ptr_t
++#endif
++#define AV_RN4P(p) (*(pixel4*)(p))
++#define AV_WN4P(p,x) (*(pixel4*)(p) = (x))
++#define FUNC(a) FUNC2(a, BIT_DEPTH, _c)
++#endif
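Redefining `pixel` to a type twice its natural width is what lets the one template body serve interleaved chroma: the U,V samples of a pixel always travel together as a single load/store. In isolation the idea is just this (a sketch):

    #include <stdint.h>
    #include <string.h>

    // 8-bit chroma: treat the interleaved U,V byte pair as one uint16_t so a
    // generic "copy a pixel" in the template moves both planes at once.
    static void copy_uv_pair(uint8_t *dst, const uint8_t *src)
    {
        uint16_t uv;
        memcpy(&uv, src, sizeof(uv));
        memcpy(dst, &uv, sizeof(uv));
    }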
+
-+# DMA out
-+ bra.anyz -, ra_link
-+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
-+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
-+ shl r1, r1, i_shift23
-+# >>> .anyz ra_link
+
-+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+ add rb_lcount, rb_lcount, r0
-+ brr -, r:1b
-+ add rb_dma0, rb_dma0, r1
-+ add rb_dest, rb_dest, r2
-+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
-+# >>> 1b
-+.endm
++// Get PW prior to horrid PRED_C trickery
++#if BIT_DEPTH == 8
++#define PW 1
++#else
++#define PW 2
++#endif
+
-+::mc_filter_c_b
-+ m_filter_c_b 8
+
-+################################################################################
-+# Exit code used by both Luma & Chroma so place between them to avoid I-cache
-+# conflicts
++#if DUMP_PRED && !defined(INCLUDED_ONCE)
++static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size)
++{
++ for (unsigned int y = 0; y != size; y++, data += stride * 2) {
++ for (unsigned int x = 0; x != size; x++) {
++ printf("%4d", data[x * 2]);
++ }
++ printf("\n");
++ }
++ printf("\n");
++}
++#endif
+
-+.macro m_exit_drain
-+.if PREREAD == 2
-+# Special case 2 as loop is wasteful
-+ nop ; nop ; ldtmu0
-+ nop ; nop ; ldtmu1
-+ nop ; nop ; ldtmu0
-+ mov -, vw_wait ; nop ; ldtmu1
-+.else
-+ mov.setf r3, PREREAD - 1
-+:1
-+ brr.anynz -, r:1b
-+ nop ; nop ; ldtmu0
-+ nop ; nop ; ldtmu1
-+ sub.setf r3, r3, 1
-+ # >>>
-+ mov -, vw_wait
-+.endif
-+.endm
++static av_always_inline void FUNC(intra_pred)(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0,
++ int log2_size, int c_idx_arg)
++{
++#define PU(x) \
++ ((x) >> s->ps.sps->log2_min_pu_size)
++#define MVF(x, y) \
++ (s->ref->tab_mvf[(x) + (y) * min_pu_width])
++#define MVF_PU(x, y) \
++ MVF(PU(x0 + ((x) * (1 << hshift))), PU(y0 + ((y) * (1 << vshift))))
++#define IS_INTRA(x, y) \
++ (MVF_PU(x, y).pred_flag == PF_INTRA)
++#define MIN_TB_ADDR_ZS(x, y) \
++ s->ps.pps->min_tb_addr_zs[(y) * (s->ps.sps->tb_mask+2) + (x)]
++#define EXTEND(ptr, val, len) \
++do { \
++ pixel4 pix = PIXEL_SPLAT_X4(val); \
++ for (i = 0; i < (len); i += 4) \
++ AV_WN4P(ptr + i, pix); \
++} while (0)
++
++#define EXTEND_RIGHT_CIP(ptr, start, length) \
++ for (i = start; i < (start) + (length); i += 4) \
++ if (!IS_INTRA(i, -1)) \
++ AV_WN4P(&ptr[i], a); \
++ else \
++ a = PIXEL_SPLAT_X4(ptr[i+3])
++#define EXTEND_LEFT_CIP(ptr, start, length) \
++ for (i = start; i > (start) - (length); i--) \
++ if (!IS_INTRA(i - 1, -1)) \
++ ptr[i - 1] = ptr[i]
++#define EXTEND_UP_CIP(ptr, start, length) \
++ for (i = (start); i > (start) - (length); i -= 4) \
++ if (!IS_INTRA(-1, i - 3)) \
++ AV_WN4P(&ptr[i - 3], a); \
++ else \
++ a = PIXEL_SPLAT_X4(ptr[i - 3])
++#define EXTEND_DOWN_CIP(ptr, start, length) \
++ for (i = start; i < (start) + (length); i += 4) \
++ if (!IS_INTRA(-1, i)) \
++ AV_WN4P(&ptr[i], a); \
++ else \
++ a = PIXEL_SPLAT_X4(ptr[i + 3])
++ // c_idx will always be 1 for _c versions and 0 for y
++ const unsigned int c_idx = PRED_C;
++ int i;
++ const unsigned int hshift = ctx_hshift(s, c_idx);
++ const unsigned int vshift = ctx_vshift(s, c_idx);
++ int size = (1 << log2_size);
++ int size_in_luma_h = size << hshift;
++ int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
++ int size_in_luma_v = size << vshift;
++ int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
++ const int x = x0 >> hshift;
++ const int y = y0 >> vshift;
++ int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
++ int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
+
-+# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair)
-+# All qpus start at the beginning and after that (group - 1) must have finished
-+# before (group) can start
-+#
-+# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain
-+# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important -
-+# lockup otherwise)
-+#
-+# There is some, currently ill defined, potential lockup if we have the VDM active
-+# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDM pipe too ??
-+#
-+# The code stalled when I had many waiters on a single sem so we have a
-+# "ripple" of srels to restart. Unsure why, may have been bug, but this works
-+# and we currently have both the memory & sems to support it.
-+.macro m_sync_q, n_qpu, n_quads
-+# Do not generate code for qpu >= quads * 4 - fns should never be called
-+.if n_qpu < n_quads * 4
-+ mov ra_link, unif # Can only branch to an a reg (not r0)
-+ mov -, vw_wait # [ra_link delay]
++ int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb);
+
-+.set n_sem_sync, n_qpu - (n_qpu % 4)
-+.set n_sem_in, n_qpu
-+.set n_sem_out, n_qpu + 1
++ const ptrdiff_t stride = frame_stride1(s->frame, c_idx) / sizeof(pixel);
++ pixel *const src = c_idx == 0 ?
++ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) :
++ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y);
+
-+.if n_qpu % 4 == 0
++ int min_pu_width = s->ps.sps->min_pu_width;
+
-+.set n_sem_quad_in, 12 + n_qpu / 4
-+.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads)
++ const enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
++ lc->tu.intra_pred_mode;
++ pixel4 a;
++ pixel left_array[2 * MAX_TB_SIZE + 1];
++#if !PRED_C
++ pixel filtered_left_array[2 * MAX_TB_SIZE + 1];
++#endif
++ pixel top_array[2 * MAX_TB_SIZE + 1];
++#if !PRED_C
++ pixel filtered_top_array[2 * MAX_TB_SIZE + 1];
++#endif
+
-+ sacq -, n_sem_sync
-+ sacq -, n_sem_sync
-+ sacq -, n_sem_sync
-+ bra -, ra_link
-+ sacq -, n_sem_quad_in
-+ srel -, n_sem_out
-+ srel -, n_sem_quad_out
++ pixel *left = left_array + 1;
++ pixel *top = top_array + 1;
++#if !PRED_C
++ pixel *filtered_left = filtered_left_array + 1;
++ pixel *filtered_top = filtered_top_array + 1;
++#endif
++ int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask);
++ int cand_left = lc->na.cand_left;
++ int cand_up_left = lc->na.cand_up_left;
++ int cand_up = lc->na.cand_up;
++ int cand_up_right = lc->na.cand_up_right && cur_tb_addr > MIN_TB_ADDR_ZS((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask, y_tb - 1);
+
-+.else
-+ bra -, ra_link
-+ srel -, n_sem_sync
-+ sacq -, n_sem_in
-+.if n_sem_out % 4 != 0
-+ srel -, n_sem_out
-+.else
-+ nop
-+.endif
-+.endif
-+.endif
-+.endm
++ int bottom_left_size = (FFMIN(y0 + 2 * size_in_luma_v, s->ps.sps->height) -
++ (y0 + size_in_luma_v)) >> vshift;
++ int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) -
++ (x0 + size_in_luma_h)) >> hshift;
+
-+.set v_quads8, N_QPU_8 / 4
++ pixel * src_l = src - 1;
++ pixel * src_u = src - stride;
++ pixel * src_ur = src_u + size;
+
-+::mc_sync_q0
-+ m_sync_q 0, v_quads8
-+::mc_sync_q1
-+ m_sync_q 1, v_quads8
-+::mc_sync_q2
-+ m_sync_q 2, v_quads8
-+::mc_sync_q3
-+ m_sync_q 3, v_quads8
-+::mc_sync_q4
-+ m_sync_q 4, v_quads8
-+::mc_sync_q5
-+ m_sync_q 5, v_quads8
-+::mc_sync_q6
-+ m_sync_q 6, v_quads8
-+::mc_sync_q7
-+ m_sync_q 7, v_quads8
-+::mc_sync_q8
-+ m_sync_q 8, v_quads8
-+::mc_sync_q9
-+ m_sync_q 9, v_quads8
-+::mc_sync_q10
-+ m_sync_q 10, v_quads8
-+::mc_sync_q11
-+ m_sync_q 11, v_quads8
++ {
++ // N.B. stride is in pixels (not bytes) or in the case of chroma pixel-pairs
++ const AVFrame * const frame = s->frame;
++ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2
++ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride;
++ if ((x & mask) == 0)
++ src_l -= stripe_adj;
++ if (((x + size) & mask) == 0)
++ src_ur += stripe_adj;
++ }
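++ // (Worked example: one to the left of a stripe's column 0 is, in memory,
++ // the end of the previous row of the same stripe; subtracting
++ // stripe_adj = (stride2 - 1) * stride re-aims src_l at the right-hand
++ // edge of the previous stripe on the same row, and src_ur gets the
++ // mirror-image fix at the right edge.)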
+
-+# mc_exit()
-+# Chroma & Luma the same now
++ if (s->ps.pps->constrained_intra_pred_flag == 1) {
++ int size_in_luma_pu_v = PU(size_in_luma_v);
++ int size_in_luma_pu_h = PU(size_in_luma_h);
++ int on_pu_edge_x = !av_mod_uintp2(x0, s->ps.sps->log2_min_pu_size);
++ int on_pu_edge_y = !av_mod_uintp2(y0, s->ps.sps->log2_min_pu_size);
++ if (!size_in_luma_pu_h)
++ size_in_luma_pu_h++;
++ if (cand_bottom_left == 1 && on_pu_edge_x) {
++ int x_left_pu = PU(x0 - 1);
++ int y_bottom_pu = PU(y0 + size_in_luma_v);
++ int max = FFMIN(size_in_luma_pu_v, s->ps.sps->min_pu_height - y_bottom_pu);
++ cand_bottom_left = 0;
++ for (i = 0; i < max; i += 2)
++ cand_bottom_left |= (MVF(x_left_pu, y_bottom_pu + i).pred_flag == PF_INTRA);
++ }
++ if (cand_left == 1 && on_pu_edge_x) {
++ int x_left_pu = PU(x0 - 1);
++ int y_left_pu = PU(y0);
++ int max = FFMIN(size_in_luma_pu_v, s->ps.sps->min_pu_height - y_left_pu);
++ cand_left = 0;
++ for (i = 0; i < max; i += 2)
++ cand_left |= (MVF(x_left_pu, y_left_pu + i).pred_flag == PF_INTRA);
++ }
++ if (cand_up_left == 1) {
++ int x_left_pu = PU(x0 - 1);
++ int y_top_pu = PU(y0 - 1);
++ cand_up_left = MVF(x_left_pu, y_top_pu).pred_flag == PF_INTRA;
++ }
++ if (cand_up == 1 && on_pu_edge_y) {
++ int x_top_pu = PU(x0);
++ int y_top_pu = PU(y0 - 1);
++ int max = FFMIN(size_in_luma_pu_h, s->ps.sps->min_pu_width - x_top_pu);
++ cand_up = 0;
++ for (i = 0; i < max; i += 2)
++ cand_up |= (MVF(x_top_pu + i, y_top_pu).pred_flag == PF_INTRA);
++ }
++ if (cand_up_right == 1 && on_pu_edge_y) {
++ int y_top_pu = PU(y0 - 1);
++ int x_right_pu = PU(x0 + size_in_luma_h);
++ int max = FFMIN(size_in_luma_pu_h, s->ps.sps->min_pu_width - x_right_pu);
++ cand_up_right = 0;
++ for (i = 0; i < max; i += 2)
++ cand_up_right |= (MVF(x_right_pu + i, y_top_pu).pred_flag == PF_INTRA);
++ }
++ memset(left, 128, 2 * MAX_TB_SIZE*sizeof(pixel));
++ memset(top , 128, 2 * MAX_TB_SIZE*sizeof(pixel));
++ top[-1] = 128;
++ }
++ if (cand_up_left) {
++ left[-1] = src_l[-stride];
++ top[-1] = left[-1];
++ }
++ if (cand_up)
++ // Always good - even with sand
++ memcpy(top, src_u, size * sizeof(pixel));
++ if (cand_up_right) {
++ memcpy(top + size, src_ur, top_right_size * sizeof(pixel));
++ EXTEND(top + size + top_right_size, top[size + top_right_size - 1],
++ size - top_right_size);
++ }
++ if (cand_left)
++ for (i = 0; i < size; i++)
++ left[i] = src_l[stride * i];
++ if (cand_bottom_left) {
++ for (i = size; i < size + bottom_left_size; i++)
++ left[i] = src_l[stride * i];
++ EXTEND(left + size + bottom_left_size, left[size + bottom_left_size - 1],
++ size - bottom_left_size);
++ }
+
-+.macro m_exit_qn
-+ m_exit_drain
-+ nop ; nop ; thrend
-+ nop
-+ nop
-+# >>> thrend <<<
-+.endm
++ if (s->ps.pps->constrained_intra_pred_flag == 1) {
++ if (cand_bottom_left || cand_left || cand_up_left || cand_up || cand_up_right) {
++ int size_max_x = x0 + ((2 * size) << hshift) < s->ps.sps->width ?
++ 2 * size : (s->ps.sps->width - x0) >> hshift;
++ int size_max_y = y0 + ((2 * size) << vshift) < s->ps.sps->height ?
++ 2 * size : (s->ps.sps->height - y0) >> vshift;
++ int j = size + (cand_bottom_left ? bottom_left_size : 0) - 1;
++ if (!cand_up_right) {
++ size_max_x = x0 + ((size) << hshift) < s->ps.sps->width ?
++ size : (s->ps.sps->width - x0) >> hshift;
++ }
++ if (!cand_bottom_left) {
++ size_max_y = y0 + (( size) << vshift) < s->ps.sps->height ?
++ size : (s->ps.sps->height - y0) >> vshift;
++ }
++ if (cand_bottom_left || cand_left || cand_up_left) {
++ while (j > -1 && !IS_INTRA(-1, j))
++ j--;
++ if (!IS_INTRA(-1, j)) {
++ j = 0;
++ while (j < size_max_x && !IS_INTRA(j, -1))
++ j++;
++ EXTEND_LEFT_CIP(top, j, j + 1);
++ left[-1] = top[-1];
++ }
++ } else {
++ j = 0;
++ while (j < size_max_x && !IS_INTRA(j, -1))
++ j++;
++ if (j > 0)
++ if (x0 > 0) {
++ EXTEND_LEFT_CIP(top, j, j + 1);
++ } else {
++ EXTEND_LEFT_CIP(top, j, j);
++ top[-1] = top[0];
++ }
++ left[-1] = top[-1];
++ }
++ left[-1] = top[-1];
++ if (cand_bottom_left || cand_left) {
++ a = PIXEL_SPLAT_X4(left[-1]);
++ EXTEND_DOWN_CIP(left, 0, size_max_y);
++ }
++ if (!cand_left)
++ EXTEND(left, left[-1], size);
++ if (!cand_bottom_left)
++ EXTEND(left + size, left[size - 1], size);
++ if (x0 != 0 && y0 != 0) {
++ a = PIXEL_SPLAT_X4(left[size_max_y - 1]);
++ EXTEND_UP_CIP(left, size_max_y - 1, size_max_y);
++ if (!IS_INTRA(-1, - 1))
++ left[-1] = left[0];
++ } else if (x0 == 0) {
++ EXTEND(left, 0, size_max_y);
++ } else {
++ a = PIXEL_SPLAT_X4(left[size_max_y - 1]);
++ EXTEND_UP_CIP(left, size_max_y - 1, size_max_y);
++ }
++ top[-1] = left[-1];
++ if (y0 != 0) {
++ a = PIXEL_SPLAT_X4(left[-1]);
++ EXTEND_RIGHT_CIP(top, 0, size_max_x);
++ }
++ }
++ }
++ // Infer the unavailable samples
++ if (!cand_bottom_left) {
++ if (cand_left) {
++ EXTEND(left + size, left[size - 1], size);
++ } else if (cand_up_left) {
++ EXTEND(left, left[-1], 2 * size);
++ cand_left = 1;
++ } else if (cand_up) {
++ left[-1] = top[0];
++ EXTEND(left, left[-1], 2 * size);
++ cand_up_left = 1;
++ cand_left = 1;
++ } else if (cand_up_right) {
++ EXTEND(top, top[size], size);
++ left[-1] = top[size];
++ EXTEND(left, left[-1], 2 * size);
++ cand_up = 1;
++ cand_up_left = 1;
++ cand_left = 1;
++ } else { // No samples available
++#if PRED_C
++ left[-1] = (1 << (BIT_DEPTH - 1)) | (1 << (BIT_DEPTH - 1 + PW * 8));
++#else
++ left[-1] = (1 << (BIT_DEPTH - 1));
++#endif
++ EXTEND(top, left[-1], 2 * size);
++ EXTEND(left, left[-1], 2 * size);
++ }
++ }
+
-+::mc_exit_c_qn
-+::mc_exit_y_qn
-+ m_exit_qn
++ if (!cand_left)
++ EXTEND(left, left[size], size);
++ if (!cand_up_left) {
++ left[-1] = left[0];
++ }
++ if (!cand_up)
++ EXTEND(top, left[-1], size);
++ if (!cand_up_right)
++ EXTEND(top + size, top[size - 1], size);
+
++ top[-1] = left[-1];
+
++ // Filtering process
++ // Sand can only apply to chroma_format_idc == 1 so we don't need to
++ // worry about chroma smoothing for that case
++#if !PRED_C
++ if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0 || ctx_cfmt(s) == 3)) {
++ if (mode != INTRA_DC && size != 4){
++ int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
++ int min_dist_vert_hor = FFMIN(FFABS((int)(mode - 26U)),
++ FFABS((int)(mode - 10U)));
++ if (min_dist_vert_hor > intra_hor_ver_dist_thresh[log2_size - 3]) {
++ int threshold = 1 << (BIT_DEPTH - 5);
++ if (s->ps.sps->sps_strong_intra_smoothing_enable_flag && c_idx == 0 &&
++ log2_size == 5 &&
++ FFABS(top[-1] + top[63] - 2 * top[31]) < threshold &&
++ FFABS(left[-1] + left[63] - 2 * left[31]) < threshold) {
++ // We can't just overwrite values in top because it could be
++ // a pointer into src
++ filtered_top[-1] = top[-1];
++ filtered_top[63] = top[63];
++ for (i = 0; i < 63; i++)
++ filtered_top[i] = ((64 - (i + 1)) * top[-1] +
++ (i + 1) * top[63] + 32) >> 6;
++ for (i = 0; i < 63; i++)
++ left[i] = ((64 - (i + 1)) * left[-1] +
++ (i + 1) * left[63] + 32) >> 6;
++ top = filtered_top;
++ } else {
++ filtered_left[2 * size - 1] = left[2 * size - 1];
++ filtered_top[2 * size - 1] = top[2 * size - 1];
++ for (i = 2 * size - 2; i >= 0; i--)
++ filtered_left[i] = (left[i + 1] + 2 * left[i] +
++ left[i - 1] + 2) >> 2;
++ filtered_top[-1] =
++ filtered_left[-1] = (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
++ for (i = 2 * size - 2; i >= 0; i--)
++ filtered_top[i] = (top[i + 1] + 2 * top[i] +
++ top[i - 1] + 2) >> 2;
++ left = filtered_left;
++ top = filtered_top;
++ }
++ }
++ }
++ }
+
-+# mc_interrupt_exit12()
++ switch (mode) {
++ case INTRA_PLANAR:
++ s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
++ (uint8_t *)left, stride);
++ break;
++ case INTRA_DC:
++ s->hpc.pred_dc((uint8_t *)src, (uint8_t *)top,
++ (uint8_t *)left, stride, log2_size, c_idx);
++ break;
++ default:
++ s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
++ (uint8_t *)left, stride, c_idx,
++ mode);
++ break;
++ }
++#else
++ switch (mode) {
++ case INTRA_PLANAR:
++ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
++ (uint8_t *)left, stride);
++ break;
++ case INTRA_DC:
++ s->hpc.pred_dc_c((uint8_t *)src, (uint8_t *)top,
++ (uint8_t *)left, stride, log2_size, c_idx);
++ break;
++ default:
++ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top,
++ (uint8_t *)left, stride, c_idx,
++ mode);
++ break;
++ }
+
-+.macro m_exit_q0
-+ m_exit_drain
-+ sacq -, 12
-+ nop ; nop ; thrend
-+ mov interrupt, 1
-+ nop
-+# >>> thrend <<<
-+.endm
++#if DUMP_PRED
++ printf("U pred @ %d, %d: mode=%d\n", x, y, mode);
++ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size);
++ printf("V pred @ %d, %d: mode=%d\n", x, y, mode);
++ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size);
++#endif
++#endif
++}
+
-+::mc_exit_c_q0
-+::mc_exit_y_q0
-+ m_exit_q0
++#define INTRA_PRED(size) \
++static void FUNC(intra_pred_ ## size)(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx) \
++{ \
++ FUNC(intra_pred)(s, lc, x0, y0, size, c_idx); \
++}
+
-+# LUMA CODE
++INTRA_PRED(2)
++INTRA_PRED(3)
++INTRA_PRED(4)
++INTRA_PRED(5)
+
-+# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
-+# For P frames we make the second x,y coordinates offset by +8
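-+# (i.e. for P the two TMUs fetch two halves of the same reference block -
-+# (x, y) and (x+8, y) - so P and B take the same code path at full rate)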
++#undef INTRA_PRED
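++// (FUNC() pastes in the template bit depth - and, for the PRED_C build, a
++// chroma suffix - so INTRA_PRED(5) above yields e.g. intra_pred_5_8(), all
++// sizes sharing the single av_always_inline intra_pred() body.)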
+
++#if !PRED_C
++static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top,
++ const uint8_t *_left, ptrdiff_t stride,
++ int trafo_size)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ const pixel *top = (const pixel *)_top;
++ const pixel *left = (const pixel *)_left;
++ int size = 1 << trafo_size;
++ for (y = 0; y < size; y++)
++ for (x = 0; x < size; x++)
++ POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] +
++ (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1);
++}
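++// (Planar is a bilinear blend of the four edge arrays: for an 8x8 block,
++// trafo_size == 3, the corner POS(0, 0) works out as
++// (7*left[0] + top[8] + 7*top[0] + left[8] + 8) >> 4.)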
++#else
++static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top,
++ const uint8_t * _left, ptrdiff_t stride,
++ int trafo_size)
++{
++ int x, y;
++ int size = 1 << trafo_size;
++ c_dst_ptr_t src = (c_dst_ptr_t)_src;
++ const c_src_ptr_t top = (c_src_ptr_t)_top;
++ const c_src_ptr_t left = (c_src_ptr_t)_left;
+
-+################################################################################
-+# mc_setup
-+#
-+# typedef struct qpu_mc_pred_y_s_s {
-+# qpu_mc_src_t next_src1;
-+# qpu_mc_src_t next_src2;
-+# uint16_t pic_h;
-+# uint16_t pic_w;
-+# uint32_t stride2;
-+# uint32_t stride1;
-+# uint32_t wdenom;
-+# uint32_t next_fn;
-+# } qpu_mc_pred_y_s_t;
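-+#
-+# Each 'unif' read in the setup below consumes the next word of this
-+# struct in order: next_src1 x_y/base, next_src2 x_y/base, width_height,
-+# stride2, stride1, wdenom and finally next_fn.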
++ for (y = 0; y < size; y++, src += stride)
++ {
++ for (x = 0; x < size; x++)
++ {
++ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] +
++ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1);
++ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] +
++ (size - 1 - y) * top[x][1] + (y + 1) * left[size][1] + size) >> (trafo_size + 1);
++ }
++ }
++}
++#endif
+
-+.macro m_setup_y, v_bit_depth
++#define PRED_PLANAR(size)\
++static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \
++ const uint8_t *left, ptrdiff_t stride) \
++{ \
++ FUNC(pred_planar)(src, top, left, stride, size + 2); \
++}
+
-+# Cannot use mul24 on x as x might be -ve, so must use shift
-+.if v_bit_depth <= 8
-+.set v_x_shift, 0
-+.set v_pmask, 0xff
-+.set v_blk_height, Y_BLK_HEIGHT_8
-+.else
-+.set v_x_shift, 1
-+.set v_pmask, 0xffff
-+.set v_blk_height, Y_BLK_HEIGHT_16
-+.endif
++PRED_PLANAR(0)
++PRED_PLANAR(1)
++PRED_PLANAR(2)
++PRED_PLANAR(3)
+
++#undef PRED_PLANAR
+
-+ # Need to save these because we need to know the frame dimensions before computing texture coordinates
-+ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y
-+ mov ra9, unif # ref_y_base
-+ mov ra1, unif # x2_y2
-+ mov ra11, unif # ref_y2_base
++#if !PRED_C
++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
++ const uint8_t *_left,
++ ptrdiff_t stride, int log2_size, int c_idx)
++{
++ int i, j, x, y;
++ int size = (1 << log2_size);
++ pixel *src = (pixel *)_src;
++ const pixel *top = (const pixel *)_top;
++ const pixel *left = (const pixel *)_left;
++ int dc = size;
++ pixel4 a;
++ for (i = 0; i < size; i++)
++ dc += left[i] + top[i];
+
-+# load constants
-+ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
-+ shl rb_ef, r0, i_shift30
++ dc >>= log2_size + 1;
+
++ a = PIXEL_SPLAT_X4(dc);
+
-+ mov ra_kff100100, 0xff100100
-+ mov rb_pmask, v_pmask
-+ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++ for (i = 0; i < size; i++)
++ for (j = 0; j < size; j+=4)
++ AV_WN4P(&POS(j, i), a);
++
++ if (c_idx == 0 && size < 32) {
++ POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2;
++ for (x = 1; x < size; x++)
++ POS(x, 0) = (top[x] + 3 * dc + 2) >> 2;
++ for (y = 1; y < size; y++)
++ POS(0, y) = (left[y] + 3 * dc + 2) >> 2;
++ }
++}
++#else
++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
++ const uint8_t *_left,
++ ptrdiff_t stride, int log2_size, int c_idx)
++{
++ unsigned int i, j;
++ const unsigned int size = (1 << log2_size);
++ c_dst_ptr_t src = (c_dst_ptr_t)_src;
++ const c_src_ptr_t top = (c_src_ptr_t)_top;
++ const c_src_ptr_t left = (c_src_ptr_t)_left;
++ unsigned int dc0 = size;
++ unsigned int dc1 = size;
+
-+# Compute part of VPM to use
++ for (i = 0; i < size; i++)
++ {
++ dc0 += left[i][0] + top[i][0];
++ dc1 += left[i][1] + top[i][1];
++ }
+
-+# Read image dimensions
-+ mov ra3, unif # width_height
-+ mov rb_xpitch, unif # stride2
-+.if v_x_shift == 0
-+ sub rb_max_x, ra3.16b, 1
-+.else
-+ sub r0, ra3.16b, 1
-+ shl rb_max_x, r0, v_x_shift
-+.endif
-+ sub rb_max_y, ra3.16a, 1
-+ mov rb_pitch, unif # stride1
++ dc0 >>= log2_size + 1;
++ dc1 >>= log2_size + 1;
+
-+# get destination pitch
-+ mov r1, vdw_setup_1(0)
-+ or rb_dma1_base, r1, rb_pitch
++ for (i = 0; i < size; i++, src += stride)
++ {
++ for (j = 0; j < size; ++j)
++ {
++ src[j][0] = dc0;
++ src[j][1] = dc1;
+
-+# Compute base address for first and second access
-+ mov r3, elem_num
-+ add r0, ra0.16b, r3 # Load x + elem_num
-+.if v_x_shift != 0
-+ shl r0, r0, v_x_shift
-+.endif
-+ max r0, r0, 0
-+ min r0, r0, rb_max_x
-+ shl ra_xshift_next, r0, 3 # Compute shifts
++ }
++ }
++}
++#endif
+
-+# X is byte offset - we can only load words - mask
++#ifndef ANGLE_CONSTS
++#define ANGLE_CONSTS
++static const int intra_pred_angle[] = {
++ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32,
++ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
++};
++static const int inv_angle[] = {
++ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482,
++ -630, -910, -1638, -4096
++};
++#endif
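++// (For the negative angles, inv_angle[mode - 11] == -round(8192 / angle):
++// e.g. -4096 for angle -2 and -256 for angle -32. It projects side
++// reference samples onto the main reference array when angle < 0.)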
+
-+ and r0, r0, -4 ; v8subs r2, r2, r2
-+ sub r2, r2, rb_pitch
-+ and r1, r0, r2
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 # Add stripe offsets
-+ add ra_base, ra9, r0
++#if !PRED_C
++static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
++ const uint8_t *_top,
++ const uint8_t *_left,
++ ptrdiff_t stride, int c_idx,
++ int mode, int size)
++{
++ int x, y;
++ pixel *src = (pixel *)_src;
++ const pixel *top = (const pixel *)_top;
++ const pixel *left = (const pixel *)_left;
+
-+ # r3 still contains elem_num
-+ add r0, ra1.16b, r3 # Load x
-+.if v_x_shift != 0
-+ shl r0, r0, v_x_shift
-+.endif
-+ max r0, r0, 0
-+ min r0, r0, rb_max_x
-+ shl rb_xshift2_next, r0, 3 # Compute shifts
++ int angle = intra_pred_angle[mode - 2];
++ pixel ref_array[3 * MAX_TB_SIZE + 4];
++ pixel *ref_tmp = ref_array + size;
++ const pixel *ref;
++ int last = (size * angle) >> 5;
+
-+ # r2 still contains mask
-+ and r0, r0, -4
-+ and r1, r0, r2
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 # Add stripe offsets
-+ add ra_base2, ra11, r0
++ if (mode >= 18) {
++ ref = top - 1;
++ if (angle < 0 && last < -1) {
++ for (x = 0; x <= size; x += 4)
++ AV_WN4P(&ref_tmp[x], AV_RN4P(&top[x - 1]));
++ for (x = last; x <= -1; x++)
++ ref_tmp[x] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
++ ref = ref_tmp;
++ }
+
-+# Do preloads
-+ nop ; mov r0, ra0.16a # ; r0 = y
-+ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2
++ for (y = 0; y < size; y++) {
++ int idx = ((y + 1) * angle) >> 5;
++ int fact = ((y + 1) * angle) & 31;
++ if (fact) {
++ for (x = 0; x < size; x += 4) {
++ POS(x , y) = ((32 - fact) * ref[x + idx + 1] +
++ fact * ref[x + idx + 2] + 16) >> 5;
++ POS(x + 1, y) = ((32 - fact) * ref[x + 1 + idx + 1] +
++ fact * ref[x + 1 + idx + 2] + 16) >> 5;
++ POS(x + 2, y) = ((32 - fact) * ref[x + 2 + idx + 1] +
++ fact * ref[x + 2 + idx + 2] + 16) >> 5;
++ POS(x + 3, y) = ((32 - fact) * ref[x + 3 + idx + 1] +
++ fact * ref[x + 3 + idx + 2] + 16) >> 5;
++ }
++ } else {
++ for (x = 0; x < size; x += 4)
++ AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1]));
++ }
++ }
++ if (mode == 26 && c_idx == 0 && size < 32) {
++ for (y = 0; y < size; y++)
++ POS(0, y) = av_clip_pixel(top[0] + ((left[y] - left[-1]) >> 1));
++ }
++ } else {
++ ref = left - 1;
++ if (angle < 0 && last < -1) {
++ for (x = 0; x <= size; x += 4)
++ AV_WN4P(&ref_tmp[x], AV_RN4P(&left[x - 1]));
++ for (x = last; x <= -1; x++)
++ ref_tmp[x] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
++ ref = ref_tmp;
++ }
+
-+:1
-+ sub.setf r3, r3, 1
-+ max r1, r0, 0
-+ min r1, r1, rb_max_y
-+ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
-+ add t0s, ra_base, r1 ; mov ra_y, r0
++ for (x = 0; x < size; x++) {
++ int idx = ((x + 1) * angle) >> 5;
++ int fact = ((x + 1) * angle) & 31;
++ if (fact) {
++ for (y = 0; y < size; y++) {
++ POS(x, y) = ((32 - fact) * ref[y + idx + 1] +
++ fact * ref[y + idx + 2] + 16) >> 5;
++ }
++ } else {
++ for (y = 0; y < size; y++)
++ POS(x, y) = ref[y + idx + 1];
++ }
++ }
++ if (mode == 10 && c_idx == 0 && size < 32) {
++ for (x = 0; x < size; x += 4) {
++ POS(x, 0) = av_clip_pixel(left[0] + ((top[x ] - top[-1]) >> 1));
++ POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - top[-1]) >> 1));
++ POS(x + 2, 0) = av_clip_pixel(left[0] + ((top[x + 2] - top[-1]) >> 1));
++ POS(x + 3, 0) = av_clip_pixel(left[0] + ((top[x + 3] - top[-1]) >> 1));
++ }
++ }
++ }
++}
++#else
++static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
++ const uint8_t *_top,
++ const uint8_t *_left,
++ ptrdiff_t stride, int c_idx,
++ int mode, int size)
++{
++ int x, y;
++ c_dst_ptr_t src = (c_dst_ptr_t)_src;
++ c_src_ptr_t top = (c_src_ptr_t)_top;
++ c_src_ptr_t left = (c_src_ptr_t)_left;
+
-+ max r1, r2, 0
-+ brr.anynz -, r:1b
-+ min r1, r1, rb_max_y
-+ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
-+ add t1s, ra_base2, r1 ; mov ra_y2, r2
-+# >>> .anynz 1b
++ const int angle = intra_pred_angle[mode - 2];
++ cpel ref_array[3 * MAX_TB_SIZE + 4][2];
++ c_dst_ptr_t ref_tmp = ref_array + size;
++ c_src_ptr_t ref;
++ const int last = (size * angle) >> 5;
+
-+ add rb_wt_den_p15, unif, 23 - v_bit_depth # weight denom
++ if (mode >= 18) {
++ ref = top - 1;
++ if (angle < 0 && last < -1) {
++ memcpy(ref_tmp, top - 1, (size + 1) * 2 * PW);
++ for (x = last; x <= -1; x++)
++ {
++ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
++ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
++ }
++ ref = (c_src_ptr_t)ref_tmp;
++ }
+
-+ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
++ for (y = 0; y < size; y++, src += stride) {
++ const int idx = ((y + 1) * angle) >> 5;
++ const int fact = ((y + 1) * angle) & 31;
++ if (fact) {
++ for (x = 0; x < size; ++x) {
++ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] +
++ fact * ref[x + idx + 2][0] + 16) >> 5;
++ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] +
++ fact * ref[x + idx + 2][1] + 16) >> 5;
++ }
++ } else {
++ memcpy(src, ref + idx + 1, size * 2 * PW);
++ }
++ }
++ } else {
++ ref = left - 1;
++ if (angle < 0 && last < -1) {
++ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW);
++ for (x = last; x <= -1; x++)
++ {
++ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
++ ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
++ }
++ ref = (c_src_ptr_t)ref_tmp;
++ }
+
-+ mov ra_link, unif # Next fn
++ for (x = 0; x < size; x++, src++) {
++ const int idx = ((x + 1) * angle) >> 5;
++ const int fact = ((x + 1) * angle) & 31;
++ if (fact) {
++ for (y = 0; y < size; y++) {
++ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] +
++ fact * ref[y + idx + 2][0] + 16) >> 5;
++ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] +
++ fact * ref[y + idx + 2][1] + 16) >> 5;
++ }
++ } else {
++ for (y = 0; y < size; y++)
++ {
++ src[y * stride][0] = ref[y + idx + 1][0];
++ src[y * stride][1] = ref[y + idx + 1][1];
++ }
++ }
++ }
++ }
++}
++#endif
++
++static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top,
++ const uint8_t *left,
++ ptrdiff_t stride, int c_idx, int mode)
++{
++ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 2);
++}
++
++static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top,
++ const uint8_t *left,
++ ptrdiff_t stride, int c_idx, int mode)
++{
++ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 3);
++}
++
++static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top,
++ const uint8_t *left,
++ ptrdiff_t stride, int c_idx, int mode)
++{
++ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 4);
++}
+
-+# touch vertical context to keep simulator happy
-+ mov ra8, 0 ; mov rb8, 0
-+ bra -, ra_link
-+ mov ra9, 0 ; mov rb9, 0
-+ mov ra10, 0 ; mov rb10, 0
-+ mov ra11, 0 ; mov rb11, 0
-+# >>> ra_link
-+.endm
++static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top,
++ const uint8_t *left,
++ ptrdiff_t stride, int c_idx, int mode)
++{
++ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 5);
++}
+
-+::mc_setup_y_q0
-+ m_setup_q0
-+::mc_setup_y_qn
-+ m_setup_y 8
++#undef cpel
++#undef c_src_ptr_t
++#undef c_dst_ptr_t
+
-+################################################################################
-+#
-+# Start of per-block setup code
-+# P and B blocks share the same setup code to save on Icache space
++#undef EXTEND_LEFT_CIP
++#undef EXTEND_RIGHT_CIP
++#undef EXTEND_UP_CIP
++#undef EXTEND_DOWN_CIP
++#undef IS_INTRA
++#undef MVF_PU
++#undef MVF
++#undef PU
++#undef EXTEND
++#undef MIN_TB_ADDR_ZS
++#undef POS
++#undef PW
+
-+# luma_setup_delay3 done in delay slots of branch that got us here
++#ifndef INCLUDED_ONCE
++#define INCLUDED_ONCE
++#endif
+
-+# get base addresses and per-channel shifts for *next* invocation
-+# per-channel shifts were calculated on the *previous* invocation
+diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
+new file mode 100644
+index 0000000000..c16d9931bd
+--- /dev/null
++++ b/libavcodec/rpi_mailbox.c
+@@ -0,0 +1,145 @@
++/*
++Copyright (c) 2012, Broadcom Europe Ltd.
++All rights reserved.
+
-+# 1st 3 instructions of per_block-setup in branch delay
-+#
-+# typedef struct qpu_mc_pred_y_p_s {
-+# qpu_mc_src_t next_src1;
-+# qpu_mc_src_t next_src2;
-+# uint16_t h;
-+# uint16_t w;
-+# uint32_t mymx21;
-+# uint32_t wo1;
-+# uint32_t wo2;
-+# uint32_t dst_addr;
-+# uint32_t next_fn;
-+# } qpu_mc_pred_y_p_t;
-+#
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
+
-+.macro m_luma_setup, v_bit_depth
-+# Hack - QASM may well have label pasting but I have no idea how...
-+.if v_bit_depth == 8
-+ brr ra_link, r:per_block_setup_8
-+.elif v_bit_depth == 10
-+ brr ra_link, r:per_block_setup_10
-+.endif
-+ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack??
-+ add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0
-+ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
-+.endm
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
+
-+.macro m_per_block_setup, v_bit_depth
++#include <stdio.h>
++#include <string.h>
++#include <stdlib.h>
++#include <fcntl.h>
++#include <unistd.h>
++#include <assert.h>
++#include <stdint.h>
++#include <sys/mman.h>
+
-+.if v_bit_depth <= 8
-+.set v_x_shift, 0
-+.set v_x_mul, 1
-+# Shifts to get width & height in the right place in rb_dma0
-+.set v_dma_h_shift, 7
-+.set v_dma_wh_shift, i_shift16
-+.else
-+.set v_x_shift, 1
-+.set v_x_mul, 2
-+# Shifts to get width & height in the right place in rb_dma0
-+.set v_dma_h_shift, 8
-+.set v_dma_wh_shift, 15
-+.endif
++#include <sys/ioctl.h>
+
-+.if v_x_shift != 0
-+ shl r0, r0, v_x_shift
-+.endif
-+ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
-+ min r0, r0, rb_max_x
++#define MAJOR_NUM 100
++#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
++#define DEVICE_FILE_NAME "/dev/vcio"
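++// (/dev/vcio is the firmware mailbox property channel; 100 is its
++// historical char-device major, matching the mknod hint in mbox_open().)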
+
-+ shl ra_xshift_next, r0, 3 # Compute shifts
-+ and r0, r0, -4
-+ sub r2, r5, rb_pitch ; mov ra_base_next, unif # src1.base
-+ and r1, r0, r2 ; mov ra_y_next, ra0.16a
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y
-+ add ra_base_next, ra_base_next, r0 # [ra1 delay]
++#include "rpi_mailbox.h"
++//#include
+
-+ add r0, ra1.16b, r3 # Load x2
-+.if v_x_shift != 0
-+ shl r0, r0, v_x_shift
-+.endif
-+ max r0, r0, r5 ; mov ra_y2_next, ra1.16a
-+ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base
-+ shl rb_xshift2_next, r0, 3 # Compute shifts
-+ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height
-+ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes
-+ add rb_base2_next, rb_base2_next, r0
++/*
++ * use ioctl to send mbox property message
++ */
+
-+# get width,height of block (unif load above), r1 = width * pel_size
-+ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width)
-+ add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height
-+ add rb_lcount, r0, 7
-+ shl r0, r0, v_dma_h_shift
-+ add r0, r0, r1 # Combine width and height of destination area
-+ shl r0, r0, v_dma_wh_shift # Shift into bits 16 upwards of the vdw_setup0 register
-+ add rb_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets
++static int mbox_property(int file_desc, void *buf)
++{
++ int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
+
-+# get filter coefficients and discard unused B frame values
-+ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight
-+ shl ra8, r0, 3 ; mov r3, ra_k255
++ if (ret_val < 0) {
++ printf("ioctl_set_msg failed:%d\n", ret_val);
++ }
+
-+# Pack the 1st 4 filter coefs for H & V tightly
-+# Coeffs are all abs values here as that means mul24 works (no sign extend from .8)
++#ifdef DEBUG
++ unsigned *p = buf; int i; unsigned size = *(unsigned *)buf;
++ for (i=0; i<size/4; i++)
++ printf("%04x: 0x%08x\n", i*sizeof *p, p[i]);
++#endif
++ return ret_val;
++}
+
-+# >>> branch ra_link
+
++unsigned mbox_mem_lock(int file_desc, unsigned handle)
++{
++ int i=0;
++ unsigned p[32];
++ p[i++] = 0; // size (filled in below)
++ p[i++] = 0x00000000; // process request
++ p[i++] = 0x3000d; // tag id: lock memory
++ p[i++] = 4; // size of the buffer
++ p[i++] = 4; // size of the data
++ p[i++] = handle;
++ p[i++] = 0x00000000; // end tag
++ p[0] = i*sizeof *p; // actual size
++
++ mbox_property(file_desc, p);
++ return p[5];
++}
++
++unsigned mbox_mem_unlock(int file_desc, unsigned handle)
++{
++ int i=0;
++ unsigned p[32];
++ p[i++] = 0; // size (filled in below)
++ p[i++] = 0x00000000; // process request
++ p[i++] = 0x3000e; // tag id: unlock memory
++ p[i++] = 4; // size of the buffer
++ p[i++] = 4; // size of the data
++ p[i++] = handle;
++ p[i++] = 0x00000000; // end tag
++ p[0] = i*sizeof *p; // actual size
++
++ mbox_property(file_desc, p);
++ return p[5];
++}
++
++#define GET_VCIMAGE_PARAMS 0x30044 // tag id (value assumed)
++
++int mbox_get_image_params(int fd, VC_IMAGE_T * img)
++{
++ uint32_t buf[sizeof(*img) / sizeof(uint32_t) + 32];
++ uint32_t * p = buf;
++ void * rimg;
++ int rv;
++
++ *p++ = 0; // size (filled in below)
++ *p++ = 0; // process request
++ *p++ = GET_VCIMAGE_PARAMS;
++ *p++ = sizeof(*img); // buffer size
++ *p++ = sizeof(*img); // data size
++ rimg = p;
++ memcpy(p, img, sizeof(*img));
++ p += sizeof(*img) / sizeof(*p);
++ *p++ = 0; // end tag
++ buf[0] = (p - buf) * sizeof(*p);
++
++ rv = mbox_property(fd, buf);
++ memcpy(img, rimg, sizeof(*img));
+
-+# r5 = 0
-+# ra_wt_mul_l1 = weight L1
-+# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred)
-+# rb_wt_off = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb_wt_den_p15 - 1)
-+# rb_wt_den_p15 = weight denom + 6 + 9
-+# rb_wt_mul_l0 = weight L0
-+.endm
++ return rv;
++}
+
-+:per_block_setup_8
-+ m_per_block_setup 8
++int mbox_open() {
++ int file_desc;
+
++ // open a char device file used for communicating with kernel mbox driver
++ file_desc = open(DEVICE_FILE_NAME, 0);
++ if (file_desc < 0) {
++ printf("Can't open device file: %s\n", DEVICE_FILE_NAME);
++ printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM);
++ }
++ return file_desc;
++}
+
++void mbox_close(int file_desc) {
++ close(file_desc);
++}
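++
++// Typical call sequence (illustrative sketch - error handling elided;
++// vc_handle would come from vcsm_vc_hdl_from_hdl(), as in rpi_qpu.c):
++//
++// int mb = mbox_open();
++// unsigned bus_addr = mbox_mem_lock(mb, vc_handle); // pin + bus address
++// /* ... hand bus_addr to the VPU/QPU ... */
++// mbox_mem_unlock(mb, vc_handle);
++// mbox_close(mb);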
+
-+################################################################################
-+# mc_filter(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
-+# In a P block, y2_x2 should be y_x+8
-+# At this point we have already issued two pairs of texture requests for the current block
+diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
+new file mode 100644
+index 0000000000..b3168788d2
+--- /dev/null
++++ b/libavcodec/rpi_mailbox.h
+@@ -0,0 +1,58 @@
++#ifndef RPI_MAILBOX_H
++#define RPI_MAILBOX_H
+
-+.macro m_filter_y_pxx, v_bit_depth
-+ m_luma_setup v_bit_depth
++/* The image structure. */
++typedef struct vc_image_extra_uv_s {
++ void *u, *v;
++ int vpitch;
++} VC_IMAGE_EXTRA_UV_T;
+
-+ shl ra_wt_mul_l0, ra_wt_mul_l0, 1
++typedef union {
++ VC_IMAGE_EXTRA_UV_T uv;
++// VC_IMAGE_EXTRA_RGBA_T rgba;
++// VC_IMAGE_EXTRA_PAL_T pal;
++// VC_IMAGE_EXTRA_TF_T tf;
++// VC_IMAGE_EXTRA_BAYER_T bayer;
++// VC_IMAGE_EXTRA_MSBAYER_T msbayer;
++// VC_IMAGE_EXTRA_CODEC_T codec;
++// VC_IMAGE_EXTRA_OPENGL_T opengl;
++} VC_IMAGE_EXTRA_T;
+
-+# r5 = 0 (loop count)
+
-+:1
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
++typedef struct VC_IMAGE_T {
++ unsigned short type; /* should restrict to 16 bits */
++ unsigned short info; /* format-specific info; zero for VC02 behaviour */
++ unsigned short width; /* width in pixels */
++ unsigned short height; /* height in pixels */
++ int pitch; /* pitch of image_data array in bytes */
++ int size; /* number of bytes available in image_data array */
++ void *image_data; /* pixel data */
++ VC_IMAGE_EXTRA_T extra; /* extra data like palette pointer */
++ void *metadata; /* metadata header for the image */
++ void *pool_object; /* nonNULL if image was allocated from a vc_pool */
++ int mem_handle; /* the mem handle for relocatable memory storage */
++ int metadata_size; /* size of metadata of each channel in bytes */
++ int channel_offset; /* offset of consecutive channels in bytes */
++ uint32_t video_timestamp;/* 90000 Hz RTP times domain - derived from audio timestamp */
++ uint8_t num_channels; /* number of channels (2 for stereo) */
++ uint8_t current_channel;/* the channel this header is currently pointing to */
++ uint8_t linked_multichann_flag;/* Indicates the header has the linked-multichannel structure*/
++ uint8_t is_channel_linked; /* Tracks whether the above structure is being used to link the header
++ into a linked-multichannel image */
++ uint8_t channel_index; /* index of the channel this header represents while
++ it is being linked. */
++ uint8_t _dummy[3]; /* pad struct to 64 bytes */
++} VC_IMAGE_T;
+
-+# N.B. Whilst y == y2 as far as this loop is concerned we will start
-+# the grab for the next block before we finish with this block and that
-+# might be B where y != y2 so we must do full processing on both y and y2
++typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1];
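++// (The typedef above is a compile-time size check: the array size is 1 when
++// sizeof(VC_IMAGE_T) == 64 and negative (an error) otherwise - a pre-C11
++// stand-in for static_assert.)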
+
-+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
-+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
-+ shr r0, r4, ra_xshift ; mov r3, rb_pitch
+
-+ max r2, ra_y, 0 # y
-+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
++extern int mbox_open(void);
++extern void mbox_close(int file_desc);
+
-+ max r2, ra_y2, 0
-+ min r2, r2, rb_max_y ; mov ra7, ra8
-+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
-+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
++extern unsigned mbox_mem_lock(int file_desc, unsigned handle);
++extern unsigned mbox_mem_unlock(int file_desc, unsigned handle);
+
-+ add.setf -, rb_ef, rb_ef ; mov ra8, ra9
++int mbox_get_image_params(int fd, VC_IMAGE_T * img);
+
-+# apply horizontal filter
-+ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
-+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
-+ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
-+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
-+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
-+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++#endif
+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+new file mode 100644
+index 0000000000..3dfc35fa5c
+--- /dev/null
++++ b/libavcodec/rpi_qpu.c
+@@ -0,0 +1,939 @@
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++#include <stdint.h>
++#include <semaphore.h>
++#include "libavutil/avassert.h"
+
-+ sub.setf -, r5, 8 ; mov ra9, ra10
-+ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
-+ brr.anyn -, r:1b
-+ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
-+ mov ra10, ra11 ; mov rb10, rb11
-+ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
-+ # >>> .anyn 1b
++#include "config.h"
+
-+ # apply vertical filter and write to VPM
-+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
-+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
-+ add r1, r1, r0 ; mul24 r0, ra8, rb4
-+ add r1, r1, r0 ; mul24 r0, ra9, rb5
-+ sub r1, r1, r0 ; mul24 r0, ra10, rb6
-+ add r1, r1, r0 ; mul24 r0, ra11, rb7
-+ sub r1, r1, r0
-+# At this point r1 is a 22-bit signed quantity: 8 (original sample),
-+# +6, +6 (each pass), +1 (the passes can overflow slightly), +1 (sign)
-+# The top 8 bits have rubbish in them as mul24 is unsigned
-+# The low 6 bits need to be discarded before weighting
-+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish
-+ asr r1, r1, 14
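-+# (x256 == <<8: mul24 drops the rubbish in the old top 8 bits and lifts
-+# the sign bit towards the top, so the asr by 14 is a sign-extending >>6)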
-+ nop ; mul24 r1, r1, ra_wt_mul_l0
-+ add r1, r1, rb_wt_off ; mov r3, ra_blk_height # ; r3 = block height for outside loop
++#include <pthread.h>
++#include <time.h>
+
-+ shl r1, r1, 8 ; v8subs r0, ra_height, r3
-+ brr.anyn -, r:1b
-+ asr r1, r1, rb_wt_den_p15
-+ min r1, r1, ra_pmax ; mov -, vw_wait
-+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++#include <interface/vcsm/user-vcsm.h>
+
-+# >>> branch.anyn yloop
++#include "rpi_mailbox.h"
++#include "rpi_qpu.h"
++#include "rpi_hevc_shader.h"
++#include "rpi_hevc_transform8.h"
++#include "rpi_hevc_transform10.h"
++#include "libavutil/rpi_sand_fns.h"
+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height (currently always 16)
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
++#include "interface/vmcs_host/vc_vchi_gpuserv.h"
++#pragma GCC diagnostic pop
+
-+# If looping again then we consumed 16 height last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc rb_dma0, rb_lcount based on new segment height
++// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No)
++#define RPI_TRACE_TIME_VPU_QPU_WAIT 0
+
-+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++// Add profile flags to all QPU requests - generates output in "vcdbg log msg"
++// Beware this is expensive and will probably throw off all other timing by >10%
++#define RPI_TRACE_QPU_PROFILE_ALL 0
+
-+# DMA out
-+ bra.anyz -, ra_link
-+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
-+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
-+ shl r1, r1, i_shift23
-+# >>> .anyz ra_link
++// QPU "noflush" flags
++// a mixture of flushing & profiling
+
-+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+ add rb_lcount, rb_lcount, r0
-+ brr -, r:1b
-+ add rb_dma0, rb_dma0, r1
-+ add rb_dest, rb_dest, r2
-+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
-+# >>> 1b
-+.endm
++#define QPU_FLAGS_NO_FLUSH_VPU 1 // If unset VPU cache will be flushed
++#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2 // Clear & Enable detailed QPU profiling registers
++#define QPU_FLAGS_PROF_OUTPUT_COUNTS 4 // Print the results
++#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independant of the profiling
++#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed)
+
-+::mc_filter_y_pxx
-+ m_filter_y_pxx 8
++#define vcos_verify_ge0(x) ((x)>=0)
+
++// Size in 32bit words
++#define QPU_CODE_SIZE 4098
++#define VPU_CODE_SIZE 2048
+
-+################################################################################
++static const short rpi_transMatrix2even[32][16] = { // Even rows first
++{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
++{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90},
++{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89},
++{87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87},
++{83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83},
++{80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80},
++{75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75},
++{70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70},
++{64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64},
++{57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57},
++{50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50},
++{43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43},
++{36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36},
++{25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25},
++{18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18},
++{ 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9},
++// Odd rows
++{90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4},
++{90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
++{88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22},
++{85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31},
++{82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38},
++{78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46},
++{73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54},
++{67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61},
++{61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67},
++{54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73},
++{46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78},
++{38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82},
++{31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85},
++{22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88},
++{13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90},
++{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90}
++};
+
-+# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
-+# In a P block, only the first half of coefficients contain used information.
-+# At this point we have already issued two pairs of texture requests for the current block
-+# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time?
-+# Or possibly by taking advantage of symmetry?
++// Code/constants on GPU
++struct GPU
++{
++ unsigned int qpu_code[QPU_CODE_SIZE];
++ unsigned int vpu_code8[VPU_CODE_SIZE];
++ unsigned int vpu_code10[VPU_CODE_SIZE];
++ short transMatrix2even[16*16*2];
++};
++
++#define CFE_ENTS_PER_A 8
++// If we have a sliced frame 2048 wide @ 64 per slice then there are 32 slices
++// in a line, and we want to flush luma + chroma + a couple of extra entries,
++// so ~70 entries are needed - allow 128
++#define CFE_ENT_COUNT 128
++#define CFE_A_COUNT (CFE_ENT_COUNT / CFE_ENTS_PER_A)
+
-+.macro m_filter_y_bxx, v_bit_depth
-+ m_luma_setup v_bit_depth
++struct rpi_cache_flush_env_s {
++// unsigned int n;
++// struct vcsm_user_clean_invalid_s a[CFE_A_COUNT];
++ struct vcsm_user_clean_invalid2_s v;
++};
+
-+:1
-+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
-+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
-+ shr r0, r4, ra_xshift ; mov r3, rb_pitch
++#define WAIT_COUNT_MAX 16
+
-+ max r2, ra_y, 0 # y
-+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
++typedef struct trace_time_one_s
++{
++ int count;
++ int64_t start[WAIT_COUNT_MAX];
++ int64_t total[WAIT_COUNT_MAX];
++} trace_time_one_t;
+
-+ max r2, ra_y2, 0
-+ min r2, r2, rb_max_y ; mov ra7, ra8
-+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
-+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
++typedef struct trace_time_wait_s
++{
++ unsigned int jcount;
++ int64_t start0;
++ int64_t last_update;
++ trace_time_one_t active;
++ trace_time_one_t wait;
++} trace_time_wait_t;
+
-+ add.setf -, rb_ef, rb_ef ; mov ra8, ra9
++typedef struct vq_wait_s
++{
++ sem_t sem;
++ struct vq_wait_s * next;
++} vq_wait_t;
+
-+# apply horizontal filter
-+ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
-+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
-+ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
-+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
-+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
-+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++#define VQ_WAIT_POOL_SIZE 16
++typedef struct vq_wait_pool_s
++{
++ vq_wait_t * head;
++ vq_wait_t pool[VQ_WAIT_POOL_SIZE];
++} vq_wait_pool_t;
+
-+ sub.setf -, r5, 8 ; mov ra9, ra10
-+ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
-+ brr.anyn -, r:1b
-+ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
-+ mov ra10, ra11 ; mov rb10, rb11
-+ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
-+ # >>> .anyn 1b
++static void vq_wait_pool_init(vq_wait_pool_t * const pool);
++static void vq_wait_pool_deinit(vq_wait_pool_t * const pool);
+
-+ # apply vertical filter and write to VPM
-+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
-+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
-+ add r1, r1, r0 ; mul24 r0, ra8, rb4
-+ add r1, r1, r0 ; mul24 r0, ra9, rb5
-+ sub r1, r1, r0 ; mul24 r0, ra10, rb6
-+ add r1, r1, r0 ; mul24 r0, ra11, rb7
-+ sub r1, r1, r0 ; mov r2, rb_wt_off
-+# As with P-pred r1 is a 22-bit signed quantity in 32-bits
-+# Top 8 bits are bad - low 6 bits should be discarded
-+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
++typedef struct gpu_env_s
++{
++ int open_count;
++ int init_count;
++ int mb;
++ int vpu_i_cache_flushed;
++ GPU_MEM_PTR_T code_gm_ptr;
++ vq_wait_pool_t wait_pool;
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ trace_time_wait_t ttw;
++#endif
++} gpu_env_t;
+
-+ asr r1, r1, 14
-+ nop ; mul24 r0, r1, ra_wt_mul_l0
-+ add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0
++// Stop more than one thread trying to allocate memory or use the processing resources at once
++static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
++static gpu_env_t * gpu = NULL;
+
-+ add r1, r1, r0 ; mov r3, ra_blk_height
-+ shl r1, r1, 8 ; v8subs r0, ra_height, r3
-+ brr.anyn -, r:1b
-+ asr r1, r1, rb_wt_den_p15
-+ min r1, r1, ra_pmax ; mov -, vw_wait
-+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+# >>> branch.anyn 1b
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height (currently always 16)
++static int64_t ns_time(void)
++{
++ struct timespec ts;
++ clock_gettime(CLOCK_MONOTONIC, &ts);
++ return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec;
++}
+
-+# If looping again then we consumed 16 height last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc rb_dma0, rb_lcount based on new segment height
+
-+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000
+
-+# DMA out
-+ bra.anyz -, ra_link
-+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
-+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
-+ shl r1, r1, i_shift23
-+# >>> .anyz ra_link
++#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U)
++#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000)
++#define T_ARG(t) T_SEC(t), T_MS(t)
++#define T_FMT "%u.%03u"
+
-+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+ add rb_lcount, rb_lcount, r0
-+ brr -, r:1b
-+ add rb_dma0, rb_dma0, r1
-+ add rb_dest, rb_dest, r2
-+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
-+# >>> 1b
-+.endm
++static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix)
++{
++ // Update totals for levels that are still pending
++ for (int i = 0; i < tto->count; ++i) {
++ tto->total[i] += now - tto->start[i];
++ tto->start[i] = now;
++ }
+
-+::mc_filter_y_bxx
-+ m_filter_y_bxx 8
++ printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n",
++ prefix,
++ T_ARG(now - start0 - tto->total[0]),
++ T_ARG(tto->total[0]),
++ T_ARG(tto->total[1]),
++ T_ARG(tto->total[2]),
++ T_ARG(tto->total[3]));
++}
+
-+################################################################################
-+#
-+# typedef struct qpu_mc_pred_y_p00_s {
-+# qpu_mc_src_t next_src1;
-+# uint16_t h;
-+# uint16_t w;
-+# uint32_t wo1;
-+# uint32_t dst_addr;
-+# uint32_t next_fn;
-+# } qpu_mc_pred_y_p00_t;
+
-+.macro m_filter_y_p00, v_bit_depth
++static void tto_start(trace_time_one_t * const tto, const int64_t now)
++{
++ av_assert0(tto->count < WAIT_COUNT_MAX);
++ tto->start[tto->count++] = now;
++}
+
-+.if v_bit_depth <= 8
-+.set v_x_shift, 0
-+.set v_x_mul, 1
-+# Shifts to get width & height in the right place in rb_dma0
-+.set v_dma_h_shift, 7
-+.set v_dma_wh_shift, i_shift16
-+.else
-+.set v_x_shift, 1
-+.set v_x_mul, 2
-+# Shifts to get width & height in the right place in rb_dma0
-+.set v_dma_h_shift, 8
-+.set v_dma_wh_shift, 15
-+.endif
++static void tto_end(trace_time_one_t * const tto, const int64_t now)
++{
++ const int n = --tto->count;
++ av_assert0(n >= 0);
++ tto->total[n] += now - tto->start[n];
++}
+
-+ mov ra0, unif ; mov r3, elem_num # y_x
-+ mov ra_xshift, ra_xshift_next # [ra0 delay]
-+ add r0, ra0.16b, r3
-+.if v_x_shift != 0
-+ shl r0, r0, v_x_shift
-+.endif
++static void ttw_print(trace_time_wait_t * const ttw, const int64_t now)
++{
++ printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0));
++ tto_print(&ttw->active, now, ttw->start0, "Active");
++ tto_print(&ttw->wait, now, ttw->start0, " Wait");
++}
+
-+ max r0, r0, 0
-+ min r0, r0, rb_max_x
++#endif
+
-+ shl ra_xshift_next, r0, 3 # Compute shifts
-+ and r0, r0, -4 ; v8subs r2, r2, r2
-+ sub r2, r2, rb_pitch ; mov ra_base_next, unif # src1.base
-+ and r1, r0, r2 ; mov ra_y_next, ra0.16a
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 ; mov ra_width_height, unif # Add stripe offsets ; width_height
-+ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write
++// GPU memory alloc fns (internal)
+
-+# get width,height of block (unif load above)
-+# Compute vdw_setup1(dst_pitch-width)
-+ shl r1, ra_width, v_x_shift
-+ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
-+ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
-+ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
-+ add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif # Combine width and height of destination area ; weight_offset
-+ shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif # Shift into bits 16 upwards of the vdw_setup0 register ; dest addr
-+ add rb_dma0, r0, rb_dma0_base
++// GPU_MEM_PTR_T alloc fns
++static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
++ p->numbytes = (numbytes + 255) & ~255; // Round up to a multiple of 256
++ p->vcsm_handle = vcsm_malloc_cache(p->numbytes, VCSM_CACHE_TYPE_HOST | 0x80, (char *)"Video Frame" );
++ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
++ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
++ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
++ av_assert0(p->vcsm_handle);
++ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
++ av_assert0(p->vc_handle);
++ p->arm = vcsm_lock(p->vcsm_handle);
++ av_assert0(p->arm);
++ p->vc = mbox_mem_lock(mb, p->vc_handle);
++ av_assert0(p->vc);
++// printf("***** %s, %d\n", __func__, numbytes);
+
-+ shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 # Offset calc ; r5 = 0
-+ # For B l1 & L0 offsets should be identical so it doesn't matter which we use
-+ asr rb_wt_off, r0, 1 ; mov ra_link, unif # ; link
++ return 0;
++}
++
++static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
++ p->numbytes = numbytes;
++ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE | 0x80, (char *)"Video Frame" );
++ av_assert0(p->vcsm_handle);
++ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
++ av_assert0(p->vc_handle);
++ p->arm = vcsm_lock(p->vcsm_handle);
++ av_assert0(p->arm);
++ p->vc = mbox_mem_lock(mb, p->vc_handle);
++ av_assert0(p->vc);
++// printf("***** %s, %d\n", __func__, numbytes);
++ return 0;
++}
+
-+:1
-+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
-+ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
-+ shr r0, r4, ra_xshift ; mov r3, rb_pitch
++static void gpu_free_internal(const int mb, GPU_MEM_PTR_T * const p) {
++ mbox_mem_unlock(mb, p->vc_handle);
++ vcsm_unlock_ptr(p->arm);
++ vcsm_free(p->vcsm_handle);
++ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again
++// printf("***** %s\n", __func__);
++}
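++
++// (Each GPU_MEM_PTR_T holds two views of one buffer: p->arm, the ARM-side
++// mapping from vcsm_lock(), and p->vc, the VideoCore bus address from
++// mbox_mem_lock(); freeing must undo both, in the order above.)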
+
-+ max r2, ra_y, 0 # y
-+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
+
-+ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
-+ shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height
-+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++// GPU init, free, lock, unlock
+
-+ brr.anyn -, r:1b
-+ asr r1, r1, rb_wt_den_p15
-+ min r1, r1, ra_pmax ; mov -, vw_wait
-+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+# >>> branch.anyn 1b
++static void gpu_term(void)
++{
++ gpu_env_t * const ge = gpu;
+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height (currently always 16)
++ // We have to hope that eveything has terminated...
++ gpu = NULL;
+
-+# If looping again then we consumed 16 height last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc rb_dma0, rb_lcount based on new segment height
++ vc_gpuserv_deinit();
+
-+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++ gpu_free_internal(ge->mb, &ge->code_gm_ptr);
+
-+# DMA out
-+ bra.anyz -, ra_link
-+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
-+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
-+ shl r1, r1, i_shift23
-+# >>> .anyz ra_link
++ vcsm_exit();
+
-+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+ add rb_lcount, rb_lcount, r0
-+ brr -, r:1b
-+ add rb_dma0, rb_dma0, r1
-+ add rb_dest, rb_dest, r2
-+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
-+# >>> 1b
-+.endm
++ mbox_close(ge->mb);
+
-+::mc_filter_y_p00
-+ m_filter_y_p00 8
++ vq_wait_pool_deinit(&ge->wait_pool);
+
-+################################################################################
++ free(ge);
++}
+
-+.macro m_filter_y_b00, v_bit_depth
-+# luma setup does a fair bit more than we need, calculating filter coeffs
-+# that we will never use, but it saves I-cache to reuse it (also simple!)
-+ m_luma_setup v_bit_depth
+
-+# Fix up vals that were expecting a filter (somewhat icky)
-+ mov r0, 7
-+ sub rb_i_tmu, rb_i_tmu, r0
-+ sub rb_lcount, rb_lcount, r0
-+ mov r0, 8 ; mov r1, ra_wt_off_mul_l0
-+ shl rb_wt_off, rb_wt_off, r0
-+ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
++// Connect to QPU, returns 0 on success.
++static int gpu_init(gpu_env_t ** const gpu) {
++ volatile struct GPU* ptr;
++ gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t));
++ *gpu = NULL;
+
-+:1
-+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
-+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
-+ shr r0, r4, ra_xshift ; mov r3, rb_pitch
++ if (ge == NULL)
++ return -1;
+
-+ max r2, ra_y, 0 # y
-+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
++ if ((ge->mb = mbox_open()) < 0)
++ {
++ free(ge); // Don't leak the env on failure
++ return -1;
++ }
+
-+ max r2, ra_y2, 0
-+ min r2, r2, rb_max_y
-+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
-+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
-+ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
++ vq_wait_pool_init(&ge->wait_pool);
+
-+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
-+ add r1, r0, r1
-+ shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height
-+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++ vcsm_init();
+
-+ brr.anyn -, r:1b
-+ asr r1, r1, rb_wt_den_p15
-+ min r1, r1, ra_pmax ; mov -, vw_wait
-+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+# >>> branch.anyn 1b
++ gpu_malloc_uncached_internal(ge->mb, sizeof(struct GPU), &ge->code_gm_ptr);
++ ptr = (volatile struct GPU*)ge->code_gm_ptr.arm;
+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height (currently always 16)
++ // Zero everything so we have zeros between the code bits
++ memset((void *)ptr, 0, sizeof(*ptr));
+
-+# If looping again then we consumed 16 height last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc rb_dma0, rb_lcount based on new segment height
++ // Now copy over the QPU code into GPU memory
++ {
++ int num_bytes = (char *)mc_end - (char *)ff_hevc_rpi_shader;
++ av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
++ memcpy((void*)ptr->qpu_code, ff_hevc_rpi_shader, num_bytes);
++ }
++ // And the VPU code
++ {
++ int num_bytes = sizeof(rpi_hevc_transform8);
++ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
++ memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes);
++ }
++ {
++ int num_bytes = sizeof(rpi_hevc_transform10);
++ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
++ memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes);
++ }
++ // And the transform coefficients
++ memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
+
-+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++ *gpu = ge;
++ return 0;
++}
+
-+# DMA out
-+ bra.anyz -, ra_link
-+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
-+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
-+ shl r1, r1, i_shift23
-+# >>> .anyz ra_link
+
-+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+ add rb_lcount, rb_lcount, r0
-+ brr -, r:1b
-+ add rb_dma0, rb_dma0, r1
-+ add rb_dest, rb_dest, r2
-+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
-+# >>> 1b
-+.endm
+
-+::mc_filter_y_b00
-+ m_filter_y_b00 8
++static void gpu_unlock(void) {
++ pthread_mutex_unlock(&gpu_mutex);
++}
+
-+################################################################################
-+################################################################################
-+# 10 BIT
++// Make sure we have exclusive access to the mailbox. The gpu env must
++// already exist here - gpu_lock_ref below also creates it on first use.
++static gpu_env_t * gpu_lock(void) {
++ pthread_mutex_lock(&gpu_mutex);
+
-+::mc_setup_c10_q0
-+ m_setup_q0
-+::mc_setup_c10_qn
-+ m_setup_c 10
++ av_assert0(gpu != NULL);
++ return gpu;
++}
+
-+::mc_filter_c10_p
-+ m_filter_c_p 0, 10
++static gpu_env_t * gpu_lock_ref(void)
++{
++ pthread_mutex_lock(&gpu_mutex);
+
-+::mc_filter_c10_p_l1
-+ m_filter_c_p 1, 10
++ if (gpu == NULL) {
++ int rv = gpu_init(&gpu);
++ if (rv != 0) {
++ gpu_unlock();
++ return NULL;
++ }
++ }
+
++ ++gpu->open_count;
++ return gpu;
++}
+
-+::mc_filter_c10_b
-+ m_filter_c_b 10
++static void gpu_unlock_unref(gpu_env_t * const ge)
++{
++ if (--ge->open_count == 0)
++ gpu_term();
+
-+# Even if these fns are the same as for other bit depths we want our own copy
-+# to keep the code we are using in a single lump to avoid (direct map) cache
-+# thrashing
-+.set v_quads10, N_QPU_16 / 4
++ gpu_unlock();
++}
+
-+::mc_sync10_q0
-+ m_sync_q 0, v_quads10
-+::mc_sync10_q1
-+ m_sync_q 1, v_quads10
-+::mc_sync10_q2
-+ m_sync_q 2, v_quads10
-+::mc_sync10_q3
-+ m_sync_q 3, v_quads10
-+::mc_sync10_q4
-+ m_sync_q 4, v_quads10
-+::mc_sync10_q5
-+ m_sync_q 5, v_quads10
-+::mc_sync10_q6
-+ m_sync_q 6, v_quads10
-+::mc_sync10_q7
-+ m_sync_q 7, v_quads10
-+::mc_sync10_q8
-+ m_sync_q 8, v_quads10
-+::mc_sync10_q9
-+ m_sync_q 9, v_quads10
-+::mc_sync10_q10
-+ m_sync_q 10, v_quads10
-+::mc_sync10_q11
-+ m_sync_q 11, v_quads10
++static inline gpu_env_t * gpu_ptr(void)
++{
++ av_assert0(gpu != NULL);
++ return gpu;
++}
+
-+::mc_exit_y10_q0
-+::mc_exit_c10_q0
-+ m_exit_q0
++// Public gpu fns
+
-+::mc_exit_y10_qn
-+::mc_exit_c10_qn
-+ m_exit_qn
++// Allocate memory on GPU
++// Fills in structure containing ARM pointer, videocore handle, videocore memory address, numbytes
++// Returns 0 on success.
++// This allocates memory that will not be cached in ARM's data cache.
++// Therefore safe to use without data cache flushing.
++int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
++{
++ int r;
++ gpu_env_t * const ge = gpu_lock_ref();
++ if (ge == NULL)
++ return -1;
++ r = gpu_malloc_uncached_internal(ge->mb, numbytes, p);
++ gpu_unlock();
++ return r;
++}
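+
++// Example (illustrative only): a minimal allocation round trip. As the
++// mapping is uncached on the ARM side no cache maintenance is needed.
++//
++//   GPU_MEM_PTR_T gm;
++//   if (gpu_malloc_uncached(64 * 1024, &gm) == 0) {
++//     memset(gm.arm, 0, gm.numbytes); // CPU writes via the ARM mapping
++//     // ... pass gm.vc (the VC bus address of the same bytes) to GPU code ...
++//     gpu_free(&gm);
++//   }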
+
-+::mc_setup_y10_q0
-+ m_setup_q0
-+::mc_setup_y10_qn
-+ m_setup_y 10
++// This allocates data that will be
++// Cached in ARM L2
++// Uncached in VPU L2
++// Explicit cache maintenance (see the cache flush fns below) is therefore
++// needed around any GPU access.
++int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
++{
++ int r;
++ gpu_env_t * const ge = gpu_lock_ref();
++ if (ge == NULL)
++ return -1;
++ r = gpu_malloc_cached_internal(ge->mb, numbytes, p);
++ gpu_unlock();
++ return r;
++}
+
-+:per_block_setup_10
-+ m_per_block_setup 10
++void gpu_free(GPU_MEM_PTR_T * const p) {
++ gpu_env_t * const ge = gpu_lock();
++ gpu_free_internal(ge->mb, p);
++ gpu_unlock_unref(ge);
++}
+
-+::mc_filter_y10_pxx
-+ m_filter_y_pxx 10
++unsigned int vpu_get_fn(const unsigned int bit_depth) {
++ // Make sure that the gpu is initialized
++ av_assert0(gpu != NULL);
++ switch (bit_depth){
++ case 8:
++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8);
++ case 10:
++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10);
++ default:
++ av_assert0(0);
++ }
++ return 0;
++}
+
-+::mc_filter_y10_p00
-+ m_filter_y_p00 10
++unsigned int vpu_get_constants(void) {
++ av_assert0(gpu != NULL);
++ return gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even);
++}
+
-+::mc_filter_y10_bxx
-+ m_filter_y_bxx 10
++int gpu_get_mailbox(void)
++{
++ av_assert0(gpu);
++ return gpu->mb;
++}
+
-+::mc_filter_y10_b00
-+ m_filter_y_b00 10
++void gpu_ref(void)
++{
++ gpu_lock_ref();
++ gpu_unlock();
++}
+
++void gpu_unref(void)
++{
++ gpu_env_t * const ge = gpu_lock();
++ gpu_unlock_unref(ge);
++}
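+
++// Example (illustrative only): hold a ref for the lifetime of a decoder so
++// the GPU env is built once and torn down on the final unref.
++//
++//   gpu_ref();    // decoder init
++//   ...
++//   gpu_unref();  // decoder close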
+
++// ----------------------------------------------------------------------------
++//
++// Cache flush functions
+
-+::mc_end
-+# Do not add code here because mc_end must appear after all other code.
-diff --git a/libavcodec/rpi_shader_cmd.h b/libavcodec/rpi_shader_cmd.h
-new file mode 100644
-index 0000000000..9f8983da52
---- /dev/null
-+++ b/libavcodec/rpi_shader_cmd.h
-@@ -0,0 +1,128 @@
-+#ifndef RPI_SHADER_CMD_H
-+#define RPI_SHADER_CMD_H
++#define CACHE_EL_MAX 16
+
-+#pragma pack(push, 4)
++rpi_cache_flush_env_t * rpi_cache_flush_init(void)
++{
++ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t) +
++ sizeof(struct vcsm_user_clean_invalid2_block_s) * CACHE_EL_MAX);
++ if (rfe == NULL)
++ return NULL;
+
-+#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y
-+// If mixed then we are just confused and get a lot of warnings....
-+typedef const uint8_t * qpu_mc_src_addr_t;
-+typedef uint8_t * qpu_mc_dst_addr_t;
-+#else
-+typedef uint32_t qpu_mc_src_addr_t;
-+typedef uint32_t qpu_mc_dst_addr_t;
-+#endif
++ rfe->v.op_count = 0;
++ return rfe;
++}
+
-+typedef struct qpu_mc_src_s
++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe)
+{
-+ int16_t y;
-+ int16_t x;
-+ qpu_mc_src_addr_t base;
-+} qpu_mc_src_t;
-+
++ if (rfe != NULL)
++ free(rfe);
++}
+
-+typedef struct qpu_mc_pred_c_p_s {
-+ qpu_mc_src_t next_src;
-+ uint16_t h;
-+ uint16_t w;
-+ uint32_t coeffs_x;
-+ uint32_t coeffs_y;
-+ uint32_t wo_u;
-+ uint32_t wo_v;
-+ qpu_mc_dst_addr_t dst_addr_c;
-+ uint32_t next_fn;
-+} qpu_mc_pred_c_p_t;
++int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe)
++{
++ int rc = 0;
++ if (rfe->v.op_count != 0) {
++ if (vcsm_clean_invalid2(&rfe->v) != 0)
++ {
++ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid2 failed: errno=%d\n", errno);
++ rc = -1;
++ }
++ rfe->v.op_count = 0;
++ }
++ return rc;
++}
+
-+typedef struct qpu_mc_pred_c_b_s {
-+ qpu_mc_src_t next_src1;
-+ uint16_t h;
-+ uint16_t w;
-+ uint32_t coeffs_x1;
-+ uint32_t coeffs_y1;
-+ uint32_t weight_u1;
-+ uint32_t weight_v1;
-+ qpu_mc_src_t next_src2;
-+ uint32_t coeffs_x2;
-+ uint32_t coeffs_y2;
-+ uint32_t wo_u2;
-+ uint32_t wo_v2;
-+ qpu_mc_dst_addr_t dst_addr_c;
-+ uint32_t next_fn;
-+} qpu_mc_pred_c_b_t;
++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe)
++{
++ int rc = rpi_cache_flush_execute(rfe);
+
-+typedef struct qpu_mc_pred_c_s_s {
-+ qpu_mc_src_t next_src1;
-+ uint32_t pic_cw; // C Width (== Y width / 2)
-+ uint32_t pic_ch; // C Height (== Y Height / 2)
-+ uint32_t stride2;
-+ uint32_t stride1;
-+ uint32_t wdenom;
-+ qpu_mc_src_t next_src2;
-+ uint32_t next_fn;
-+} qpu_mc_pred_c_s_t;
++ free(rfe);
++ return rc;
++}
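+
++// Example (illustrative only): batch maintenance on an ARM-cached buffer
++// ("gm" from gpu_malloc_cached) around a GPU pass; one vcsm call covers all
++// the accumulated ranges.
++//
++//   rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
++//   if (rfe != NULL) {
++//     rpi_cache_flush_add_gm_range(rfe, &gm, RPI_CACHE_FLUSH_MODE_WRITEBACK, 0, gm.numbytes);
++//     rpi_cache_flush_finish(rfe); // executes the ops then frees rfe
++//   }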
+
-+typedef struct qpu_mc_pred_c_s {
-+ union {
-+ qpu_mc_pred_c_p_t p;
-+ qpu_mc_pred_c_b_t b;
-+ qpu_mc_pred_c_s_t s;
-+ };
-+} qpu_mc_pred_c_t;
++inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride)
++{
++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
+
++ av_assert0(rfe->v.op_count <= CACHE_EL_MAX);
+
-+typedef struct qpu_mc_pred_y_p_s {
-+ qpu_mc_src_t next_src1;
-+ qpu_mc_src_t next_src2;
-+ uint16_t h;
-+ uint16_t w;
-+ uint32_t mymx21;
-+ uint32_t wo1;
-+ uint32_t wo2;
-+ qpu_mc_dst_addr_t dst_addr;
-+ uint32_t next_fn;
-+} qpu_mc_pred_y_p_t;
++ b->invalidate_mode = mode;
++ b->block_count = blocks;
++ b->start_address = gm->arm + offset0;
++ b->block_size = block_size;
++ b->inter_block_stride = block_stride;
++}
+
-+typedef struct qpu_mc_pred_y_p00_s {
-+ qpu_mc_src_t next_src1;
-+ uint16_t h;
-+ uint16_t w;
-+ uint32_t wo1;
-+ qpu_mc_dst_addr_t dst_addr;
-+ uint32_t next_fn;
-+} qpu_mc_pred_y_p00_t;
++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
++ const unsigned int offset, const unsigned int size)
++{
++ // Deal with empty pointer trivially
++ if (gm == NULL || size == 0)
++ return;
+
-+typedef struct qpu_mc_pred_y_s_s {
-+ qpu_mc_src_t next_src1;
-+ qpu_mc_src_t next_src2;
-+ uint16_t pic_h;
-+ uint16_t pic_w;
-+ uint32_t stride2;
-+ uint32_t stride1;
-+ uint32_t wdenom;
-+ uint32_t next_fn;
-+} qpu_mc_pred_y_s_t;
++ av_assert0(offset <= gm->numbytes);
++ av_assert0(size <= gm->numbytes);
++ av_assert0(offset + size <= gm->numbytes);
+
-+// Only a useful structure in that it allows us to return something other than a void *
-+typedef struct qpu_mc_pred_y_s {
-+ union {
-+ qpu_mc_pred_y_p_t p;
-+ qpu_mc_pred_y_p00_t p00;
-+ qpu_mc_pred_y_s_t s;
-+ };
-+} qpu_mc_pred_y_t;
++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0);
++}
+
-+typedef union qpu_mc_pred_cmd_u {
-+ qpu_mc_pred_y_t y;
-+ qpu_mc_pred_c_t c;
-+ uint32_t data[1];
-+} qpu_mc_pred_cmd_t;
++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode)
++{
++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0);
++}
+
-+#define QPU_MC_PRED_N_Y8 12
-+#define QPU_MC_PRED_N_C8 12
+
-+#define QPU_MC_PRED_N_Y10 12
-+#define QPU_MC_PRED_N_C10 12
++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode)
++{
++#if !RPI_ONE_BUF
++#error Fixme! (NIF)
++#endif
++ if (gpu_is_buf1(frame)) {
++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode);
++ }
++ else
++ {
++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode);
++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode);
++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode);
++ }
++}
+
-+#pragma pack(pop)
++// Flush an area of a frame
++// Width, height, x0, y0 in luma pels
++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode,
++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
++ const unsigned int uv_shift, const int do_luma, const int do_chroma)
++{
++ const unsigned int y_offset = frame->linesize[0] * y0;
++ const unsigned int y_size = frame->linesize[0] * height;
++ // Round UV up/down to get everything
++ const unsigned int uv_rnd = (1U << uv_shift) >> 1;
++ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift);
++ const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset;
+
++#if 0
++ // *** frame->height is cropped height so not good
++ // As all unsigned they will also reject -ve
++ // Test individually as well as added to reject overflow
++ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped
++ av_assert0(n <= (unsigned int)frame->height);
++ av_assert0(start_line + n <= (unsigned int)frame->height);
+#endif
+
-diff --git a/libavcodec/rpi_shader_template.c b/libavcodec/rpi_shader_template.c
-new file mode 100644
-index 0000000000..2d763f54ef
---- /dev/null
-+++ b/libavcodec/rpi_shader_template.c
-@@ -0,0 +1,66 @@
-+#ifdef RPI
-+
-+#include "hevc.h"
-+#include "hevcdec.h"
-+#include "libavutil/rpi_sand_fns.h"
-+#include "rpi_shader_cmd.h"
-+#include "rpi_shader_template.h"
++ if (!gpu_is_buf1(frame))
++ {
++ if (do_luma) {
++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size);
++ }
++ if (do_chroma) {
++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size);
++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size);
++ }
++ }
++ else if (!av_rpi_is_sand_frame(frame))
++ {
++ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
++ if (do_luma) {
++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size);
++ }
++ if (do_chroma) {
++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size);
++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size);
++ }
++ }
++ else
++ {
++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
++ const unsigned int xshl = av_rpi_sand_frame_xshl(frame);
++ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1);
++ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C
++ av_assert0(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX);
+
-+typedef struct shader_track_s
-+{
-+ const union qpu_mc_pred_cmd_u *qpu_mc_curr;
-+ const struct qpu_mc_src_s *last_l0;
-+ const struct qpu_mc_src_s *last_l1;
-+ uint32_t width; // pic_width * PW
-+ uint32_t height;
-+ uint32_t stride2;
-+ uint32_t stride1;
-+ uint32_t wdenom;
-+} shader_track_t;
++ if (do_chroma)
++ {
++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
++ b->invalidate_mode = mode;
++ b->block_count = block_count;
++ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1);
++ b->block_size = uv_size;
++ b->inter_block_stride = stride1 * stride2;
++ }
++ if (do_luma)
++ {
++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
++ b->invalidate_mode = mode;
++ b->block_count = block_count;
++ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0);
++ b->block_size = y_size;
++ b->inter_block_stride = stride1 * stride2;
++ }
++ }
++}
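+
++// Example (illustrative only): given an open rfe, invalidate a 64x64 luma
++// region at (x0, y0) of a 4:2:0 frame (uv_shift == 1) before the CPU reads
++// GPU output:
++//
++//   rpi_cache_flush_add_frame_block(rfe, frame, RPI_CACHE_FLUSH_MODE_INVALIDATE,
++//                                   x0, y0, 64, 64, 1, 1, 1);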
+
-+static int wtoidx(const unsigned int w)
++// One-shot flush of a complete gm block with the given mode: init, add, finish
++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode)
+{
-+ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
-+ return pel_weight[w];
++ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init();
++ rpi_cache_flush_add_gm_ptr(rfe, p, mode);
++ rpi_cache_flush_finish(rfe);
+}
+
-+static const int fctom(uint32_t x)
++
++// ----------------------------------------------------------------------------
++
++
++// Wait abstractions - mostly so we can easily add profile code
++static void vq_wait_pool_init(vq_wait_pool_t * const wp)
+{
-+ int rv;
-+ // As it happens we can take the 2nd filter term & divide it by 8
-+ // (dropping fractions) to get the fractional move
-+ rv = 8 - ((x >> 11) & 0xf);
-+ av_assert2(rv >= 0 && rv <= 7);
-+ return rv;
++ unsigned int i;
++ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
++ sem_init(&wp->pool[i].sem, 0, 0);
++ wp->pool[i].next = wp->pool + i + 1;
++ }
++ wp->head = wp->pool + 0;
++ wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL;
+}
+
-+static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr)
++static void vq_wait_pool_deinit(vq_wait_pool_t * const wp)
+{
-+ return (x << shl) >> shr;
++ unsigned int i;
++ wp->head = NULL;
++ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
++ sem_destroy(&wp->pool[i].sem);
++ wp->pool[i].next = NULL;
++ }
+}
+
-+static inline int woff_p(HEVCContext *const s, int32_t x)
++
++// Wait objs are pooled (rather than created on demand) so we don't pay for
++// sem_init on every use
++static vq_wait_t * vq_wait_new(void)
+{
-+ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8);
++ gpu_env_t * const ge = gpu_lock_ref();
++ vq_wait_t * const wait = ge->wait_pool.head;
++ ge->wait_pool.head = wait->next;
++ wait->next = NULL;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ tto_start(&ge->ttw.active, ns_time());
++#endif
++
++ gpu_unlock();
++ return wait;
+}
+
-+static inline int woff_b(HEVCContext *const s, int32_t x)
++static void vq_wait_delete(vq_wait_t * const wait)
+{
-+ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8);
++ gpu_env_t * const ge = gpu_lock();
++ wait->next = ge->wait_pool.head;
++ ge->wait_pool.head = wait;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ {
++ trace_time_wait_t * const ttw = &ge->ttw;
++ const int64_t now = ns_time();
++ ++ttw->jcount;
++ tto_end(&ttw->wait, now);
++
++ if (ttw->start0 == 0)
++ {
++ ttw->start0 = ttw->active.start[0];
++ ttw->last_update = ttw->start0;
++ }
++ if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD)
++ {
++ ttw->last_update += WAIT_TIME_PRINT_PERIOD;
++ ttw_print(ttw, now);
++ }
++ }
++#endif
++ gpu_unlock_unref(ge);
+}
+
-+static inline int wweight(int32_t x)
++static void vq_wait_wait(vq_wait_t * const wait)
+{
-+ return ext(x, 16, 16);
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ {
++ const int64_t now = ns_time();
++ gpu_env_t * const ge = gpu_lock();
++ tto_start(&ge->ttw.wait, now);
++ gpu_unlock();
++ }
++#endif
++
++ while (sem_wait(&wait->sem) == -1 && errno == EINTR)
++ /* loop */;
+}
+
++static void vq_wait_post(vq_wait_t * const wait)
++{
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ {
++ gpu_env_t *const ge = gpu_lock();
++ tto_end(&ge->ttw.active, ns_time());
++ gpu_unlock();
++ }
++#endif
+
-+#define PW 1
-+#include "rpi_shader_template_fn.h"
++ sem_post(&wait->sem);
++}
+
-+#undef PW
-+#define PW 2
-+#include "rpi_shader_template_fn.h"
+
-+#endif
+
-diff --git a/libavcodec/rpi_shader_template.h b/libavcodec/rpi_shader_template.h
-new file mode 100644
-index 0000000000..ecf5b8185a
---- /dev/null
-+++ b/libavcodec/rpi_shader_template.h
-@@ -0,0 +1,24 @@
-+#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H
-+#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H
++// Header comments were wrong for these two
++#define VPU_QPU_MASK_QPU 1
++#define VPU_QPU_MASK_VPU 2
+
-+#ifdef RPI
-+struct HEVCContext;
-+struct HEVCRpiInterPredEnv;
++#define VPU_QPU_JOB_MAX 4
++struct vpu_qpu_job_env_s
++{
++ unsigned int n;
++ unsigned int mask;
++ struct gpu_job_s j[VPU_QPU_JOB_MAX];
++};
+
-+void rpi_shader_c8(struct HEVCContext *const s,
-+ const struct HEVCRpiInterPredEnv *const ipe_y,
-+ const struct HEVCRpiInterPredEnv *const ipe_c);
++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
+
-+void rpi_shader_c16(struct HEVCContext *const s,
-+ const struct HEVCRpiInterPredEnv *const ipe_y,
-+ const struct HEVCRpiInterPredEnv *const ipe_c);
++vpu_qpu_job_env_t * vpu_qpu_job_new(void)
++{
++ vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t));
++ return vqj;
++}
+
-+void rpi_sand_dump8(const char * const name,
-+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
++void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj)
++{
++ memset(vqj, 0, sizeof(*vqj));
++ free(vqj);
++}
+
-+void rpi_sand_dump16(const char * const name,
-+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
++static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj)
++{
++ struct gpu_job_s * const j = vqj->j + vqj->n++;
++ av_assert0(vqj->n <= VPU_QPU_JOB_MAX);
++ return j;
++}
+
-+#endif
-+#endif
++void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code,
++ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5)
++{
++ if (vpu_code != 0) {
++ struct gpu_job_s *const j = new_job(vqj);
++ vqj->mask |= VPU_QPU_MASK_VPU;
+
-diff --git a/libavcodec/rpi_shader_template_fn.h b/libavcodec/rpi_shader_template_fn.h
-new file mode 100644
-index 0000000000..b5ac2ceed6
---- /dev/null
-+++ b/libavcodec/rpi_shader_template_fn.h
-@@ -0,0 +1,477 @@
-+#define STRCAT(x,y) x##y
++ j->command = EXECUTE_VPU;
++ // The bottom two bits of the execute address contain no-flush flags
++ // b0 will flush the VPU I-cache if unset so we nearly always want that set
++ // as we never reload code
++ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed;
++ j->u.v.q[1] = r0;
++ j->u.v.q[2] = r1;
++ j->u.v.q[3] = r2;
++ j->u.v.q[4] = r3;
++ j->u.v.q[5] = r4;
++ j->u.v.q[6] = r5;
++ gpu->vpu_i_cache_flushed = 1;
++ }
++}
+
-+#if PW == 1
-+#define pixel uint8_t
-+#define FUNC(f) STRCAT(f, 8)
-+#elif PW == 2
-+#define pixel uint16_t
-+#define FUNC(f) STRCAT(f, 16)
++// flags are QPU_FLAGS_xxx
++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail)
++{
++ if (n != 0) {
++ struct gpu_job_s *const j = new_job(vqj);
++ vqj->mask |= VPU_QPU_MASK_QPU;
++
++ j->command = EXECUTE_QPU;
++ j->u.q.jobs = n;
++#if RPI_TRACE_QPU_PROFILE_ALL
++ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS;
+#else
-+#error Unexpected PW
++ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU;
+#endif
-+
-+#define PATCH_STRIDE (16 * PW)
-+
-+static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
-+{
-+ for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) {
-+ const pixel s = *(const pixel *)src;
-+ pixel * d = (pixel *)dst;
-+ for (unsigned int j = 0; j < w; j += PW) {
-+ *d++ = s;
-+ }
-+ }
++ j->u.q.timeout = 5000;
++ memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
++ }
+}
+
-+static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
++// Convert callback to sem post
++static void vpu_qpu_job_callback_wait(void * v)
+{
-+ for (unsigned int i = 0; i != h; ++i, dst += stride) {
-+ memcpy(dst, src, w);
-+ }
++ vq_wait_post(v);
+}
+
-+static void FUNC(get_patch_y)(const shader_track_t * const st,
-+ uint8_t * dst, const unsigned int dst_stride,
-+ const qpu_mc_src_t *src,
-+ unsigned int _w, unsigned int _h)
++void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h)
+{
-+ int x = src->x * PW;
-+ int y = src->y;
-+ int w = _w * PW;
-+ int h = _h;
-+ int dl = 0;
-+ int dr = 0;
-+ int dt = 0;
-+ int db = 0;
-+
-+ if (x < 0) {
-+ if (-x >= w)
-+ x = PW - w;
-+ dl = -x;
-+ w += x;
-+ x = 0;
-+ }
-+ if (x + w > st->width) {
-+ if (x >= st->width)
-+ x = st->width - PW;
-+ dr = (x + w) - st->width;
-+ w = st->width - x;
-+ }
++ vq_wait_t * wait;
+
-+ // Y
-+ if (y < 0) {
-+ if (-y >= h)
-+ y = 1 - h;
-+ dt = -y;
-+ h += y;
-+ y = 0;
-+ }
-+ if (y + h > st->height) {
-+ if (y >= st->height)
-+ y = st->height - 1;
-+ db = (y + h) - st->height;
-+ h = st->height - y;
-+ }
++ if (vqj->mask == 0) {
++ *wait_h = NULL;
++ return;
++ }
+
-+ dst += dl + dt * dst_stride;
-+ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
++ // We are going to want a sync object
++ wait = vq_wait_new();
+
-+ // Edge dup
-+ if (dl != 0)
-+ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride);
-+ if (dr != 0)
-+ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride);
-+ w += dl + dr;
-+ dst -= dl;
++ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
++ // If we only posted one thing or only QPU jobs
++ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
++ {
++ struct gpu_job_s * const j = vqj->j + (vqj->n - 1);
++ av_assert0(j->callback.func == 0);
+
-+ if (dt != 0)
-+ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride);
-+ if (db != 0)
-+ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride);
-+}
++ j->callback.func = vpu_qpu_job_callback_wait;
++ j->callback.cookie = wait;
++ }
++ else
++ {
++ struct gpu_job_s *const j = new_job(vqj);
+
++ j->command = EXECUTE_SYNC;
++ j->u.s.mask = vqj->mask;
++ j->callback.func = vpu_qpu_job_callback_wait;
++ j->callback.cookie = wait;
++ }
+
++ vqj->mask = 0;
++ *wait_h = wait;
++}
+
-+static void FUNC(get_patch_c)(const shader_track_t * const st,
-+ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride,
-+ const qpu_mc_src_t *src,
-+ unsigned int _w, unsigned int _h)
++int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj)
+{
-+ int x = src->x * PW;
-+ int y = src->y;
-+ int w = _w * PW;
-+ int h = _h;
-+ int dl = 0;
-+ int dr = 0;
-+ int dt = 0;
-+ int db = 0;
-+ const int width = st->width;
-+ const int height = st->height;
++ return vqj->n == 0 ? 0 : vc_gpuserv_execute_code(vqj->n, vqj->j);
++}
+
-+ if (x < 0) {
-+ if (-x >= w)
-+ x = PW - w;
-+ dl = -x;
-+ w += x;
-+ x = 0;
-+ }
-+ if (x + w > width) {
-+ if (x >= width)
-+ x = width - PW;
-+ dr = (x + w) - width;
-+ w = width - x;
-+ }
++// Simple wrapper of start + delete
++int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj)
++{
++ int rv;
++ rv = vpu_qpu_job_start(vqj);
++ vpu_qpu_job_delete(vqj);
++ return rv;
++}
+
-+ // Y
-+ if (y < 0) {
-+ if (-y >= h)
-+ y = 1 - h;
-+ dt = -y;
-+ h += y;
-+ y = 0;
-+ }
-+ if (y + h > height) {
-+ if (y >= height)
-+ y = height - 1;
-+ db = (y + h) - height;
-+ h = height - y;
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
++{
++ if (wait_h != NULL)
++ {
++ vq_wait_t * const wait = *wait_h;
++ if (wait != NULL) {
++ *wait_h = NULL;
++ vq_wait_wait(wait);
++ vq_wait_delete(wait);
+ }
++ }
++}
+
-+ dst_u += dl + dt * dst_stride;
-+ dst_v += dl + dt * dst_stride;
-+ FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
++int vpu_qpu_init(void)
++{
++ gpu_env_t * const ge = gpu_lock_ref();
++ if (ge == NULL)
++ return -1;
+
-+ // Edge dup
-+ if (dl != 0)
-+ {
-+ FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride);
-+ FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride);
-+ }
-+ if (dr != 0)
-+ {
-+ FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride);
-+ FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride);
-+ }
-+ w += dl + dr;
-+ dst_u -= dl;
-+ dst_v -= dl;
++ if (ge->init_count++ == 0)
++ {
++ vc_gpuserv_init();
++ }
+
-+ if (dt != 0)
-+ {
-+ FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride);
-+ FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride);
-+ }
-+ if (db != 0)
-+ {
-+ FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride);
-+ FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride);
-+ }
++ gpu_unlock();
++ return 0;
+}
+
-+// w, y, w, h in pixels
-+// stride1, stride2 in bytes
-+void FUNC(rpi_sand_dump)(const char * const name,
-+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c)
++void vpu_qpu_term(void)
+{
-+ const int mask = stride2 == 0 ? ~0 : stride1 - 1;
-+
-+ printf("%s (%d,%d) %dx%d\n", name, x, y, w, h);
++ gpu_env_t * const ge = gpu_lock();
+
-+ if (is_c) {
-+ x *= 2;
-+ w *= 2;
-+ }
++ if (--ge->init_count == 0) {
++ vc_gpuserv_deinit();
+
-+ for (int i = y; i != y + h; ++i) {
-+ for (int j = x; j != x + w; ++j) {
-+ const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2;
-+ char sep = is_c && (j & 1) == 0 ? ':' : ' ';
-+#if PW == 1
-+ if (j < 0 || i < 0)
-+ printf("..%c", sep);
-+ else
-+ printf("%02x%c", *(const pixel*)p, sep);
-+#else
-+ if (j < 0 || i < 0)
-+ printf("...%c", sep);
-+ else
-+ printf("%03x%c", *(const pixel*)p, sep);
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ ttw_print(&ge->ttw, ns_time());
+#endif
-+ }
-+ printf("\n");
-+ }
++ }
++
++ gpu_unlock_unref(ge);
++}
++
++uint32_t qpu_fn(const int * const mc_fn)
++{
++ return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)ff_hevc_rpi_shader) + offsetof(struct GPU, qpu_code);
+}
+
+
-+void FUNC(rpi_shader_c)(HEVCContext *const s,
-+ const HEVCRpiInterPredEnv *const ipe_y,
-+ const HEVCRpiInterPredEnv *const ipe_c)
++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth)
+{
-+ for (int c_idx = 0; c_idx < 2; ++c_idx)
-+ {
-+ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c;
-+ shader_track_t tracka[QPU_N_MAX] = {{NULL}};
-+ unsigned int exit_n = 0;
++ // Dummy values we can catch with emulation
++ qf->y_pxx = ~1U;
++ qf->y_bxx = ~2U;
++ qf->y_p00 = ~3U;
++ qf->y_b00 = ~4U;
++ qf->c_pxx = ~5U;
++ qf->c_bxx = ~6U;
+
-+ if (ipe == NULL || !ipe->used) {
-+ continue;
-+ }
++ switch (bit_depth) {
++ case 8:
++ qf->y_pxx = qpu_fn(mc_filter_y_pxx);
++ qf->y_bxx = qpu_fn(mc_filter_y_bxx);
++ qf->y_p00 = qpu_fn(mc_filter_y_p00);
++ qf->y_b00 = qpu_fn(mc_filter_y_b00);
++ qf->c_pxx = qpu_fn(mc_filter_c_p);
++ qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1);
++ qf->c_bxx = qpu_fn(mc_filter_c_b);
++ break;
++ case 10:
++ qf->c_pxx = qpu_fn(mc_filter_c10_p);
++ qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1);
++ qf->c_bxx = qpu_fn(mc_filter_c10_b);
++ qf->y_pxx = qpu_fn(mc_filter_y10_pxx);
++ qf->y_bxx = qpu_fn(mc_filter_y10_bxx);
++ qf->y_p00 = qpu_fn(mc_filter_y10_p00);
++ qf->y_b00 = qpu_fn(mc_filter_y10_b00);
++ break;
++ default:
++ return -1;
++ }
++ return 0;
++}
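+
++// Example (illustrative only): fetch the shader entry points once the GPU
++// env is up; the returned values are VC addresses of the QPU code blobs.
++//
++//   HEVCRpiQpu qf;
++//   if (rpi_hevc_qpu_init_fn(&qf, 8) != 0)
++//     goto fail; // unsupported bit depth
++//   // qf.y_pxx etc. are now usable as QPU code addresses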
+
-+ do {
-+ for (unsigned int i = 0; i != ipe->n; ++i) {
-+ const HEVCRpiInterPredQ * const q = ipe->q + i;
-+ shader_track_t * const st = tracka + i;
-+ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr;
+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+new file mode 100644
+index 0000000000..9389047f8e
+--- /dev/null
++++ b/libavcodec/rpi_qpu.h
+@@ -0,0 +1,208 @@
++#ifndef RPI_QPU_H
++#define RPI_QPU_H
+
-+ for (;;) {
-+ const uint32_t link = (cmd == q->qpu_mc_base) ? q->code_setup : ((uint32_t *)cmd)[-1];
++#define RPI_ONE_BUF 1
+
-+ if (link == q->code_setup) {
-+ if (c_idx == 0) {
-+ // Luma
-+ const qpu_mc_pred_y_s_t *const c = &cmd->y.s;
++typedef struct gpu_mem_ptr_s {
++ unsigned char *arm; // Pointer to memory mapped on ARM side
++ int vc_handle; // Videocore handle of relocatable memory
++ int vcsm_handle; // Handle for use by VCSM
++ int vc; // Address for use in GPU code
++ int numbytes; // Size of memory block
++} GPU_MEM_PTR_T;
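+
++// All the fields describe the same physical block: "arm" is the CPU view,
++// "vc" the GPU view. Illustrative invariant (not checked anywhere):
++//
++//   offset = ptr - p->arm;      // byte offset of ptr within the block
++//   vc_addr = p->vc + offset;   // the same byte as seen by the GPU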
+
-+ st->height = c->pic_h;
-+ st->width = c->pic_w * PW;
-+ st->stride1 = c->stride1;
-+ st->stride2 = c->stride2;
-+ st->wdenom = c->wdenom;
-+ st->last_l0 = &c->next_src1;
-+ st->last_l1 = &c->next_src2;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else {
-+ // Chroma
-+ const qpu_mc_pred_c_s_t *const c = &cmd->c.s;
++// General GPU functions
++extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
++extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
++extern void gpu_free(GPU_MEM_PTR_T * const p);
+
-+ st->height = c->pic_ch;
-+ st->width = c->pic_cw * PW;
-+ st->stride1 = c->stride1;
-+ st->stride2 = c->stride2;
-+ st->wdenom = c->wdenom;
-+ st->last_l0 = &c->next_src1;
-+ st->last_l1 = &c->next_src2;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ }
-+ else if (link == s->qpu.y_pxx) {
-+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
-+ const int w1 = FFMIN(c->w, 8);
-+ const int w2 = c->w - w1;
++#include "libavutil/frame.h"
++#if !RPI_ONE_BUF
++static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
++ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[0]);
++ return p->vc;
++}
+
-+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
++ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
++ return p->vc;
++}
+
-+ FUNC(get_patch_y)(st,
-+ patch_y1, PATCH_STRIDE,
-+ st->last_l0,
-+ 16, c->h + 7);
-+ if (w2 > 0) {
-+ FUNC(get_patch_y)(st,
-+ patch_y2, PATCH_STRIDE,
-+ st->last_l1,
-+ 16, c->h + 7);
-+ }
++static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
++ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[2]);
++ return p->vc;
++}
+
-+ // wo[offset] = offset*2+1
-+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
-+ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
-+ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1);
-+ if (w2 > 0) {
-+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
-+ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
-+ c->h, st->wdenom, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2);
-+ }
-+ st->last_l0 = &c->next_src1;
-+ st->last_l1 = &c->next_src2;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else if (link == s->qpu.y_bxx) {
-+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
++ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[0]);
++}
+
-+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
++ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[1]);
++}
+
-+ FUNC(get_patch_y)(st,
-+ patch_y1, PATCH_STRIDE,
-+ st->last_l0,
-+ 16, c->h + 7);
-+ FUNC(get_patch_y)(st,
-+ patch_y2, PATCH_STRIDE,
-+ st->last_l1,
-+ 16, c->h + 7);
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
++ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[2]);
++}
+
-+ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
-+ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
-+ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w);
++#else
+
-+ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
-+ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3,
-+ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2),
-+ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w);
-+ st->last_l0 = &c->next_src1;
-+ st->last_l1 = &c->next_src2;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else if (link == s->qpu.y_p00) {
-+ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00;
++static inline int gpu_is_buf1(const AVFrame * const frame)
++{
++ return frame->buf[1] == NULL;
++}
+
-+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame)
++{
++ return av_buffer_get_opaque(frame->buf[0]);
++}
+
-+ FUNC(get_patch_y)(st,
-+ patch_y1, PATCH_STRIDE,
-+ st->last_l0,
-+ 16, c->h + 7);
++static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n)
++{
++ return av_buffer_pool_opaque(frame->buf[n]);
++}
+
-+ // wo[offset] = offset*2+1
-+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0](
-+ (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE,
-+ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w);
++static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n)
++{
++ const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n);
++ return gm->vc + (frame->data[n] - gm->arm);
++}
+
-+ st->last_l0 = &c->next_src1;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else if (link == s->qpu.y_b00) {
-+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
+
-+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
++static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
++ return get_vc_address3(frame, 0);
++}
+
-+ av_assert0(c->w <= 16 && c->h <= 64);
++static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
++ return get_vc_address3(frame, 1);
++}
+
-+ FUNC(get_patch_y)(st,
-+ patch_y1, PATCH_STRIDE,
-+ st->last_l0,
-+ 16, c->h);
-+ FUNC(get_patch_y)(st,
-+ patch_y2, PATCH_STRIDE,
-+ st->last_l1,
-+ 16, c->h);
++static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
++ return get_vc_address3(frame, 2);
++}
++
++#if 0
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
++ if (gpu_is_buf1(frame))
++ {
++ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
++ g.numbytes = frame->data[1] - frame->data[0];
++ return g;
++ }
++ else
++ return *gpu_buf3_gmem(frame, 0);
++}
++
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
++ if (gpu_is_buf1(frame))
++ {
++ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
++ g.arm += frame->data[1] - frame->data[0];
++ g.vc += frame->data[1] - frame->data[0];
++ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
++ return g;
++ }
++ else
++ return *gpu_buf3_gmem(frame, 1);
++}
+
-+ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0](
-+ patch_y3, patch_y1, PATCH_STRIDE,
-+ c->h, 0, 0, c->w);
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
++ if (gpu_is_buf1(frame))
++ {
++ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
++ g.arm += frame->data[2] - frame->data[0];
++ g.vc += frame->data[2] - frame->data[0];
++ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
++ return g;
++ }
++ else
++ return *gpu_buf3_gmem(frame, 2);
++}
++#endif
++#endif
+
-+ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0](
-+ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3,
-+ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2),
-+ 0, woff_b(s, c->wo2), 0, 0, c->w);
-+ st->last_l0 = &c->next_src1;
-+ st->last_l1 = &c->next_src2;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else if (link == s->qpu.c_pxx) {
-+ const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
-+ const int mx = fctom(c->coeffs_x);
-+ const int my = fctom(c->coeffs_y);
++// Cache flush stuff
+
-+ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ uint8_t patch_u3[8 * 16 * PW];
-+ uint8_t patch_v3[8 * 16 * PW];
++struct rpi_cache_flush_env_s;
++typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t;
+
-+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
++rpi_cache_flush_env_t * rpi_cache_flush_init(void);
++// Free env without flushing
++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe);
++// Do the accumulated flush & clear but do not free the env
++int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe);
++// Do the accumulated flush & free the env
++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe);
+
-+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
-+ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
-+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
-+ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
++typedef enum
++{
++ RPI_CACHE_FLUSH_MODE_INVALIDATE = 1,
++ RPI_CACHE_FLUSH_MODE_WRITEBACK = 2,
++ RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3
++} rpi_cache_flush_mode_t;
+
-+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode);
++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode,
++ const unsigned int offset, const unsigned int size);
++void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride);
++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode);
++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode,
++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
++ const unsigned int uv_shift, const int do_luma, const int do_chroma);
+
-+ st->last_l0 = &c->next_src;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else if (link == s->qpu.c_pxx_l1) {
-+ const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
-+ const int mx = fctom(c->coeffs_x);
-+ const int my = fctom(c->coeffs_y);
++// init, add, finish for one gm ptr
++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode);
+
-+ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ uint8_t patch_u3[8 * 16 * PW];
-+ uint8_t patch_v3[8 * 16 * PW];
+
-+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
++// QPU specific functions
+
-+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
-+ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
-+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
-+ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
++typedef struct HEVCRpiQpu {
++ uint32_t c_pxx;
++ uint32_t c_pxx_l1;
++ uint32_t c_bxx;
++ uint32_t y_pxx;
++ uint32_t y_bxx;
++ uint32_t y_p00;
++ uint32_t y_b00;
++} HEVCRpiQpu;
+
-+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth);
+
-+ st->last_l1 = &c->next_src;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else if (link == s->qpu.c_bxx) {
-+ const qpu_mc_pred_c_b_t *const c = &cmd->c.b;
-+ const int mx1 = fctom(c->coeffs_x1);
-+ const int my1 = fctom(c->coeffs_y1);
-+ const int mx2 = fctom(c->coeffs_x2);
-+ const int my2 = fctom(c->coeffs_y2);
++uint32_t qpu_fn(const int * const mc_fn);
+
-+ uint8_t patch_u1[PATCH_STRIDE * 72];
-+ uint8_t patch_v1[PATCH_STRIDE * 72];
-+ uint8_t patch_u2[PATCH_STRIDE * 72];
-+ uint8_t patch_v2[PATCH_STRIDE * 72];
-+ uint8_t patch_u3[8 * 16 * PW];
-+ uint8_t patch_v3[8 * 16 * PW];
-+ uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE];
-+ uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE];
++#define QPU_N_GRP 4
++#define QPU_N_MAX 12
+
-+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
-+ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
++#define QPU_MAIL_EL_VALS 2
+
-+ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
-+ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+ c->h, mx1, my1, c->w);
-+ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
-+ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+ c->h, mx1, my1, c->w);
++struct vq_wait_s;
++typedef struct vq_wait_s * vpu_qpu_wait_h;
+
-+ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
-+ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4,
-+ c->h, st->wdenom, c->weight_u1, wweight(c->wo_u2),
-+ 0, woff_b(s, c->wo_u2), mx2, my2, c->w);
-+ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
-+ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4,
-+ c->h, st->wdenom, c->weight_v1, wweight(c->wo_v2),
-+ 0, woff_b(s, c->wo_v2), mx2, my2, c->w);
++// VPU specific functions
+
-+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++struct vpu_qpu_job_env_s;
++typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h;
+
-+ st->last_l0 = &c->next_src1;
-+ st->last_l1 = &c->next_src2;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else if (link == q->code_sync) {
-+ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1);
-+ break;
-+ }
-+ else if (link == q->code_exit) {
-+ // We expect exit to occur without other sync
-+ av_assert0(i == exit_n);
-+ ++exit_n;
-+ break;
-+ }
-+ else {
-+ av_assert0(0);
-+ }
-+ }
++vpu_qpu_job_h vpu_qpu_job_new(void);
++void vpu_qpu_job_delete(const vpu_qpu_job_h vqj);
++void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code,
++ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);
++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail);
++void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h);
++int vpu_qpu_job_start(const vpu_qpu_job_h vqj);
++int vpu_qpu_job_finish(const vpu_qpu_job_h vqj);
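+
++// Example (illustrative only): a typical submit/wait round trip, where
++// "mail" is assumed to hold QPU_MAIL_EL_VALS control words per QPU job
++// (uniforms address then code address, both VC bus addresses).
++//
++//   vpu_qpu_wait_h sync;
++//   vpu_qpu_job_h vqj = vpu_qpu_job_new();
++//   vpu_qpu_job_add_qpu(vqj, n_jobs, mail);
++//   vpu_qpu_job_add_sync_this(vqj, &sync);
++//   if (vpu_qpu_job_finish(vqj) == 0)
++//     vpu_qpu_wait(&sync); // blocks until the job callback posts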
+
-+ st->qpu_mc_curr = cmd;
-+ }
-+ } while (exit_n == 0);
-+ }
-+}
++extern unsigned int vpu_get_fn(const unsigned int bit_depth);
++extern unsigned int vpu_get_constants(void);
+
-+#undef FUNC
-+#undef pixel
++// Waits for the previously posted job to complete and will NULL out *wait_h after use
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h);
++int vpu_qpu_init(void);
++void vpu_qpu_term(void);
++
++extern int gpu_get_mailbox(void);
++void gpu_ref(void);
++void gpu_unref(void);
+
++#endif
diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c
new file mode 100644
-index 0000000000..97d58abc0a
+index 0000000000..185288da5a
--- /dev/null
+++ b/libavcodec/rpi_zc.c
-@@ -0,0 +1,745 @@
-+#include "config.h"
-+#ifdef RPI
+@@ -0,0 +1,741 @@
+#include "libavcodec/avcodec.h"
+#include "rpi_qpu.h"
+#include "rpi_mailbox.h"
@@ -32156,8 +40432,6 @@ index 0000000000..97d58abc0a
+ }
+}
+
-+#endif // RPI
-+
diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h
new file mode 100644
index 0000000000..26fb3be999
@@ -32285,6 +40559,51 @@ index 13668c2105..bebf9024ec 100644
return 0;
}
+diff --git a/libavcodec/utils.c b/libavcodec/utils.c
+index 9551f312e7..a1f68b8e30 100644
+--- a/libavcodec/utils.c
++++ b/libavcodec/utils.c
+@@ -1277,6 +1277,40 @@ AVCodec *avcodec_find_decoder(enum AVCodecID id)
+ return find_encdec(id, 0);
+ }
+
++static int codec_supports_format(const AVCodec * const p, const enum AVPixelFormat fmt)
++{
++ const enum AVPixelFormat *pf = p->pix_fmts;
++
++ // Assume good if we lack info
++ if (pf == NULL)
++ return 1;
++ if (fmt == AV_PIX_FMT_NONE)
++ return 0;
++
++ for (; *pf != AV_PIX_FMT_NONE; ++pf) {
++ if (*pf == fmt)
++ return 1;
++ }
++ return 0;
++}
++
++AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt)
++{
++ AVCodec *p, *experimental = NULL;
++ p = first_avcodec;
++ id= remap_deprecated_codec_id(id);
++ while (p) {
++ if (av_codec_is_decoder(p) && p->id == id && codec_supports_format(p, fmt)) {
++ if (p->capabilities & AV_CODEC_CAP_EXPERIMENTAL && !experimental) {
++ experimental = p;
++ } else
++ return p;
++ }
++ p = p->next;
++ }
++ return experimental;
++}
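++
++// Example (illustrative only): prefer a decoder that can emit a given pixel
++// format natively, falling back to the generic lookup otherwise.
++//
++//   AVCodec *c = avcodec_find_decoder_by_id_and_fmt(AV_CODEC_ID_HEVC,
++//                                                   AV_PIX_FMT_SAND128);
++//   if (c == NULL)
++//     c = avcodec_find_decoder(AV_CODEC_ID_HEVC);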
++
+ AVCodec *avcodec_find_decoder_by_name(const char *name)
+ {
+ AVCodec *p;
diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c
index f0f849b326..cd97974748 100644
--- a/libavfilter/avfilter.c
@@ -32310,21 +40629,8 @@ index ad5aedd5f7..0d2df8b870 100644
frame->format);
break;
case AVMEDIA_TYPE_AUDIO:
-diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
-index 53cbcfb543..f93f06fcfb 100644
---- a/libavformat/mpegts.c
-+++ b/libavformat/mpegts.c
-@@ -701,7 +701,7 @@ static const StreamType ISO_types[] = {
- #endif
- { 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 },
- { 0x1c, AVMEDIA_TYPE_AUDIO, AV_CODEC_ID_AAC },
-- { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 },
-+ { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264_MVC },
- { 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000 },
- { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC },
- { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS },
diff --git a/libavformat/utils.c b/libavformat/utils.c
-index 1a7996c4fd..154942fe74 100644
+index 1a7996c4fd..271e70ed84 100644
--- a/libavformat/utils.c
+++ b/libavformat/utils.c
@@ -750,7 +750,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in
@@ -32336,27 +40642,111 @@ index 1a7996c4fd..154942fe74 100644
continue;
s->streams[i]->pts_wrap_reference = pts_wrap_reference;
s->streams[i]->pts_wrap_behavior = pts_wrap_behavior;
+@@ -2940,6 +2940,40 @@ static int has_codec_parameters(AVStream *st, const char **errmsg_ptr)
+ return 1;
+ }
+
++#if CONFIG_HEVC_RPI_DECODER && CONFIG_HEVC_DECODER
++// This should be quite general purpose but avoid possible conflicts
++// by limiting usage to cases where we know it works.
++static int try_fallback_decoder(AVCodecContext * const avctx, const AVCodec *const old_codec, AVDictionary ** const opts)
++{
++ // Only try fallback if we know it is supported (HEVC only)
++ const AVCodec *const new_codec = old_codec->id != AV_CODEC_ID_HEVC ? NULL :
++ avcodec_find_decoder_by_id_and_fmt(old_codec->id, AV_PIX_FMT_NONE);
++ int err;
++
++ // Failed to find fallback or we are already at the fallback
++ if (new_codec == NULL || new_codec == old_codec)
++ {
++ return AVERROR_DECODER_NOT_FOUND;
++ }
++
++ // * This may be dodgy - header says to not use this fn,
++ // especially if we are going to reopen the context...
++ // (but it does seem to work for our cases)
++ if (avcodec_is_open(avctx)) {
++ avcodec_close(avctx);
++ }
++
++ if ((err = avcodec_open2(avctx, new_codec, opts)) < 0)
++ {
++ return err;
++ }
++
++ return 0;
++}
++#else
++#define try_fallback_decoder(avctx, old_codec, opts) (AVERROR_DECODER_NOT_FOUND)
++#endif
++
+ /* returns 1 or 0 if or if not decoded data was returned, or a negative error */
+ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt,
+ AVDictionary **options)
+@@ -2974,7 +3008,11 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt,
+ av_dict_set(options ? options : &thread_opt, "threads", "1", 0);
+ if (s->codec_whitelist)
+ av_dict_set(options ? options : &thread_opt, "codec_whitelist", s->codec_whitelist, 0);
+- ret = avcodec_open2(avctx, codec, options ? options : &thread_opt);
++ if ((ret = avcodec_open2(avctx, codec, options ? options : &thread_opt)) == AVERROR_DECODER_NOT_FOUND)
++ {
++ // Try fallback if it looks worth a try
++ ret = try_fallback_decoder(avctx, codec, options ? options : &thread_opt);
++ }
+ if (!options)
+ av_dict_free(&thread_opt);
+ if (ret < 0) {
+@@ -3005,6 +3043,14 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt,
+ if (avctx->codec_type == AVMEDIA_TYPE_VIDEO ||
+ avctx->codec_type == AVMEDIA_TYPE_AUDIO) {
+ ret = avcodec_send_packet(avctx, &pkt);
++
++ // If we are going to want to fall back we should know here
++ if (ret == AVERROR_DECODER_NOT_FOUND) {
++ if ((ret = try_fallback_decoder(avctx, avctx->codec, options)) < 0)
++ break;
++ continue;
++ }
++
+ if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF)
+ break;
+ if (ret >= 0)
+@@ -3601,9 +3647,20 @@ FF_ENABLE_DEPRECATION_WARNINGS
+ // Try to just open decoders, in case this is enough to get parameters.
+ if (!has_codec_parameters(st, NULL) && st->request_probe <= 0) {
+ if (codec && !avctx->codec)
+- if (avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt) < 0)
+- av_log(ic, AV_LOG_WARNING,
+- "Failed to open codec in %s\n",__FUNCTION__);
++ {
++ int err;
++
++ if ((err = avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt)) < 0)
++ {
++ if (err == AVERROR_DECODER_NOT_FOUND) {
++ err = try_fallback_decoder(avctx, codec, options ? &options[i] : &thread_opt);
++ }
++ if (err < 0) {
++ av_log(ic, AV_LOG_WARNING,
++ "Failed to open codec in %s\n",__FUNCTION__);
++ }
++ }
++ }
+ }
+ if (!options)
+ av_dict_free(&thread_opt);
diff --git a/libavutil/Makefile b/libavutil/Makefile
-index 65e285a701..afb3effa2e 100644
+index 65e285a701..2ca778c59f 100644
--- a/libavutil/Makefile
+++ b/libavutil/Makefile
-@@ -62,6 +62,8 @@ HEADERS = adler32.h \
- rational.h \
- replaygain.h \
- ripemd.h \
-+ rpi_sand_fns.h \
-+ rpi_sand_fn_pw.h \
- samplefmt.h \
- sha.h \
- sha512.h \
-@@ -140,6 +142,7 @@ OBJS = adler32.o \
- reverse.o \
- rc4.o \
- ripemd.o \
-+ rpi_sand_fns.o \
- samplefmt.o \
- sha.o \
- sha512.o \
+@@ -165,6 +165,7 @@ OBJS-$(CONFIG_QSV) += hwcontext_qsv.o
+ OBJS-$(CONFIG_LIBDRM) += hwcontext_drm.o
+ OBJS-$(CONFIG_LZO) += lzo.o
+ OBJS-$(CONFIG_OPENCL) += opencl.o opencl_internal.o
++OBJS-$(CONFIG_RPI) += rpi_sand_fns.o
+ OBJS-$(CONFIG_VAAPI) += hwcontext_vaapi.o
+ OBJS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.o
+ OBJS-$(CONFIG_VDPAU) += hwcontext_vdpau.o
diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile
index 5da44b0542..b74b7c4e2f 100644
--- a/libavutil/arm/Makefile
@@ -32441,22 +40831,33 @@ index 73b6bd0b14..d907de3f1c 100644
* @}
*/
diff --git a/libavutil/frame.c b/libavutil/frame.c
-index d5fd2932e3..1851e3655f 100644
+index d5fd2932e3..151a33a24d 100644
--- a/libavutil/frame.c
+++ b/libavutil/frame.c
-@@ -25,6 +25,7 @@
+@@ -16,6 +16,8 @@
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
++#include "config.h"
++
+ #include "channel_layout.h"
+ #include "avassert.h"
+ #include "buffer.h"
+@@ -25,6 +27,9 @@
#include "imgutils.h"
#include "mem.h"
#include "samplefmt.h"
++#if CONFIG_RPI
+#include "rpi_sand_fns.h"
++#endif
static AVFrameSideData *frame_new_side_data(AVFrame *frame,
-@@ -833,6 +834,12 @@ int av_frame_apply_cropping(AVFrame *frame, int flags)
+@@ -833,6 +838,12 @@ int av_frame_apply_cropping(AVFrame *frame, int flags)
(frame->crop_top + frame->crop_bottom) >= frame->height)
return AVERROR(ERANGE);
-+#ifdef RPI
++#if CONFIG_RPI
+ // Sand cannot be cropped - do not try
+ if (av_rpi_is_sand_format(frame->format))
+ return 0;
@@ -32727,12 +41128,11 @@ index 0000000000..52d52a2a83
+
diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
new file mode 100644
-index 0000000000..ec4cfadf8a
+index 0000000000..b8bfad915e
--- /dev/null
+++ b/libavutil/rpi_sand_fns.c
-@@ -0,0 +1,99 @@
+@@ -0,0 +1,96 @@
+#include "config.h"
-+#ifdef RPI
+#include
+#include <stdint.h>
+#include <string.h>
@@ -32828,17 +41228,14 @@ index 0000000000..ec4cfadf8a
+ }
+}
+
-+#endif // RPI
-+
diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h
new file mode 100644
-index 0000000000..aa880d0f63
+index 0000000000..ebaa2b6d08
--- /dev/null
+++ b/libavutil/rpi_sand_fns.h
-@@ -0,0 +1,129 @@
+@@ -0,0 +1,131 @@
+#ifndef AVUTIL_RPI_SAND_FNS
+#define AVUTIL_RPI_SAND_FNS
-+#ifdef RPI
+
+#include "libavutil/frame.h"
+
@@ -32891,9 +41288,13 @@ index 0000000000..aa880d0f63
+
+static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame)
+{
-+ // * We could repl;ace thios with a fixed 128 whic would allow the compiler
-+ // to optimize a whole lot better
++#ifdef RPI_ZC_SAND128_ONLY
++ // If we are sure we only support 128 byte sand formats, replace the
++ // var with a constant, which should allow for better optimisation
++ return 128;
++#else
+ return frame->linesize[0];
++#endif
+}
+
+static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame)
@@ -32963,7 +41364,6 @@ index 0000000000..aa880d0f63
+}
+
+#endif
-+#endif
+
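For readers new to the sand layout: the header above only exposes strides and copy helpers, but the addressing rule they imply is simple. The sketch below assumes the conventional sand geometry for the 8-bit (SAND128) variant — vertical stripes stride1 bytes wide and stride2 rows tall, stored stripe after stripe — and is an illustration, not a function from this patch:

    // Sketch of sand ("column") addressing, assuming the conventional
    // layout: rows within a stripe are contiguous with pitch stride1, and
    // whole stripes of stride1*stride2 bytes follow one another in memory.
    #include <stdint.h>
    #include "libavutil/frame.h"
    #include "libavutil/rpi_sand_fns.h"

    static inline uint8_t *sand_pos8(const AVFrame *frame,
                                     unsigned int x, unsigned int y)
    {
        const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); // stripe width
        const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); // stripe height
        return frame->data[0]
            + (x / stride1) * stride1 * stride2  // skip whole stripes
            + y * stride1                        // row within the stripe
            + (x % stride1);                     // byte within the row
    }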
diff --git a/libswscale/input.c b/libswscale/input.c
index bb2f4933ec..de5a17bc7f 100644
@@ -32995,14 +41395,14 @@ index bb2f4933ec..de5a17bc7f 100644
if (c->chrSrcHSubSample) {
switch (srcFormat) {
diff --git a/libswscale/utils.c b/libswscale/utils.c
-index dcab707de6..403558db3c 100644
+index dcab707de6..5b24de889a 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -256,6 +256,10 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = {
[AV_PIX_FMT_P010BE] = { 1, 1 },
[AV_PIX_FMT_P016LE] = { 1, 0 },
[AV_PIX_FMT_P016BE] = { 1, 0 },
-+#ifdef RPI
++#if CONFIG_RPI
+ [AV_PIX_FMT_SAND128] = { 1, 0 },
+ [AV_PIX_FMT_SAND64_10] = { 1, 0 },
+#endif
@@ -33544,17 +41944,16 @@ index 0000000000..fc14f2a3c2
+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
diff --git a/pi-util/conf_pi1.sh b/pi-util/conf_pi1.sh
new file mode 100755
-index 0000000000..ec25b81c31
+index 0000000000..59c0d3959e
--- /dev/null
+++ b/pi-util/conf_pi1.sh
-@@ -0,0 +1,31 @@
+@@ -0,0 +1,30 @@
+echo "Configure for Pi1"
+
+RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
+RPI_OPT_VC=`pwd`/../firmware/opt/vc
+
+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
-+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1"
+RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib"
+#RPI_KEEPS="-save-temps=obj"
+RPI_KEEPS=""
@@ -33566,8 +41965,8 @@ index 0000000000..ec25b81c31
+ --target-os=linux\
+ --disable-stripping\
+ --enable-mmal\
-+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\
-+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\
++ --extra-cflags="-g $RPI_KEEPS $RPI_INCLUDES"\
++ --extra-cxxflags="$RPI_INCLUDES"\
+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\
+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
@@ -33581,18 +41980,18 @@ index 0000000000..ec25b81c31
+# -Wa,-ahls
diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh
new file mode 100755
-index 0000000000..f8e5e75375
+index 0000000000..4de256bc8a
--- /dev/null
+++ b/pi-util/conf_pi2.sh
-@@ -0,0 +1,30 @@
+@@ -0,0 +1,32 @@
+echo "Configure for Pi2/3"
+
+RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
+RPI_OPT_VC=`pwd`/../firmware/opt/vc
+
+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
-+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1"
+RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib"
++RPI_DEFINES="-D__VCCOREVER__=0x4000000"
+#RPI_KEEPS="-save-temps=obj"
+RPI_KEEPS=""
+
@@ -33603,12 +42002,14 @@ index 0000000000..f8e5e75375
+ --disable-stripping\
+ --disable-thumb\
+ --enable-mmal\
-+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\
-+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\
++ --enable-rpi\
++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\
++ --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\
+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\
+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
+
++# --enable-decoder=hevc_rpi\
+# --enable-extra-warnings\
+# --arch=armv71\
+# --enable-shared\
@@ -33617,10 +42018,10 @@ index 0000000000..f8e5e75375
+# -Wa,-ahls
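The switch from the old -DRPI=1 define to a configure option is what lets the rest of this patch use #if CONFIG_RPI guards and the OBJS-$(CONFIG_RPI) Makefile hook above. Assuming FFmpeg's usual configure behaviour, --enable-rpi should emit a 0/1 macro into config.h:

    /* Expected effect of --enable-rpi (FFmpeg configure convention:
     * CONFIG_* macros are always defined, as 0 or 1), so the guard
     * compiles cleanly either way, unlike the old #ifdef RPI: */
    #include "config.h"
    #if CONFIG_RPI
    /* RPi-specific code, compiled only when configured in */
    #endif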
diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py
new file mode 100755
-index 0000000000..70f7be22bb
+index 0000000000..e9556f0837
--- /dev/null
+++ b/pi-util/ffconf.py
-@@ -0,0 +1,174 @@
+@@ -0,0 +1,175 @@
+#!/usr/bin/env python
+
+import string
@@ -33634,7 +42035,7 @@ index 0000000000..70f7be22bb
+
+ffmpeg_exec = "./ffmpeg"
+
-+def testone(fileroot, srcname, es_file, md5_file):
++def testone(fileroot, srcname, es_file, md5_file, vcodec):
+ tmp_root = "/tmp"
+
+ names = srcname.split('/')
@@ -33656,7 +42057,7 @@ index 0000000000..70f7be22bb
+
+ # Unaligned needed for cropping conformance
+ rstr = subprocess.call(
-+ [ffmpeg_exec, "-flags", "unaligned", "-vcodec", "hevc", "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file],
++ [ffmpeg_exec, "-flags", "unaligned", "-vcodec", vcodec, "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file],
+ stdout=flog, stderr=subprocess.STDOUT)
+
+ try:
@@ -33720,7 +42121,7 @@ index 0000000000..70f7be22bb
+ return True
+ return False
+
-+def doconf(csva, tests, test_root):
++def doconf(csva, tests, test_root, vcodec):
+ unx_failures = []
+ unx_success = []
+ failures = 0
@@ -33732,7 +42133,7 @@ index 0000000000..70f7be22bb
+ print "==== ", name,
+ sys.stdout.flush()
+
-+ rv = testone(os.path.join(test_root, name), name, a[2], a[3])
++ rv = testone(os.path.join(test_root, name), name, a[2], a[3], vcodec=vcodec)
+ if (rv == 0):
+ successes += 1
+ else:
@@ -33783,6 +42184,7 @@ index 0000000000..70f7be22bb
+ argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test")
+ argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir")
+ argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename")
++ argp.add_argument("--vcodec", default="hevc_rpi", help="vcodec name to use")
+ args = argp.parse_args()
+
+ if args.csvgen:
@@ -33793,7 +42195,7 @@ index 0000000000..70f7be22bb
+ csva = [a for a in csv.reader(csvfile, ConfCSVDialect())]
+
+
-+ doconf(csva, args.tests, args.test_root)
++ doconf(csva, args.tests, args.test_root, args.vcodec)
+
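Going by the defaults visible above, a typical conformance run after building would look like:

    pi-util/ffconf.py --test_root /opt/conform/h265.2016 --vcodec hevc_rpi

which decodes every stream listed in pi-util/conf_h265.2016.csv to an MD5 with the chosen decoder, compares it against the reference checksum, and reports unexpected failures and unexpected successes separately.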
diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py
new file mode 100755
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch
new file mode 100644
index 00000000000..1d1fd1690ea
--- /dev/null
+++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch
@@ -0,0 +1,283 @@
+From 8f170986cda0695f28eb2cd4e863aaae0e14d19f Mon Sep 17 00:00:00 2001
+From: Hendrik Leppkes
+Date: Sat, 9 Jan 2016 16:34:09 +0100
+Subject: [PATCH 1/4] avcodec: add h264_mvc codec id and profiles
+
+---
+ libavcodec/avcodec.h | 3 +++
+ libavcodec/codec_desc.c | 7 +++++++
+ libavcodec/profiles.c | 1 +
+ libavformat/mpegts.c | 2 +-
+ 4 files changed, 12 insertions(+), 1 deletion(-)
+
+diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
+index 6c4b011b5c..8f1f5a3e53 100644
+--- a/libavcodec/avcodec.h
++++ b/libavcodec/avcodec.h
+@@ -449,6 +449,8 @@ enum AVCodecID {
+ AV_CODEC_ID_GDV,
+ AV_CODEC_ID_FITS,
+
++ AV_CODEC_ID_H264_MVC,
++
+ /* various PCM "codecs" */
+ AV_CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs
+ AV_CODEC_ID_PCM_S16LE = 0x10000,
+@@ -3318,6 +3320,7 @@ typedef struct AVCodecContext {
+ #define FF_PROFILE_H264_HIGH_444_PREDICTIVE 244
+ #define FF_PROFILE_H264_HIGH_444_INTRA (244|FF_PROFILE_H264_INTRA)
+ #define FF_PROFILE_H264_CAVLC_444 44
++#define FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH 138
+
+ #define FF_PROFILE_VC1_SIMPLE 0
+ #define FF_PROFILE_VC1_MAIN 1
+diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
+index 478b7c0ffc..ff10f3b2bc 100644
+--- a/libavcodec/codec_desc.c
++++ b/libavcodec/codec_desc.c
+@@ -1700,6 +1700,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
+ .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"),
+ .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+ },
++ {
++ .id = AV_CODEC_ID_H264_MVC,
++ .type = AVMEDIA_TYPE_VIDEO,
++ .name = "h264_mvc",
++ .long_name = NULL_IF_CONFIG_SMALL("H264 MVC"),
++ .props = AV_CODEC_PROP_LOSSY,
++ },
+
+ /* various PCM "codecs" */
+ {
+diff --git a/libavcodec/profiles.c b/libavcodec/profiles.c
+index 30498efedf..9d3cf4b535 100644
+--- a/libavcodec/profiles.c
++++ b/libavcodec/profiles.c
+@@ -72,6 +72,7 @@ const AVProfile ff_h264_profiles[] = {
+ { FF_PROFILE_H264_CAVLC_444, "CAVLC 4:4:4" },
+ { FF_PROFILE_H264_MULTIVIEW_HIGH, "Multiview High" },
+ { FF_PROFILE_H264_STEREO_HIGH, "Stereo High" },
++ { FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH, "Multiview High Depth" },
+ { FF_PROFILE_UNKNOWN },
+ };
+
+diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
+index 53cbcfb543..f93f06fcfb 100644
+--- a/libavformat/mpegts.c
++++ b/libavformat/mpegts.c
+@@ -701,7 +701,7 @@ static const StreamType ISO_types[] = {
+ #endif
+ { 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 },
+ { 0x1c, AVMEDIA_TYPE_AUDIO, AV_CODEC_ID_AAC },
+- { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 },
++ { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264_MVC },
+ { 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000 },
+ { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC },
+ { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS },
+--
+2.14.1
+
+
+From 00de72f97e8f69f5d4c614bff956ec726f97fa2e Mon Sep 17 00:00:00 2001
+From: Hendrik Leppkes
+Date: Sat, 9 Jan 2016 16:34:40 +0100
+Subject: [PATCH 2/4] h264_parser: add support for parsing h264 mvc NALUs
+
+---
+ libavcodec/allcodecs.c | 1 +
+ libavcodec/h264.h | 2 ++
+ libavcodec/h264_parser.c | 34 ++++++++++++++++++++++++++++++----
+ 3 files changed, 33 insertions(+), 4 deletions(-)
+
+diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
+index 5361a22141..a5289a5e14 100644
+--- a/libavcodec/allcodecs.c
++++ b/libavcodec/allcodecs.c
+@@ -732,6 +732,7 @@ static void register_all(void)
+ REGISTER_PARSER(H261, h261);
+ REGISTER_PARSER(H263, h263);
+ REGISTER_PARSER(H264, h264);
++ REGISTER_PARSER(H264_MVC, h264_mvc);
+ REGISTER_PARSER(HEVC, hevc);
+ REGISTER_PARSER(MJPEG, mjpeg);
+ REGISTER_PARSER(MLP, mlp);
+diff --git a/libavcodec/h264.h b/libavcodec/h264.h
+index 86df5eb9b3..22c4f1d82a 100644
+--- a/libavcodec/h264.h
++++ b/libavcodec/h264.h
+@@ -41,7 +41,9 @@ enum {
+ H264_NAL_END_STREAM = 11,
+ H264_NAL_FILLER_DATA = 12,
+ H264_NAL_SPS_EXT = 13,
++ H264_NAL_SPS_SUBSET = 15,
+ H264_NAL_AUXILIARY_SLICE = 19,
++ H264_NAL_SLICE_EXT = 20,
+ };
+
+ #endif /* AVCODEC_H264_H */
+diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
+index 053325c26b..855c74896e 100644
+--- a/libavcodec/h264_parser.c
++++ b/libavcodec/h264_parser.c
+@@ -62,6 +62,7 @@ typedef struct H264ParseContext {
+ int parse_last_mb;
+ int64_t reference_dts;
+ int last_frame_num, last_picture_structure;
++ int is_mvc;
+ } H264ParseContext;
+
+
+@@ -109,14 +110,18 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
+ } else if (state <= 5) {
+ int nalu_type = buf[i] & 0x1F;
+ if (nalu_type == H264_NAL_SEI || nalu_type == H264_NAL_SPS ||
+- nalu_type == H264_NAL_PPS || nalu_type == H264_NAL_AUD) {
++ nalu_type == H264_NAL_PPS || nalu_type == H264_NAL_AUD ||
++ nalu_type == H264_NAL_SPS_SUBSET) {
+ if (pc->frame_start_found) {
+ i++;
+ goto found;
+ }
+ } else if (nalu_type == H264_NAL_SLICE || nalu_type == H264_NAL_DPA ||
+- nalu_type == H264_NAL_IDR_SLICE) {
++ nalu_type == H264_NAL_IDR_SLICE || (p->is_mvc && nalu_type == H264_NAL_SLICE_EXT)) {
+ state += 8;
++
++ if (nalu_type == H264_NAL_SLICE_EXT)
++ i += 3; // skip mvc extension
+ continue;
+ }
+ state = 7;
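The i += 3 above skips the MVC extension that follows the 1-byte NAL header of a slice extension (type 20). For reference, per H.264 Annex H those three bytes pack the fields below; this is a spec-based sketch, not code from the patch:

    // Layout of nal_unit_header_mvc_extension (H.264 Annex H): the three
    // bytes skipped above. b points just past the 1-byte NAL header.
    #include <stdint.h>

    typedef struct MvcNalExt {
        uint8_t  non_idr_flag;     //  1 bit
        uint8_t  priority_id;      //  6 bits
        uint16_t view_id;          // 10 bits
        uint8_t  temporal_id;      //  3 bits
        uint8_t  anchor_pic_flag;  //  1 bit
        uint8_t  inter_view_flag;  //  1 bit
    } MvcNalExt;

    static MvcNalExt parse_mvc_ext(const uint8_t b[3])
    {
        MvcNalExt e;
        // b[0] = svc_extension_flag(1) non_idr_flag(1) priority_id(6)
        e.non_idr_flag    = (b[0] >> 6) & 1;
        e.priority_id     =  b[0] & 0x3F;
        // b[1],b[2] = view_id(10) temporal_id(3) anchor(1) inter_view(1) reserved(1)
        e.view_id         = (uint16_t)((b[1] << 2) | (b[2] >> 6));
        e.temporal_id     = (b[2] >> 3) & 7;
        e.anchor_pic_flag = (b[2] >> 2) & 1;
        e.inter_view_flag = (b[2] >> 1) & 1;
        return e;
    }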
+@@ -594,7 +599,8 @@ static int h264_parse(AVCodecParserContext *s,
+ }
+ }
+
+- parse_nal_units(s, avctx, buf, buf_size);
++ if (!p->is_mvc)
++ parse_nal_units(s, avctx, buf, buf_size);
+
+ if (avctx->framerate.num)
+ avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
+@@ -651,7 +657,7 @@ static int h264_split(AVCodecContext *avctx,
+ if ((state & 0xFFFFFF00) != 0x100)
+ break;
+ nalu_type = state & 0x1F;
+- if (nalu_type == H264_NAL_SPS) {
++ if (nalu_type == H264_NAL_SPS || nalu_type == H264_NAL_SPS_SUBSET) {
+ has_sps = 1;
+ } else if (nalu_type == H264_NAL_PPS)
+ has_pps = 1;
+@@ -703,3 +709,23 @@ AVCodecParser ff_h264_parser = {
+ .parser_close = h264_close,
+ .split = h264_split,
+ };
++
++static av_cold int init_mvc(AVCodecParserContext *s)
++{
++ H264ParseContext *p = s->priv_data;
++ int ret = init(s);
++ if (ret < 0)
++ return ret;
++
++ p->is_mvc = 1;
++ return 0;
++}
++
++AVCodecParser ff_h264_mvc_parser = {
++ .codec_ids = { AV_CODEC_ID_H264_MVC },
++ .priv_data_size = sizeof(H264ParseContext),
++ .parser_init = init_mvc,
++ .parser_parse = h264_parse,
++ .parser_close = h264_close,
++ .split = h264_split,
++};
+--
+2.14.1
+
+
+From bbf5daa149ccc2c462be1bd5f6f710eba0e82094 Mon Sep 17 00:00:00 2001
+From: Hendrik Leppkes
+Date: Tue, 28 Nov 2017 16:12:12 +0000
+Subject: [PATCH 3/4] h264_parser: force grabbing a new timestamp until a frame
+ start was found
+
+---
+ libavcodec/h264_parser.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
+index 855c74896e..90a99a19a8 100644
+--- a/libavcodec/h264_parser.c
++++ b/libavcodec/h264_parser.c
+@@ -587,6 +587,9 @@ static int h264_parse(AVCodecParserContext *s,
+ } else {
+ next = h264_find_frame_end(p, buf, buf_size, avctx);
+
++ if (next == END_NOT_FOUND && pc->frame_start_found == 0)
++ s->fetch_timestamp = 1;
++
+ if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
+ *poutbuf = NULL;
+ *poutbuf_size = 0;
+--
+2.14.1
+
+
+From 3a0ebb0f7473a9a5ab93e01f7261862a3d324e50 Mon Sep 17 00:00:00 2001
+From: popcornmix
+Date: Tue, 28 Nov 2017 18:32:08 +0000
+Subject: [PATCH 4/4] extract_extradata_bsf: Support H264_MVC
+
+---
+ libavcodec/extract_extradata_bsf.c | 8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+diff --git a/libavcodec/extract_extradata_bsf.c b/libavcodec/extract_extradata_bsf.c
+index ed6509c681..188e62a42d 100644
+--- a/libavcodec/extract_extradata_bsf.c
++++ b/libavcodec/extract_extradata_bsf.c
+@@ -56,7 +56,7 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt,
+ HEVC_NAL_VPS, HEVC_NAL_SPS, HEVC_NAL_PPS,
+ };
+ static const int extradata_nal_types_h264[] = {
+- H264_NAL_SPS, H264_NAL_PPS,
++ H264_NAL_SPS, H264_NAL_SPS_SUBSET, H264_NAL_PPS,
+ };
+
+ ExtractExtradataContext *s = ctx->priv_data;
+@@ -88,14 +88,14 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt,
+ if (nal->type == HEVC_NAL_SPS) has_sps = 1;
+ if (nal->type == HEVC_NAL_VPS) has_vps = 1;
+ } else {
+- if (nal->type == H264_NAL_SPS) has_sps = 1;
++ if (nal->type == H264_NAL_SPS || nal->type == H264_NAL_SPS_SUBSET) has_sps = 1;
+ }
+ }
+ }
+
+ if (extradata_size &&
+ ((ctx->par_in->codec_id == AV_CODEC_ID_HEVC && has_sps && has_vps) ||
+- (ctx->par_in->codec_id == AV_CODEC_ID_H264 && has_sps))) {
++ ((ctx->par_in->codec_id == AV_CODEC_ID_H264 || ctx->par_in->codec_id == AV_CODEC_ID_H264_MVC) && has_sps))) {
+ AVBufferRef *filtered_buf;
+ uint8_t *extradata, *filtered_data;
+
+@@ -247,6 +247,7 @@ static const struct {
+ } extract_tab[] = {
+ { AV_CODEC_ID_CAVS, extract_extradata_mpeg4 },
+ { AV_CODEC_ID_H264, extract_extradata_h2645 },
++ { AV_CODEC_ID_H264_MVC, extract_extradata_h2645 },
+ { AV_CODEC_ID_HEVC, extract_extradata_h2645 },
+ { AV_CODEC_ID_MPEG1VIDEO, extract_extradata_mpeg12 },
+ { AV_CODEC_ID_MPEG2VIDEO, extract_extradata_mpeg12 },
+@@ -306,6 +307,7 @@ fail:
+ static const enum AVCodecID codec_ids[] = {
+ AV_CODEC_ID_CAVS,
+ AV_CODEC_ID_H264,
++ AV_CODEC_ID_H264_MVC,
+ AV_CODEC_ID_HEVC,
+ AV_CODEC_ID_MPEG1VIDEO,
+ AV_CODEC_ID_MPEG2VIDEO,
+--
+2.14.1
+
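With the extract_tab and codec_ids entries above in place, h264_mvc packets can be run through the same bitstream filter as H.264/HEVC. A hedged usage sketch of the public av_bsf API of this era follows; the function name get_extradata and the error handling are illustrative, not from the patch:

    // Sketch: run one packet through extract_extradata; after this patch
    // the BSF accepts AV_CODEC_ID_H264_MVC and counts SPS_SUBSET as an SPS.
    #include "libavcodec/avcodec.h"

    static int get_extradata(const AVCodecParameters *par, AVPacket *pkt)
    {
        const AVBitStreamFilter *f = av_bsf_get_by_name("extract_extradata");
        AVBSFContext *bsf = NULL;
        int ret;

        if (!f)
            return AVERROR_BSF_NOT_FOUND;
        if ((ret = av_bsf_alloc(f, &bsf)) < 0)
            return ret;
        if ((ret = avcodec_parameters_copy(bsf->par_in, par)) < 0 ||
            (ret = av_bsf_init(bsf)) < 0 ||
            (ret = av_bsf_send_packet(bsf, pkt)) < 0 ||
            (ret = av_bsf_receive_packet(bsf, pkt)) < 0)
            goto done;
        // Extracted parameter sets arrive as AV_PKT_DATA_NEW_EXTRADATA
        // side data on the returned packet.
    done:
        av_bsf_free(&bsf);
        return ret;
    }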