diff --git a/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk b/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk
index 058b5a74c31..9449a1f9e28 100644
--- a/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk
@@ -2,8 +2,8 @@
 # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="inputstream.adaptive"
-PKG_VERSION="2.2.27"
-PKG_SHA256="15d1e2f05d3ddeb31a9509e9fc6c8a305a6055ba68329c717606bb895ed5aacf"
+PKG_VERSION="7f0d294f7d7bbc37b7f1fe2cd2e47dd5d4c2fcfa"
+PKG_SHA256="6c64725dabb29c37e022fa78469bb5a152a24cac38c85676003ecf1c7067f4bb"
 PKG_LICENSE="GPL"
 PKG_SITE="http://www.kodi.tv"
 PKG_URL="https://github.com/peak3d/inputstream.adaptive/archive/$PKG_VERSION.tar.gz"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk
index 30bb3fd06b8..25375eb97a2 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk
@@ -3,8 +3,8 @@
 # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="pvr.dvbviewer"
-PKG_VERSION="b63e867740a61cf5a9c530a636069fa8ec1e20c7"
-PKG_SHA256="51b51ef6ecb7ed0bfb774e6d17ede6621e84aea755038ed938ffc730e80d1d60"
+PKG_VERSION="f09e3eba97a0d4d588f1d1837e361129ce5b64e9"
+PKG_SHA256="246aad9a16ca160f1255c8994e55b10402c83bf4815aa1c97d06c74147beb295"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk
index 8b77d569b75..ada45f99425 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk
@@ -3,8 +3,8 @@
 # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="pvr.hts"
-PKG_VERSION="7c7c6cefc785ccd5ee7015eb997e0c688258a4e0"
-PKG_SHA256="c471894f9efe5b69bc10ab2bcff4d808e760cddb07f18e1a320c7f361d2b5b43"
+PKG_VERSION="8b66ec3d80527f8803ba3b8db4abb34a18cfde54"
+PKG_SHA256="54f0171cf3c03ad58f6e277a17d4935402f709e23fb6a659cdbd2376b4594943"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk
index c05d617b94b..670f030fe77 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk
@@ -3,8 +3,8 @@
 # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="pvr.mediaportal.tvserver"
-PKG_VERSION="cc3eaf05e1459bc9981b00c6bed32adddca53630"
-PKG_SHA256="28946df252cd1f29d34fec2fc7448d8b12a9e3249efea27ddc2284d301113b94"
+PKG_VERSION="b17d5ad3ce77ad844ba3e33b50a887606bd24dd3"
+PKG_SHA256="3f71ce08e0d9bf1bbda4c3012c774b05feeb913f0f093b8952c0475ff7564042"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk
index 104c91583ac..a04c7d24413 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk
@@ -3,8 +3,8 @@
 # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="pvr.mythtv"
-PKG_VERSION="e61f47ce6c00a3efa98f2b83e5e27b0fa1d40584"
-PKG_SHA256="e067e15534688eaa9a4bdd1033e7ef63dbcaa7b67f809c9b3937d5b6b7285afd"
+PKG_VERSION="22aa23b80c9f9ae8dd8e39b1d29487eb519e6101"
+PKG_SHA256="3f5107619d631319eb2b10590a0d02493e397d2052502676c2f928e5a7695515"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.sledovanitv.cz/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.sledovanitv.cz/package.mk
index 595ecb877db..459d58cbc9a 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.sledovanitv.cz/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.sledovanitv.cz/package.mk
@@ -2,8 +2,8 @@
 # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="pvr.sledovanitv.cz"
-PKG_VERSION="293c7fce7e811d305caf43e55019534c093df533"
-PKG_SHA256="f405aed37bcddc85010388e47bfccca61efa7a5b32fcedee15e0c118e44f650c"
+PKG_VERSION="fcc62d88f50a6b49a18bbb51323338855f3be873"
+PKG_SHA256="d6f313892c865d043ce4a823b22739b2f2e07741c1cffcdb31d48a8817b3c956"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPLv2"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk
index 8e53a00979a..c7d0af3e345 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk
@@ -3,8 +3,8 @@
 # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="pvr.vuplus"
-PKG_VERSION="016a4ff6d8c607a0119e825605f8f83a073fc662"
-PKG_SHA256="c9a42c2f208cb4c0b833c93397e47463b1314099203dce4e64f3967ff79907dd"
+PKG_VERSION="6b36662707a096753e834d66d6c2a9c32dbdc240"
+PKG_SHA256="5eec48c068a39b1c7ef9b46c12578ffa096dffa3818eff6e614c9ac77bf6312f"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk
index f4e4e4a2157..45e30868906 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk
@@ -2,8 +2,8 @@
 # Copyright (C) 2017-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="pvr.zattoo"
-PKG_VERSION="12134c8659fffd564505ebe1eb01ecc49f5e3cdc"
-PKG_SHA256="859e25c0f233be46eed7889bfabde6191e930a33cc78470c1aa32b264c6f6955"
+PKG_VERSION="8de69a10cd3c68e21a8ebb0c6b46111a5f9c8d66"
+PKG_SHA256="0aa8fb78d84c127f103c374e922bd361199b8869ba5333b569fa685cccb6a774"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/visualization.pictureit/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.pictureit/package.mk
index 66b5c1690b2..73c44dbe9bd 100644
--- a/packages/mediacenter/kodi-binary-addons/visualization.pictureit/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/visualization.pictureit/package.mk
@@ -3,8 +3,8 @@
 # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="visualization.pictureit"
-PKG_VERSION="11063b29c238a6e81d7e779f18933140221ac439"
-PKG_SHA256="94f0576a59a3bd08cfc6be94cd5b2ec8f57e7dd86bf9f5c41bc0c82a3f47f78d"
+PKG_VERSION="f08d0aa6d5f80cfa95a24adca48be15e333ca8e0"
+PKG_SHA256="f29112d232907b46a738e6971f03c75ec6a61b70be70de203db82b2252ae4f8d"
 PKG_REV="2"
 PKG_ARCH="x86_64"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi/config/appliance.xml b/packages/mediacenter/kodi/config/appliance.xml
index 8889ff1410b..7831707eebd 100644
--- a/packages/mediacenter/kodi/config/appliance.xml
+++ b/packages/mediacenter/kodi/config/appliance.xml
@@ -4,9 +4,6 @@
   <section id="games">
     <category id="gamesgeneral">
       <group id="1">
-        <setting id="gamesgeneral.enable">
-          <default>true</default>
-        </setting>
         <setting id="gamesgeneral.enablerewind">
           <default>false</default>
         </setting>
diff --git a/packages/mediacenter/kodi/package.mk b/packages/mediacenter/kodi/package.mk
index 0428ef9f18f..e1c6aa6efa3 100644
--- a/packages/mediacenter/kodi/package.mk
+++ b/packages/mediacenter/kodi/package.mk
@@ -15,14 +15,14 @@ PKG_PATCH_DIRS="$KODI_VENDOR"
 
 case $KODI_VENDOR in
   raspberrypi)
-    PKG_VERSION="newclock5_18.0b1v2-Leia"
-    PKG_SHA256="7434263c55aa528f3e3d8f455cffe3148e3707a1c1068f80bd08829094e16576"
+    PKG_VERSION="newclock5_18.0b2-Leia"
+    PKG_SHA256="28ba41ea6a942f4399b98e300596ea4a85ac043c8358c9eae9f2d0e0bee9aa99"
     PKG_URL="https://github.com/popcornmix/xbmc/archive/$PKG_VERSION.tar.gz"
     PKG_SOURCE_NAME="kodi-$KODI_VENDOR-$PKG_VERSION.tar.gz"
     ;;
   *)
-    PKG_VERSION="18.0b1v2-Leia"
-    PKG_SHA256="3808aa97723b710a0774261116e3387f091bc3d8150b9ba49ef36cb30b3d7ba2"
+    PKG_VERSION="18.0b2-Leia"
+    PKG_SHA256="25fc0aabfb523d4db19e08b1990d4851592ee2adec0424f5fb729bd3672eae69"
     PKG_URL="https://github.com/xbmc/xbmc/archive/$PKG_VERSION.tar.gz"
     PKG_SOURCE_NAME="kodi-$PKG_VERSION.tar.gz"
     ;;
diff --git a/packages/mediacenter/kodi/patches/kodi-995.01-pr14354_wrapper_toolchain_nm.patch b/packages/mediacenter/kodi/patches/kodi-995.01-pr14354_wrapper_toolchain_nm.patch
deleted file mode 100644
index 92b634da191..00000000000
--- a/packages/mediacenter/kodi/patches/kodi-995.01-pr14354_wrapper_toolchain_nm.patch
+++ /dev/null
@@ -1,17 +0,0 @@
-wrapper.def:
-  - make nm binary configurable (-DCMAKE_NM=..)
-  - fail if an empty file is generated
-
-diff --git a/xbmc/cores/DllLoader/exports/CMakeLists.txt b/xbmc/cores/DllLoader/exports/CMakeLists.txt
-index 580a779fdc..efcd872cad 100644
---- a/xbmc/cores/DllLoader/exports/CMakeLists.txt
-+++ b/xbmc/cores/DllLoader/exports/CMakeLists.txt
-@@ -16,7 +16,7 @@ elseif(NOT CORE_SYSTEM_NAME STREQUAL windows AND NOT CORE_SYSTEM_NAME STREQUAL w
-   add_options(C ALL_BUILDS "-fPIC")
-   add_library(wrapper OBJECT wrapper.c)
- 
--  add_custom_target(wrapper.def ALL nm ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/wrapper.dir/wrapper.c.o | grep __wrap | awk '{ printf(\"%s \", \$\$3) }' | sed \"s/___wrap_/__wrap_/g\" | sed \"s/__wrap_/-Wl,-wrap,/g\" > wrapper.def)
-+  add_custom_target(wrapper.def ALL ${CMAKE_NM} ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/wrapper.dir/wrapper.c.o | grep __wrap | awk '{ printf(\"%s \", \$\$3) }' | sed \"s/___wrap_/__wrap_/g\" | sed \"s/__wrap_/-Wl,-wrap,/g\" > wrapper.def && test -s wrapper.def)
- 
-   if(CORE_SYSTEM_NAME STREQUAL android)
-     add_custom_command(TARGET wrapper.def COMMAND echo \"-L${DEPENDS_PATH}/lib/dummy-lib${APP_NAME_LC} -l${APP_NAME_LC}\" >> wrapper.def)
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch
index 1b4d0da9066..1d65823f3da 100644
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch
+++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch
@@ -19,7 +19,7 @@ index 0e57cb0b4c..b2e3374fea 100644
  /ffplay
  /ffprobe
 diff --git a/configure b/configure
-index dee507cb6a..9a93189107 100755
+index 827abfe694..28f630068e 100755
 --- a/configure
 +++ b/configure
 @@ -318,6 +318,7 @@ External library support:
@@ -55,7 +55,7 @@ index dee507cb6a..9a93189107 100755
  huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp"
  huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp"
  iac_decoder_select="imc_decoder"
-@@ -3392,6 +3397,8 @@ tinterlace_filter_deps="gpl"
+@@ -3393,6 +3398,8 @@ tinterlace_filter_deps="gpl"
  tinterlace_merge_test_deps="tinterlace_filter"
  tinterlace_pad_test_deps="tinterlace_filter"
  tonemap_filter_deps="const_nan"
@@ -65,7 +65,7 @@ index dee507cb6a..9a93189107 100755
  uspp_filter_deps="gpl avcodec"
  vaguedenoiser_filter_deps="gpl"
 diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
-index 4dbe72186d..0e48ecb9da 100644
+index c0214c42d8..faaea5772a 100644
 --- a/fftools/ffmpeg.c
 +++ b/fftools/ffmpeg.c
 @@ -24,6 +24,12 @@
@@ -409,7 +409,7 @@ index 4dbe72186d..0e48ecb9da 100644
          break;
      }
  
-@@ -2887,6 +3166,12 @@ static int init_input_stream(int ist_index, char *error, int error_len)
+@@ -2891,6 +3170,12 @@ static int init_input_stream(int ist_index, char *error, int error_len)
          ist->dec_ctx->opaque                = ist;
          ist->dec_ctx->get_format            = get_format;
          ist->dec_ctx->get_buffer2           = get_buffer;
@@ -1681,10 +1681,10 @@ index 0000000000..0211e447a8
 +
 diff --git a/libavcodec/arm/rpi_hevc_misc_neon.S b/libavcodec/arm/rpi_hevc_misc_neon.S
 new file mode 100644
-index 0000000000..3bbfb443bf
+index 0000000000..200eac416e
 --- /dev/null
 +++ b/libavcodec/arm/rpi_hevc_misc_neon.S
-@@ -0,0 +1,226 @@
+@@ -0,0 +1,238 @@
 +#include "libavutil/arm/asm.S"
 +#include "neon.S"
 +
@@ -1728,14 +1728,14 @@ index 0000000000..3bbfb443bf
 +.endif
 +.endm
 +
-+.macro cpy_compound val, p1, p2
++.set expected_next, 0
++
++.macro cpy_compound val, p1, p2, drop_thru=0
 +.if \p1 + \p2 != \val
 +.error "Bad addition!  \p1 + \p2 != \val"
 +.endif
-+.if \val <= 64
-+@ As max we deal with 128 vals above 64 will never be recursed to
-+100\val\():
-+        push       {r11, lr}
++.if expected_next != 0 && expected_next != \val
++.error "Drop thru failure"
 +.endif
 +\val\():
 +        push       {r0-r3}
@@ -1743,7 +1743,12 @@ index 0000000000..3bbfb443bf
 +        pop        {r0-r3}
 +        add         r0, #\p1
 +        add         r2, #\p1
++.if \drop_thru == 0
 +        b           \p2\()b
++.set expected_next, 0
++.else
++.set expected_next, \p2
++.endif
 +.endm
 +
 +@ ff_hevc_cpy_blks8x4_neon(
@@ -1763,9 +1768,12 @@ index 0000000000..3bbfb443bf
 +function ff_hevc_rpi_cpy_blks8x4_neon, export=1
 +        ldr         r12, [sp, #0]
 +        push       {r11, lr}
-+        sub         r12, #1
-+A       adr         lr,  98f
-+        ubfx        r12, r12, #3, #4
++.if jent_pic
++A       adr         lr,  98f - 2
++.else
++A       adr         lr,  98f - 4
++.endif
++        lsr         r12, #3
 +        ldr         r11, [sp, #(8 + 4)]
 +.if jent_pic
 +A       lsl         r12, #1
@@ -1778,6 +1786,7 @@ index 0000000000..3bbfb443bf
 +.endif
 +
 +98:
++T       .short      0 @ unused
 +        jent        8f
 +        jent        16f
 +        jent        24f
@@ -1835,8 +1844,6 @@ index 0000000000..3bbfb443bf
 +        bgt         1b
 +        pop        {r11, pc}
 +
-+cpy_compound 24, 16, 8
-+
 +10032:
 +        push       {r11, lr}
 +32:
@@ -1857,10 +1864,6 @@ index 0000000000..3bbfb443bf
 +        bgt         1b
 +        pop        {r11, pc}
 +
-+cpy_compound 40, 32, 8
-+cpy_compound 48, 32, 16
-+cpy_compound 56, 32, 24
-+
 +10064:
 +        push       {r11, lr}
 +64:
@@ -1879,14 +1882,6 @@ index 0000000000..3bbfb443bf
 +        bgt         1b
 +        pop        {r11, pc}
 +
-+cpy_compound 72, 64, 8
-+cpy_compound 80, 64, 16
-+cpy_compound 88, 64, 24
-+cpy_compound 96, 64, 32
-+cpy_compound 104, 64, 40
-+cpy_compound 112, 64, 48
-+cpy_compound 120, 64, 56
-+
 +128:
 +        push       {r4, r5}
 +        @ We could do this with fewer registers if we jump around but I
@@ -1909,8 +1904,539 @@ index 0000000000..3bbfb443bf
 +        bgt         1b
 +        pop        {r4, r5, r11, pc}
 +
++@ Use drop_thru where we can
++cpy_compound 104, 64, 40, 1
++cpy_compound 40, 32, 8
++
++cpy_compound 112, 64, 48, 1
++cpy_compound 48, 32, 16
++
++cpy_compound 120, 64, 56, 1
++cpy_compound 56, 32, 24, 1
++cpy_compound 24, 16, 8
++
++cpy_compound 72, 64, 8
++cpy_compound 80, 64, 16
++cpy_compound 88, 64, 24
++cpy_compound 96, 64, 32
++
++
 +endfunc
 +
+diff --git a/libavcodec/arm/rpi_hevc_misc_neon.h b/libavcodec/arm/rpi_hevc_misc_neon.h
+new file mode 100644
+index 0000000000..9d21f6a882
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevc_misc_neon.h
+@@ -0,0 +1,438 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_ARM_RPI_HEVC_MISC_H
++#define AVCODEC_ARM_RPI_HEVC_MISC_H
++
++#include "config.h"
++#if HAVE_NEON_INLINE && !CONFIG_THUMB
++
++static av_noinline void ff_hevc_rpi_copy_vert_v2h_neon(uint8_t *dst, const uint8_t *src,
++                                                       int pixel_shift, int height,
++                                                       ptrdiff_t stride_src)
++{   // Gathers `height` elements spaced stride_src bytes apart at src into a contiguous run at dst.
++    const uint8_t *src2 = src + stride_src;  // second read pointer, one stride past src
++    stride_src <<= 1;  // src and src2 each step two strides per load, so together they visit every stride
++    switch (pixel_shift)  // selects element width: 2 -> 32-bit, 1 -> 16-bit, anything else -> 8-bit
++    {
++        case 2:  // 32-bit elements, 4 per batch. NOTE(review): loop only exits at height == 0, so height is assumed to be a positive multiple of 4
++            __asm__ volatile (
++                "vld1.32     {d0[0]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.32     {d0[1]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.32     {d1[0]}, [%[src]], %[stride_src]  \n\t"
++                "subs        %[height], #4                     \n\t"
++                "vld1.32     {d1[1]}, [%[src2]], %[stride_src] \n\t"
++                "beq         2f                                \n\t"  // only one batch: store it at 2:
++                "1:                                            \n\t"  // loop: load the next batch into q1, then store q0
++                "vld1.32     {d2[0]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.32     {d2[1]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.32     {d3[0]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.32     {d3[1]}, [%[src2]], %[stride_src] \n\t"
++                "subs        %[height], #4                     \n\t"
++                "vst1.32     {q0}, [%[dst]]!                   \n\t"
++                "beq         3f                                \n\t"  // q1 holds the last batch: store it at 3:
++                "vld1.32     {d0[0]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.32     {d0[1]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.32     {d1[0]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.32     {d1[1]}, [%[src2]], %[stride_src] \n\t"
++                "subs        %[height], #4                     \n\t"
++                "vst1.32     {q1}, [%[dst]]!                   \n\t"
++                "bne         1b                                \n\t"
++                "2:                                            \n\t"  // final store of q0 (dst not advanced)
++                "vst1.32     {q0}, [%[dst]]                    \n\t"
++                "b           4f                                \n\t"
++                "3:                                            \n\t"  // final store of q1 (dst not advanced)
++                "vst1.32     {q1}, [%[dst]]                    \n\t"
++                "4:                                            \n\t"
++                :  // Outputs
++                           [src]"+r"(src),
++                          [src2]"+r"(src2),
++                           [dst]"+r"(dst),
++                        [height]"+r"(height)
++                :  // Inputs
++                    [stride_src]"r"(stride_src)
++                :  // Clobbers
++                    "cc", "memory", "d0", "d1", "d2", "d3"  // the asm overwrites q0/q1 (= d0-d3)
++            );
++            break;
++        case 1:  // 16-bit elements, 4 per batch (same height assumption as the 32-bit case)
++            __asm__ volatile (
++                "vld1.16     {d0[0]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.16     {d1[0]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.16     {d0[1]}, [%[src]], %[stride_src]  \n\t"
++                "subs        %[height], #4                     \n\t"
++                "vld1.16     {d1[1]}, [%[src2]], %[stride_src] \n\t"
++                "beq         2f                                \n\t"
++                "1:                                            \n\t"
++                "vld1.16     {d2[0]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.16     {d3[0]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.16     {d2[1]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.16     {d3[1]}, [%[src2]], %[stride_src] \n\t"
++                "vzip.16     d0, d1                            \n\t"  // interleave src lanes (d0) with src2 lanes (d1) back into source order
++                "subs        %[height], #4                     \n\t"
++                "vst1.16     {d0}, [%[dst]]!                   \n\t"
++                "beq         3f                                \n\t"
++                "vld1.16     {d0[0]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.16     {d1[0]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.16     {d0[1]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.16     {d1[1]}, [%[src2]], %[stride_src] \n\t"
++                "vzip.16     d2, d3                            \n\t"
++                "subs        %[height], #4                     \n\t"
++                "vst1.16     {d2}, [%[dst]]!                   \n\t"
++                "bne         1b                                \n\t"
++                "2:                                            \n\t"
++                "vzip.16     d0, d1                            \n\t"
++                "vst1.16     {d0}, [%[dst]]                    \n\t"
++                "b           4f                                \n\t"
++                "3:                                            \n\t"
++                "vzip.16     d2, d3                            \n\t"
++                "vst1.16     {d2}, [%[dst]]                    \n\t"
++                "4:                                            \n\t"
++                :  // Outputs
++                           [src]"+r"(src),
++                          [src2]"+r"(src2),
++                           [dst]"+r"(dst),
++                        [height]"+r"(height)
++                :  // Inputs
++                    [stride_src]"r"(stride_src)
++                :  // Clobbers
++                    "cc", "memory", "d0", "d1", "d2", "d3"  // the asm overwrites d0-d3
++            );
++            break;
++        default:  // 8-bit elements, 8 per batch. NOTE(review): height is assumed to be a positive multiple of 8 here
++            __asm__ volatile (
++                "vld1.8      {d0[0]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.8      {d1[0]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.8      {d0[1]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.8      {d1[1]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.8      {d0[2]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.8      {d1[2]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.8      {d0[3]}, [%[src]], %[stride_src]  \n\t"
++                "subs        %[height], #8                     \n\t"
++                "vld1.8      {d1[3]}, [%[src2]], %[stride_src] \n\t"
++                "beq         2f                                \n\t"
++                "1:                                            \n\t"
++                "vld1.8      {d2[0]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.8      {d3[0]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.8      {d2[1]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.8      {d3[1]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.8      {d2[2]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.8      {d3[2]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.8      {d2[3]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.8      {d3[3]}, [%[src2]], %[stride_src] \n\t"
++                "vzip.8      d0, d1                            \n\t"  // interleave src lanes (d0) with src2 lanes (d1) back into source order
++                "subs        %[height], #8                     \n\t"
++                "vst1.8      {d0}, [%[dst]]!                   \n\t"
++                "beq         3f                                \n\t"
++                "vld1.8      {d0[0]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.8      {d1[0]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.8      {d0[1]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.8      {d1[1]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.8      {d0[2]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.8      {d1[2]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.8      {d0[3]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.8      {d1[3]}, [%[src2]], %[stride_src] \n\t"
++                "vzip.8      d2, d3                            \n\t"
++                "subs        %[height], #8                     \n\t"
++                "vst1.8      {d2}, [%[dst]]!                   \n\t"
++                "bne         1b                                \n\t"
++                "2:                                            \n\t"
++                "vzip.8      d0, d1                            \n\t"
++                "vst1.8      {d0}, [%[dst]]                    \n\t"
++                "b           4f                                \n\t"
++                "3:                                            \n\t"
++                "vzip.8      d2, d3                            \n\t"
++                "vst1.8      {d2}, [%[dst]]                    \n\t"
++                "4:                                            \n\t"
++                :  // Outputs
++                           [src]"+r"(src),
++                          [src2]"+r"(src2),
++                           [dst]"+r"(dst),
++                        [height]"+r"(height)
++                :  // Inputs
++                    [stride_src]"r"(stride_src)
++                :  // Clobbers
++                    "cc", "memory", "d0", "d1", "d2", "d3"  // the asm overwrites d0-d3
++            );
++            break;
++    }
++}
++
++static av_noinline void ff_hevc_rpi_copy_vert_h2v_neon(uint8_t *dst, const uint8_t *src,
++                                                       int pixel_shift, int height,
++                                                      ptrdiff_t stride_dst)
++{
++    uint8_t *dst2 = dst + stride_dst;
++    stride_dst <<= 1;
++    switch (pixel_shift)
++    {
++        case 2:
++            __asm__ volatile (
++                "subs        %[height], #4                     \n\t"
++                "vld1.32     {q0}, [%[src]]!                   \n\t"
++                "beq         2f                                \n\t"
++                "1:                                            \n\t"
++                "vld1.32     {q1}, [%[src]]!                   \n\t"
++                "vst1.32     {d0[0]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.32     {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.32     {d1[0]}, [%[dst]], %[stride_dst]  \n\t"
++                "subs        %[height], #4                     \n\t"
++                "vst1.32     {d1[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "beq         3f                                \n\t"
++                "vld1.32     {q0}, [%[src]]!                   \n\t"
++                "vst1.32     {d2[0]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.32     {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.32     {d3[0]}, [%[dst]], %[stride_dst]  \n\t"
++                "subs        %[height], #4                     \n\t"
++                "vst1.32     {d3[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "bne         1b                                \n\t"
++                "2:                                            \n\t"
++                "vst1.32     {d0[0]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.32     {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.32     {d1[0]}, [%[dst]]                 \n\t"
++                "vst1.32     {d1[1]}, [%[dst2]]                \n\t"
++                "b           4f                                \n\t"
++                "3:                                            \n\t"
++                "vst1.32     {d2[0]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.32     {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.32     {d3[0]}, [%[dst]]                 \n\t"
++                "vst1.32     {d3[1]}, [%[dst2]]                \n\t"
++                "4:                                            \n\t"
++                :  // Outputs
++                           [dst]"+r"(dst),
++                          [dst2]"+r"(dst2),
++                           [src]"+r"(src),
++                        [height]"+r"(height)
++                :  // Inputs
++                    [stride_dst]"r"(stride_dst)
++                :  // Clobbers
++                    "cc", "memory"
++            );
++            break;
++        case 1:
++            __asm__ volatile (
++                "subs        %[height], #4                     \n\t"
++                "vld1.16     {d0}, [%[src]]!                   \n\t"
++                "beq         2f                                \n\t"
++                "1:                                            \n\t"
++                "vld1.16     {d2}, [%[src]]!                   \n\t"
++                "vst1.16     {d0[0]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.16     {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.16     {d0[2]}, [%[dst]], %[stride_dst]  \n\t"
++                "subs        %[height], #4                     \n\t"
++                "vst1.16     {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
++                "beq         3f                                \n\t"
++                "vld1.16     {d0}, [%[src]]!                   \n\t"
++                "vst1.16     {d2[0]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.16     {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.16     {d2[2]}, [%[dst]], %[stride_dst]  \n\t"
++                "subs        %[height], #4                     \n\t"
++                "vst1.16     {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
++                "bne         1b                                \n\t"
++                "2:                                            \n\t"
++                "vst1.16     {d0[0]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.16     {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.16     {d0[2]}, [%[dst]]                 \n\t"
++                "vst1.16     {d0[3]}, [%[dst2]]                \n\t"
++                "b           4f                                \n\t"
++                "3:                                            \n\t"
++                "vst1.16     {d2[0]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.16     {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.16     {d2[2]}, [%[dst]]                 \n\t"
++                "vst1.16     {d2[3]}, [%[dst2]]                \n\t"
++                "4:                                            \n\t"
++                :  // Outputs
++                           [dst]"+r"(dst),
++                          [dst2]"+r"(dst2),
++                           [src]"+r"(src),
++                        [height]"+r"(height)
++                :  // Inputs
++                    [stride_dst]"r"(stride_dst)
++                :  // Clobbers
++                    "cc", "memory"
++            );
++            break;
++        default:
++            __asm__ volatile (
++                "subs        %[height], #8                     \n\t"
++                "vld1.8      {d0}, [%[src]]!                   \n\t"
++                "beq         2f                                \n\t"
++                "1:                                            \n\t"
++                "vld1.8      {d2}, [%[src]]!                   \n\t"
++                "vst1.8      {d0[0]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.8      {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d0[2]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.8      {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d0[4]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.8      {d0[5]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d0[6]}, [%[dst]], %[stride_dst]  \n\t"
++                "subs        %[height], #8                     \n\t"
++                "vst1.8      {d0[7]}, [%[dst2]], %[stride_dst] \n\t"
++                "beq         3f                                \n\t"
++                "vld1.8      {d0}, [%[src]]!                   \n\t"
++                "vst1.8      {d2[0]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.8      {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d2[2]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.8      {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d2[4]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.8      {d2[5]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d2[6]}, [%[dst]], %[stride_dst]  \n\t"
++                "subs        %[height], #8                     \n\t"
++                "vst1.8      {d2[7]}, [%[dst2]], %[stride_dst] \n\t"
++                "bne         1b                                \n\t"
++                "2:                                            \n\t"
++                "vst1.8      {d0[0]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.8      {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d0[2]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.8      {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d0[4]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.8      {d0[5]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d0[6]}, [%[dst]]                 \n\t"
++                "vst1.8      {d0[7]}, [%[dst2]]                \n\t"
++                "b           4f                                \n\t"
++                "3:                                            \n\t"
++                "vst1.8      {d2[0]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.8      {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d2[2]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.8      {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d2[4]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.8      {d2[5]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d2[6]}, [%[dst]]                 \n\t"
++                "vst1.8      {d2[7]}, [%[dst2]]                \n\t"
++                "4:                                            \n\t"
++                :  // Outputs
++                           [dst]"+r"(dst),
++                          [dst2]"+r"(dst2),
++                           [src]"+r"(src),
++                        [height]"+r"(height)
++                :  // Inputs
++                    [stride_dst]"r"(stride_dst)
++                :  // Clobbers
++                    "cc", "memory"
++            );
++            break;
++    }
++}
++
++static av_noinline void ff_hevc_rpi_copy_vert_v2v_neon(uint8_t *dst, const uint8_t *src,  // Copies `height` elements one at a time, reading them stride_src bytes apart
++                                                       int pixel_shift, int height,  // and writing them stride_dst bytes apart. pixel_shift 2 / 1 / anything else
++                                                       ptrdiff_t stride_dst, ptrdiff_t stride_src)  // selects 4-byte / 2-byte / 1-byte elements respectively.
++{  // NOTE: every loop below subtracts 2 up front and 2 per pass and exits only on exactly zero, so height has to be even and at least 4.
++    int x, y;  // scratch registers holding the two elements currently in flight
++    switch (pixel_shift)
++    {
++        case 2:  // 32-bit elements (ldr/str)
++            __asm__ volatile (
++                "ldr         %[x], [%[src]], %[stride_src] \n\t"  // prologue: x = element 0, src += stride_src
++                "ldr         %[y], [%[src]], %[stride_src] \n\t"  // y = element 1
++                "str         %[x], [%[dst]], %[stride_dst] \n\t"  // store element 0, dst += stride_dst
++                "sub         %[height], #2                 \n\t"  // account for the two elements already loaded
++                "1:                                        \n\t"  // main loop: two loads and two stores per pass
++                "ldr         %[x], [%[src]], %[stride_src] \n\t"  // load the next element while y is still pending
++                "str         %[y], [%[dst]], %[stride_dst] \n\t"  // store the element loaded one step earlier
++                "ldr         %[y], [%[src]], %[stride_src] \n\t"
++                "subs        %[height], #2                 \n\t"  // Z is set once every element has been loaded
++                "str         %[x], [%[dst]], %[stride_dst] \n\t"
++                "bne         1b                            \n\t"
++                "str         %[y], [%[dst]]                \n\t"  // epilogue: last element, no post-increment needed
++                :  // Outputs
++                             [x]"=&r"(x),  // early-clobber: written before the inputs have been consumed
++                             [y]"=&r"(y),
++                           [src]"+r"(src),  // advanced by the post-indexed loads
++                           [dst]"+r"(dst),  // advanced by the post-indexed stores
++                        [height]"+r"(height)  // counted down to zero
++                :  // Inputs
++                    [stride_src]"r"(stride_src),
++                    [stride_dst]"r"(stride_dst)
++                :  // Clobbers
++                    "cc", "memory"
++            );
++            break;
++        case 1:  // 16-bit elements: same schedule as case 2 using ldrh/strh
++            __asm__ volatile (
++                "ldrh        %[x], [%[src]], %[stride_src] \n\t"
++                "ldrh        %[y], [%[src]], %[stride_src] \n\t"
++                "strh        %[x], [%[dst]], %[stride_dst] \n\t"
++                "sub         %[height], #2                 \n\t"
++                "1:                                        \n\t"
++                "ldrh        %[x], [%[src]], %[stride_src] \n\t"
++                "strh        %[y], [%[dst]], %[stride_dst] \n\t"
++                "ldrh        %[y], [%[src]], %[stride_src] \n\t"
++                "subs        %[height], #2                 \n\t"
++                "strh        %[x], [%[dst]], %[stride_dst] \n\t"
++                "bne         1b                            \n\t"
++                "strh        %[y], [%[dst]]                \n\t"
++                :  // Outputs
++                             [x]"=&r"(x),
++                             [y]"=&r"(y),
++                           [src]"+r"(src),
++                           [dst]"+r"(dst),
++                        [height]"+r"(height)
++                :  // Inputs
++                    [stride_src]"r"(stride_src),
++                    [stride_dst]"r"(stride_dst)
++                :  // Clobbers
++                    "cc", "memory"
++            );
++            break;
++        default:  // 8-bit elements: same schedule as case 2 using ldrb/strb
++            __asm__ volatile (
++                "ldrb        %[x], [%[src]], %[stride_src] \n\t"
++                "ldrb        %[y], [%[src]], %[stride_src] \n\t"
++                "strb        %[x], [%[dst]], %[stride_dst] \n\t"
++                "sub         %[height], #2                 \n\t"
++                "1:                                        \n\t"
++                "ldrb        %[x], [%[src]], %[stride_src] \n\t"
++                "strb        %[y], [%[dst]], %[stride_dst] \n\t"
++                "ldrb        %[y], [%[src]], %[stride_src] \n\t"
++                "subs        %[height], #2                 \n\t"
++                "strb        %[x], [%[dst]], %[stride_dst] \n\t"
++                "bne         1b                            \n\t"
++                "strb        %[y], [%[dst]]                \n\t"
++                :  // Outputs
++                             [x]"=&r"(x),
++                             [y]"=&r"(y),
++                           [src]"+r"(src),
++                           [dst]"+r"(dst),
++                        [height]"+r"(height)
++                :  // Inputs
++                    [stride_src]"r"(stride_src),
++                    [stride_dst]"r"(stride_dst)
++                :  // Clobbers
++                    "cc", "memory"
++            );
++            break;
++    }
++}
++
++#define ff_hevc_rpi_copy_vert ff_hevc_rpi_copy_vert_neon  // make the generic name resolve to the implementation below
++static inline void ff_hevc_rpi_copy_vert_neon(uint8_t *dst, const uint8_t *src,  // Picks one of the three copy helpers by comparing each stride
++                                              int pixel_shift, int height,  // with the element size in bytes (1 << pixel_shift).
++                                              ptrdiff_t stride_dst, ptrdiff_t stride_src)
++{
++    if (stride_dst == 1 << pixel_shift)  // destination stride equals the element size: helper only needs stride_src
++        ff_hevc_rpi_copy_vert_v2h_neon(dst, src, pixel_shift, height, stride_src);
++    else if (stride_src == 1 << pixel_shift)  // source stride equals the element size: helper only needs stride_dst
++        ff_hevc_rpi_copy_vert_h2v_neon(dst, src, pixel_shift, height, stride_dst);
++    else  // neither stride equals the element size: general strided-to-strided copy
++        ff_hevc_rpi_copy_vert_v2v_neon(dst, src, pixel_shift, height, stride_dst, stride_src);
++}
++
++#endif /* HAVE_NEON_INLINE */
++
++#endif /* AVCODEC_ARM_RPI_HEVC_MISC_H */
+diff --git a/libavcodec/arm/rpi_hevc_mv_arm.h b/libavcodec/arm/rpi_hevc_mv_arm.h
+new file mode 100644
+index 0000000000..c73de55a48
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevc_mv_arm.h
+@@ -0,0 +1,64 @@
++#ifndef AVCODEC_ARM_RPI_HEVC_MV_H
++#define AVCODEC_ARM_RPI_HEVC_MV_H
++
++#if HAVE_ARMV6T2_INLINE
++static inline MvXY mvxy_add_arm(const MvXY a, const MvXY b)  // Returns a + b computed with a single SADD16 instruction.
++{
++    MvXY r;  // NOTE(review): assumes MvXY is a 32-bit value packing two 16-bit components - confirm against its declaration
++    __asm__ (
++        "sadd16    %[r], %[a], %[b]        \n\t"  // two independent signed 16-bit adds: low halfwords and high halfwords
++        : [r]"=r"(r)
++        : [a]"r"(a),
++          [b]"r"(b)
++        :
++        );
++    return r;
++}
++#define mvxy_add mvxy_add_arm  // make the generic name resolve to the version above
++#endif
++
++#if HAVE_ARMV6T2_INLINE
++#if (defined(__ARM_ARCH_EXT_IDIV__) || defined (__ARM_FEATURE_IDIV))
++static inline int32_t mv_scale_xy_arm(int32_t xy, int td, int tb)  // Scales both signed 16-bit halves of xy by a factor derived from tb and td and returns them repacked.
++{
++    int t;  // scratch: |td|, then the quotient term, then the scaled high half
++    __asm__ (
++    "ssat   %[td], #8,    %[td]          \n\t"  // td = clamp(td, -128, 127)
++    "ssat   %[tb], #8,    %[tb]          \n\t"  // tb = clamp(tb, -128, 127)
++    "eor    %[t],  %[td], %[td], asr #31 \n\t"  // t = td ^ (td >> 31)
++    "adds   %[t],  %[t],  %[td], lsr #31 \n\t"  // t = |td|; Z flag is set when td == 0
++    "asr    %[t],  #1                    \n\t"  // t = |td| / 2
++    "add    %[t],  #0x4000               \n\t"  // t = 16384 + |td| / 2
++    "it ne                               \n\t"
++    "sdivne %[t],  %[t],  %[td]          \n\t"  // t = t / td, skipped when td == 0 (flags from the adds above)
++    "mov    %[td], #32                   \n\t"  // rounding term for the >> 6 below
++    "smlabb %[td], %[t],  %[tb], %[td]   \n\t"  // td = t * tb + 32 (low halfwords of t and tb)
++    "ssat   %[td], #13,   %[td], asr #6  \n\t"  // td = clamp(td >> 6, -4096, 4095): the scale factor
++    "mov    %[tb], #127                  \n\t"  // rounding term for the >> 8 below
++    "smlatb %[t],  %[xy], %[td], %[tb]   \n\t"  // t  = (high half of xy) * scale + 127
++    "smlabb %[tb], %[xy], %[td], %[tb]   \n\t"  // tb = (low half of xy)  * scale + 127
++// This takes the sign of x & y for rounding at the "wrong" point
++// (i.e. after adding 127) but for the range of values (-1,-127)
++// where it does the wrong thing you get the right answer (0) anyway
++    "add    %[t],  %[t],  %[t],  lsr #31 \n\t"  // add 1 when the high-half sum is negative
++    "add    %[xy], %[tb], %[tb], lsr #31 \n\t"  // same for the low-half sum, result placed in xy
++    "ssat   %[t],  #16,   %[t],  asr #8  \n\t"  // t  = clamp(t  >> 8, -32768, 32767)
++    "ssat   %[xy], #16,   %[xy], asr #8  \n\t"  // xy = clamp(xy >> 8, -32768, 32767)
++    "pkhbt  %[xy], %[xy], %[t],  lsl #16 \n\t"  // repack: low half from xy, high half from t
++    :
++         [t]"=&r"(t),  // early-clobber scratch
++        [xy]"+r"(xy),  // read as input and overwritten with the result
++        [td]"+r"(td),  // reused as a temporary after the division
++        [tb]"+r"(tb)  // reused as a temporary after the scale factor is computed
++    :
++    :
++        "cc"
++    );
++    return xy;
++}
++#define mv_scale_xy mv_scale_xy_arm  // make the generic name resolve to the version above
++#endif
++#endif
++
++#endif // AVCODEC_ARM_RPI_HEVC_MV_H
++
 diff --git a/libavcodec/arm/rpi_hevcdsp_arm.h b/libavcodec/arm/rpi_hevcdsp_arm.h
 new file mode 100644
 index 0000000000..62b9326532
@@ -1945,10 +2471,10 @@ index 0000000000..62b9326532
 +#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */
 diff --git a/libavcodec/arm/rpi_hevcdsp_deblock_neon.S b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S
 new file mode 100644
-index 0000000000..98512d21dc
+index 0000000000..18a76a4112
 --- /dev/null
 +++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S
-@@ -0,0 +1,1625 @@
+@@ -0,0 +1,1633 @@
 +/*
 + * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
 + *
@@ -3042,22 +3568,24 @@ index 0000000000..98512d21dc
 +.endm
 +
 +
-+#if 1 // NEON version
++@ The NEON version is faster under ideal circumstances (i.e. everything in L1),
++@ but in real-world testing it is ~20% slower, presumably due to code size.
 +
++#if 0 // NEON version
 +
-+/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const MvField *curr, const MvField *neigh,
++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
 + *                                            const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
-+ *                                            int in_inc)
++ *                                            int in_inc0, int in_inc1)
 + */
 +function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1
 +        mov         ip, sp
-+        push        {a2,v1-v8,lr}
-+        ldm         ip, {v1-v5}
++        push        {a1-a3,v1-v8,lr}
++        ldm         ip, {v1-v6}
 +        cmp         a1, #2
 +        bls         2f
 +        vpush       {d8-d13}
 +        sub         v5, v5, #10
-+        mov         v6, #32
++        sub         v6, v6, #10
 +1:
 +        vld2.32     {d0[0], d2[0]}, [a3]!
 +        vld2.32     {d4[0], d6[0]}, [a4]!
@@ -3069,7 +3597,7 @@ index 0000000000..98512d21dc
 +        add         a2, v1, a2, lsl #2
 +        vld1.8      {d24[0]}, [a3], v5
 +        add         ip, v3, ip, lsl #2
-+        vld1.8      {d25[0]}, [a4], v5
++        vld1.8      {d25[0]}, [a4], v6
 +        add         v8, v2, v8, lsl #2
 +        vld1.32     {d16[0]}, [a2]
 +        add         lr, v4, lr, lsl #2
@@ -3089,7 +3617,7 @@ index 0000000000..98512d21dc
 +        add         a2, v1, a2, lsl #2
 +        vld1.8      {d24[2]}, [a3], v5
 +        add         ip, v3, ip, lsl #2
-+        vld1.8      {d25[2]}, [a4], v5
++        vld1.8      {d25[2]}, [a4], v6
 +        add         v8, v2, v8, lsl #2
 +        vld1.32     {d16[1]}, [a2]
 +        add         lr, v4, lr, lsl #2
@@ -3106,7 +3634,7 @@ index 0000000000..98512d21dc
 +        add         a2, v1, a2, lsl #2
 +        vld1.8      {d24[4]}, [a3], v5
 +        add         ip, v3, ip, lsl #2
-+        vld1.8      {d25[4]}, [a4], v5
++        vld1.8      {d25[4]}, [a4], v6
 +        add         v8, v2, v8, lsl #2
 +        vld1.32     {d17[0]}, [a2]
 +        add         lr, v4, lr, lsl #2
@@ -3123,7 +3651,7 @@ index 0000000000..98512d21dc
 +        add         a2, v1, a2, lsl #2
 +        vld1.8      {d24[6]}, [a3], v5
 +        add         ip, v3, ip, lsl #2
-+        vld1.8      {d25[6]}, [a4], v5
++        vld1.8      {d25[6]}, [a4], v6
 +        add         v8, v2, v8, lsl #2
 +        vld1.32     {d17[1]}, [a2]
 +        add         lr, v4, lr, lsl #2
@@ -3150,16 +3678,16 @@ index 0000000000..98512d21dc
 +        vtst.16     d30, d25, d12
 +        vtst.16     d31, d25, d13
 +        veor        d26, d8, d9
-+          ldr         lr, [sp, 6*8]
++          ldr         lr, [sp, 6*8 + 1*4]
 +        vmovl.s16   q4, d28
 +        vmovl.s16   q5, d29
 +          teq         lr, #1
 +        vmovl.s16   q14, d30
-+        it ne
-+        lslne       v1, lr, #1
++          it ne
++          lslne       v1, lr, #1
 +        vmovl.s16   q15, d31
-+        it ne
-+        rsbne       v2, v1, #32
++          it ne
++          rsbne       v2, v1, #32
 +        vbif        q0, q1, q4
 +        vbif        q2, q3, q14
 +        vbif        q1, q0, q5
@@ -3212,7 +3740,6 @@ index 0000000000..98512d21dc
 +        vmov        v8, s1
 +        vmov.u16    ip, d0[1]
 +        vmov.u16    lr, d0[3]
-+        sub         v6, #8
 +        lsl         a2, #30
 +        lsl         v8, #30
 +        lsl         ip, #30
@@ -3224,9 +3751,12 @@ index 0000000000..98512d21dc
 +        orr         v7, a2, v7, lsr #8
 +        bhi         1b
 +
++        mov         a1, #32
++        ldr         a3, [sp, #6*8]
 +        vpop        {d8-d13}
-+        mov         a1, v7, lsr v6
-+        pop         {a2,v1-v8,pc}
++        sub         a1, a1, a3, lsl #1
++        mov         a1, v7, lsr a1
++        pop         {a2-a4,v1-v8,pc}
 +10:
 +        @ Merge results into result word, with duplicates
 +        vmul.i16    d0, d1
@@ -3234,13 +3764,12 @@ index 0000000000..98512d21dc
 +        vmov        v8, s1
 +        vmov.u16    ip, d0[1]
 +        vmov.u16    lr, d0[3]
-+        sub         v6, v6, v1, lsl #2
 +        lsl         a2, v2
 +        subs        a1, #4
 +        lsl         v8, v2
 +        lsl         ip, v2
 +        lsl         lr, v2
-+        ldr         v2, [sp, #6*8 + 10*4 + 1*4]
++        ldr         v2, [sp, #6*8 + 12*4 + 1*4]
 +T       lsr         a2, v1
 +T       orr         a2, ip, a2
 +A       orr         a2, ip, a2, lsr v1
@@ -3252,19 +3781,24 @@ index 0000000000..98512d21dc
 +T       lsr         a2, ip
 +T       orr         a2, v8, a2
 +A       orr         a2, v8, a2, lsr ip
-+        ldr         v1, [sp, #6*8 + 10*4]
++        ldr         v1, [sp, #6*8 + 12*4]
 +T       lsr         v7, lr
 +T       orr         v7, a2, v7
 +A       orr         v7, a2, v7, lsr lr
 +        bhi         1b
 +
++        mov         a1, #32
++        ldrd        a3, a4, [sp, #6*8]
 +        vpop        {d8-d13}
-+        mov         a1, v7, lsr v6
-+        pop         {a2,v1-v8,pc}
++        mls         a1, a3, a4, a1
++        mls         a1, a3, a4, a1
++        mov         a1, v7, lsr a1
++        pop         {a2-a4,v1-v8,pc}
 +
 +
 +2:
 +        sub         v5, v5, #10
++        sub         v6, v6, #10
 +        vmov.u8     d16, #0
 +        blo         3f
 +        vld2.32     {d0[0], d1[0]}, [a3]!
@@ -3276,7 +3810,7 @@ index 0000000000..98512d21dc
 +        add         a2, v1, a2, lsl #2
 +        vld1.8      {d16[0]}, [a3], v5
 +        add         ip, v3, ip, lsl #2
-+        vld1.8      {d16[4]}, [a4], v5
++        vld1.8      {d16[4]}, [a4], v6
 +        add         v8, v2, v8, lsl #2
 +        vld1.32     {d4[0]}, [a2]
 +        add         lr, v4, lr, lsl #2
@@ -3297,7 +3831,7 @@ index 0000000000..98512d21dc
 +        add         a2, v1, a2, lsl #2
 +        vld1.8      {d16[2]}, [a3], v5
 +        add         ip, v3, ip, lsl #2
-+        vld1.8      {d16[6]}, [a4], v5
++        vld1.8      {d16[6]}, [a4], v6
 +        add         v8, v2, v8, lsl #2
 +        vld1.32     {d4[1]}, [a2]
 +        add         lr, v4, lr, lsl #2
@@ -3321,12 +3855,12 @@ index 0000000000..98512d21dc
 +        vtst.16     d22, d16, d18
 +        vadd.i16    d30, d16, d17
 +        vswp        d2, d3
-+        ldr         lr, [sp]
++        ldr         lr, [sp, #1*4]
 +        vmovl.s16   q10, d20
-+        teq         lr, #1
++          teq         lr, #1
 +        vmovl.s16   q11, d22
-+        it ne
-+        lslne       v1, lr, #1
++          it ne
++          lslne       v1, lr, #1
 +        vbif        d0, d1, d20
 +        vbif        d4, d6, d20
 +        vbif        d3, d2, d21
@@ -3352,8 +3886,8 @@ index 0000000000..98512d21dc
 +        vshrn.i32   d7, q11, #8
 +        vmovn.i32   d3, q10
 +        vand        q0, q3, q1
-+        it ne
-+        rsbne       v2, v1, #32
++          it ne
++          rsbne       v2, v1, #32
 +        vrev16.8    q3, q3
 +        vand        q0, q3
 +        vsra.u64    d30, #32
@@ -3372,7 +3906,7 @@ index 0000000000..98512d21dc
 +        vmov.u16    a2, d0[0]
 +        it eq
 +        orreq       a1, a2, a1, lsl #2
-+        pop         {a2,v1-v8,pc}
++        pop         {a2-a4,v1-v8,pc}
 +10:
 +        @ Construct result word, with duplicates
 +        cmp         a1, #2
@@ -3387,7 +3921,7 @@ index 0000000000..98512d21dc
 +T       lsleq       a1, v1
 +T       orreq       a1, a2, a1
 +A       orreq       a1, a2, a1, lsl v1
-+        pop         {a2,v1-v8,pc}
++        pop         {a2-a4,v1-v8,pc}
 +endfunc
 +
 +
@@ -3395,9 +3929,9 @@ index 0000000000..98512d21dc
 +#else // non-NEON version
 +
 +
-+/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const MvField *curr, const MvField *neigh,
++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
 + *                                            const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
-+ *                                            int in_inc)
++ *                                            int in_inc0, int in_inc1)
 + */
 +function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1
 +        add         ip, sp, #4*4
@@ -3447,12 +3981,12 @@ index 0000000000..98512d21dc
 +T       orr         v7, v5, v7
 +        bhi         11b
 +
-+        ldr         v5, [sp, #16*4]
-+        add         ip, sp, #16*4
++        ldrd        v3, v4, [sp, #16*4]
 +        ldr         a2, [sp]
++        add         ip, sp, #16*4
 +        subs        a1, a1, #1
-+        add         a3, a3, v5
-+        add         a4, a4, v5
++        add         a3, a3, v3
++        add         a4, a4, v4
 +        bhi         1b
 +        mov         a1, v7, lsr v6
 +        pop         {a2-a4,v1-v8,pc}
@@ -3803,7 +4337,7 @@ index 0000000000..109fa98c29
 +}
 diff --git a/libavcodec/arm/rpi_hevcdsp_init_neon.c b/libavcodec/arm/rpi_hevcdsp_init_neon.c
 new file mode 100644
-index 0000000000..8a94a644a4
+index 0000000000..9294ab8010
 --- /dev/null
 +++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c
 @@ -0,0 +1,467 @@
@@ -4038,9 +4572,9 @@ index 0000000000..8a94a644a4
 +                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
 +
 +
-+uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const MvField *curr, const MvField *neigh,
++uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh,
 +                                                const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
-+                                                int in_inc);
++                                                int in_inc0, int in_inc1);
 +void ff_hevc_rpi_cpy_blks8x4_neon(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height);
 +
 +
@@ -4268,9 +4802,9 @@ index 0000000000..8a94a644a4
 +#endif
 +    }
 +
-+    assert(offsetof(MvField, mv) == 0);
-+    assert(offsetof(MvField, ref_idx) == 8);
-+    assert(offsetof(MvField, pred_flag) == 10);
++    assert(offsetof(HEVCRpiMvField, mv) == 0);
++    assert(offsetof(HEVCRpiMvField, ref_idx) == 8);
++    assert(offsetof(HEVCRpiMvField, pred_flag) == 10);
 +    c->hevc_deblocking_boundary_strengths = ff_hevc_rpi_deblocking_boundary_strengths_neon;
 +    c->cpy_blk = ff_hevc_rpi_cpy_blks8x4_neon;
 +}
@@ -8133,10 +8667,10 @@ index 0000000000..21e7700174
 +
 diff --git a/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S
 new file mode 100644
-index 0000000000..ebf12e8684
+index 0000000000..3dd9246a16
 --- /dev/null
 +++ b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S
-@@ -0,0 +1,2973 @@
+@@ -0,0 +1,2975 @@
 +/*
 + * Copyright (c) 2018 John Cox <jc@kynesim.co.uk> (for Raspberry Pi)
 + *
@@ -9698,6 +10232,8 @@ index 0000000000..ebf12e8684
 +        @ Standard sign
 +        .byte     2,   5,   9,  13,  17,  21,  26,  32
 +
++        .balign   2
++
 +        @ Sign inverted from standards table
 +inv_angle:
 +        .short   4096, 1638,  910,  630,  482,  390,  315
@@ -11813,10 +12349,10 @@ index 0000000000..75a1789c25
 +
 diff --git a/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S
 new file mode 100644
-index 0000000000..6ce3d3ca8d
+index 0000000000..7ea82b38fe
 --- /dev/null
 +++ b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S
-@@ -0,0 +1,872 @@
+@@ -0,0 +1,902 @@
 +/*
 + * Copyright (c) 2018 John Cox <jc@kynesim.co.uk> (for Raspberry Pi)
 + *
@@ -11887,8 +12423,6 @@ index 0000000000..6ce3d3ca8d
 +.equ    AVAIL_S_UL_N_L_C, 32 - 3
 +.equ    AVAIL_S_L_N_DL_C, 32 - 4
 +
-+.equ    AVAIL_S_U_DL_CPSR, 31 - 4  @ Shift for u..dl to go into flags via cpsr
-+
 +@ On entry
 +@  r2   req
 +@  r3   avail
@@ -11908,77 +12442,78 @@ index 0000000000..6ce3d3ca8d
 +@ If UR avail then d_ur == a_ur so U-filter good too
 +@
 +@ Data load pointers (only load if req & avail):
-+@  r4   DL + stride
-+@  r10  L
-+@  r6   U
-+@  r5   UR
++@  r8   DL + stride
++@  r6   L
++@  r7   U
++@  r4   UR
 +@
 +@ Others:
-+@  r2   req
-+@  r7   req & avail
-+@  r3   L + stride
-+@  r8   DL + stride * 2
-+@  r9   stride * 2
++@  r2   req (if preserve_req)
++@  r3   req & avail (if preserve_req)
++@  r2   req & avail (if !preserve_req)
++@  r10  L + stride
++@  r5   DL + stride * 2
++@  r12  stride * 2
 +@  cs   Load U
 +@  mi   Load UR
 +@
 +@ Clobbered:
-+@  r12
-+
-+.macro  load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur
-+
-+.equ    src_l\@,   \sp_offset + 0
-+.equ    src_u\@,   \sp_offset + 4
-+.equ    src_ur\@,  \sp_offset + 8
-+.equ    stride\@,  \sp_offset + 12
-+.equ    pw\@,      (1 << \pw_s)                 @ pel width in bytes
-+.equ    b_size\@,  (1 << (\pw_s + \log2_s))     @ size in bytes
-+
-+@ r9    stride
-+@                       r7 = ab_ul, r6 = a_u, r5 = a_ur
-+@ r4 = b_dl, r10 = b_l,             r8 = b_u
-+
-+        ldr        r5,  [sp, #src_ur\@]
-+        lsl        r12, r3,  #AVAIL_S_U_DL_CPSR
-+        ldr        r10, [sp, #src_l\@]
-+        ldr        r9,  [sp, #stride\@]
-+        ldr        r6,  [sp, #src_u\@]
-+
-+        @ This is quite a slow instruction but it replaces
-+        @ a decent number of tests that yield a max of 2 flags/op
-+        @ It is annoying we can't branch on Q!
-+        @ If L navail (ne) then DL must be navail (pl)
-+        msr        APSR_nzcvq, r12      @ n=dl, z=l, c=ul, v=u, q=ur
-+
-+        mov        r4,  r5
-+        sub        r7,  r10, r9
-+        it vs
-+        movvs      r4,  r6
-+        add        r8,  r6,  #b_size\@ - pw\@
-+        it cs
-+        movcs      r4,  r7
-+        ite ne
-+        movne      r10, r4
-+        addeq      r4,  r7,  r9,  lsl #\log2_s
-+        it cc
-+        movcc      r7,  r10
-+        it mi
-+        addmi      r4,  r10, r9,  lsl #\log2_s
-+        vld1.\d_type {\d_ul}, [r7]
-+        itt vc
-+        movvc      r8,  r7
-+        movvc      r6,  r7
-+        vld1.\d_type {\d_l }, [r4], r9
-+        tst        r3,  #AVAIL_UR
-+        vld1.\d_type {\d_u }, [r6]
-+        it eq
-+        moveq      r5,  r8
-+        and        r7,  r2,  r3
-+        add        r8,  r4,  r9
-+        vld1.\d_type {\d_ur}, [r5]
-+        lsls       r12, r7,  #AVAIL_S_UR_N_U_C
-+        add        r3,  r10, r9
-+        lsl        r9,  #1
++@  r9, lr
++
++.macro  load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur, preserve_req, I1, I2
++
++.equ    src_l,   \sp_offset + 0
++.equ    src_u,   \sp_offset + 4
++.equ    src_ur,  \sp_offset + 8
++.equ    stride,  \sp_offset + 12
++.equ    pw,      (1 << \pw_s)                 @ pel width in bytes
++.equ    b_size,  (1 << (\pw_s + \log2_s))     @ size in bytes
++
++        ldrd        r4, r5, [sp, #src_ur] @ and stride
++        ldrd        r6, r7, [sp, #src_l]  @ and src_u
++        lsls        lr, r3, #AVAIL_S_U_N_UL_C
++        mov         r8, r4
++        sub         r9, r6, r5
++        it          mi
++        movmi       r8, r7
++        it          cs
++        movcs       r8, r9
++        lsls        lr, r3, #AVAIL_S_L_N_DL_C
++        ite         pl
++        movpl       r6, r8
++        addmi       r8, r9, r5, lsl #\log2_s
++        it          cs
++        addcs       r8, r6, r5, lsl #\log2_s
++        .if !\preserve_req
++        and         r2, r2, r3
++        .endif
++        add         r10, r6, r5
++        lsl         r12, r5, #1
++        lsls        lr, r3, #AVAIL_S_U_N_UL_C
++        it          cc
++        movcc       r9, r6
++        vld1.\d_type {\d_l}, [r8], r5
++        add         lr, r7, #b_size - pw
++        add         r5, r8, r5
++        itt         pl
++        movpl       lr, r9
++        movpl       r7, r9
++        tst         r3, #AVAIL_UR
++        vld1.\d_type {\d_ul}, [r9]
++        it          eq
++        moveq       r4, lr
++        \I1
++        .if \preserve_req
++        and         r3, r2, r3
++        .else
++        lsls        lr, r2, #AVAIL_S_UR_N_U_C
++        .endif
++        vld1.\d_type {\d_u}, [r7]
++        \I2
++        vld1.\d_type {\d_ur}, [r4]
++        .if \preserve_req
++        lsls        lr, r3, #AVAIL_S_UR_N_U_C
++        .endif
 +.endm
 +
 +
@@ -12001,33 +12536,33 @@ index 0000000000..6ce3d3ca8d
 +.set    log2_s,  2
 +
 +function ff_hevc_rpi_intra_filter_4_neon_8, export=1
-+        push       {r4-r10, lr}
-+        load_pointers pw_s, log2_s, sp_base, 8, d0[], d31[7], d1[], d2[]
-+
-+        it cs
-+        vldrcs     s2,  [r6]
-+        ite pl
-+        vmovpl     s3,  s4
-+        vldrmi     s3,  [r5]
-+
-+        lsls       r7,  #AVAIL_S_L_N_DL_C
-+        add        r12, r0,  #-pw
-+        bpl        1f
++        push        {r4-r10, lr}
++        load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d2[0], d3[], d4[], 0
 +
-+        vld1.8    {d0[0]}, [r10], r9
-+        vld1.8    {d0[1]}, [r3],  r9
-+        vld1.8    {d0[2]}, [r10]
-+        vld1.8    {d0[3]}, [r3]
++        sub         r3, r0, #pw
++        it          mi
++        vldrmi      s7, [r4]
++        it          cs
++        vldrcs      s6, [r7]
++        it          pl
++        vmovpl.f32  s7, s8
++        lsls        lr, r2, #AVAIL_S_L_N_DL_C
++        bpl         1f
++        vld1.8      {d0[0]}, [r6], r12
++        vld1.8      {d1[0]}, [r10], r12
++        vld1.8      {d0[1]}, [r6]
++        vld1.8      {d1[1]}, [r10]
 +1:
-+        bcc        1f
-+        vld1.8    {d0[5]}, [r4],  r9
-+        vld1.8    {d0[6]}, [r8]
-+        vld1.8    {d0[7]}, [r4]
++        bcc         1f
++        vld1.8      {d1[2]}, [r8], r12
++        vld1.8      {d0[3]}, [r5]
++        vld1.8      {d1[3]}, [r8]
 +1:
-+        vstr       d1,  [r1]            @ Up
-+        vst1.8    {d31[7]}, [r12]
-+        vstr       d0,  [r0]            @ Left
-+        pop       {r4-r10, pc}
++        vst1.8      {d2[0]}, [r3]
++        vst1.8      {d3}, [r1]
++        vzip.8      d0, d1
++        vst1.8      {d0}, [r0]
++        pop         {r4-r10, pc}
 +endfunc
 +
 +
@@ -12049,30 +12584,31 @@ index 0000000000..6ce3d3ca8d
 +.set    log2_s,  2
 +
 +function ff_hevc_rpi_intra_filter_4_neon_16, export=1
-+        push       {r4-r10, lr}
-+        load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], d2[], d3[]
-+
-+        it cs
-+        vldrcs     d2,  [r6]
-+        it mi
-+        vldrmi     d3,  [r5]
-+        lsls       r7,  #AVAIL_S_L_N_DL_C
-+        add        r12, r0, #-pw
-+        bpl        1f
-+        vld1.16   {d0[0]}, [r10], r9
-+        vld1.16   {d0[1]}, [r3],  r9
-+        vld1.16   {d0[2]}, [r10]
-+        vld1.16   {d0[3]}, [r3]
++        push        {r4-r10, lr}
++        load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d2[0], d3[], d4[], 0
++
++        sub         r3, r0, #pw
++        it          mi
++        vldrmi      d4, [r4]
++        it          cs
++        vldrcs      d3, [r7]
++        lsls        lr, r2, #AVAIL_S_L_N_DL_C
++        bpl         1f
++        vld1.16     {d0[0]}, [r6], r12
++        vld1.16     {d1[0]}, [r10], r12
++        vld1.16     {d0[1]}, [r6]
++        vld1.16     {d1[1]}, [r10]
 +1:
-+        bcc        1f
-+        vld1.16   {d1[1]}, [r4],  r9
-+        vld1.16   {d1[2]}, [r8]
-+        vld1.16   {d1[3]}, [r4]
++        bcc         1f
++        vld1.16     {d1[2]}, [r8], r12
++        vld1.16     {d0[3]}, [r5]
++        vld1.16     {d1[3]}, [r8]
 +1:
-+        vst1.16   {q1}, [r1]           @ Up
-+        vst1.16   {d31[3]}, [r12]
-+        vst1.16   {q0}, [r0]           @ Left
-+        pop       {r4-r10, pc}
++        vst1.16     {d2[0]}, [r3]
++        vst1.16     {d3, d4}, [r1]
++        vzip.16     d0, d1
++        vst1.16     {q0}, [r0]
++        pop         {r4-r10, pc}
 +endfunc
 +
 +
@@ -12094,72 +12630,69 @@ index 0000000000..6ce3d3ca8d
 +.set    log2_s,  3
 +
 +function ff_hevc_rpi_intra_filter_8_neon_8, export=1
-+        push      {r4-r10, lr}
-+        load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d31[7], d4[], d5[]
-+
-+        it cs
-+        vldrcs     d4,  [r6]
-+        it mi
-+        vldrmi     d5,  [r5]
++        push        {r4-r10, lr}
++        load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d3[7], d4[], d5[], 1
 +
-+        lsls       r7,  #AVAIL_S_L_N_DL_C
-+        bpl        1f
-+        vld1.8    {d0[0]}, [r10], r9
-+        vld1.8    {d0[1]}, [r3],  r9
-+        vld1.8    {d0[2]}, [r10], r9
-+        vld1.8    {d0[3]}, [r3],  r9
-+        vld1.8    {d0[4]}, [r10], r9
-+        vld1.8    {d0[5]}, [r3],  r9
-+        vld1.8    {d0[6]}, [r10]
-+        vld1.8    {d0[7]}, [r3]
++        it          mi
++        vldrmi      d5, [r4]
++        sub         r0, #pw
++        it          cs
++        vldrcs      d4, [r7]
++        lsls        lr, r3, #AVAIL_S_L_N_DL_C
++        bpl         1f
++        vld1.8      {d0[0]}, [r6], r12
++        vld1.8      {d1[0]}, [r10], r12
++        vld1.8      {d0[1]}, [r6], r12
++        vld1.8      {d1[1]}, [r10], r12
++        vld1.8      {d0[2]}, [r6], r12
++        vld1.8      {d1[2]}, [r10], r12
++        vld1.8      {d0[3]}, [r6]
++        vld1.8      {d1[3]}, [r10]
 +1:
-+        bcc        1f
-+        vld1.8    {d1[1]}, [r4],  r9
-+        vld1.8    {d1[2]}, [r8],  r9
-+        vld1.8    {d1[3]}, [r4],  r9
-+        vld1.8    {d1[4]}, [r8],  r9
-+        vld1.8    {d1[5]}, [r4],  r9
-+        vld1.8    {d1[6]}, [r8]
-+        vld1.8    {d1[7]}, [r4]
++        bcc         1f
++        vld1.8      {d1[4]}, [r8], r12
++        vld1.8      {d0[5]}, [r5], r12
++        vld1.8      {d1[5]}, [r8], r12
++        vld1.8      {d0[6]}, [r5], r12
++        vld1.8      {d1[6]}, [r8], r12
++        vld1.8      {d0[7]}, [r5], r12
++        vld1.8      {d1[7]}, [r8], r12
 +1:
-+        tst        r2,  #FILTER_LIGHT
-+        add        r12, r0,  #-pw
-+        beq        10f
++        vext.8      q3, q1, q2, #15
++        vmov.u8     r4, d5[7]           @ Save final pel
++        tst         r2, #FILTER_LIGHT
++        vzip.8      d0, d1
++        beq         1f
 +
 +        @ Luma light filter
-+        vext.8     q8,  q15, q2,  #15
-+        vext.8     q12, q15, q0,  #15
-+        vaddl.u8   q9,  d17, d5
-+        vaddl.u8   q8,  d16, d4
-+        vaddl.u8   q13, d25, d1
-+        vaddl.u8   q12, d24, d0
-+        vmov.u8    r3,  d5[7]           @ Save final pel
-+        vmov.u8    r2,  d1[7]           @ Save final pel
-+
-+        vext.16    q2,  q8,  q9,  #1
-+        vext.16    q3,  q9,  q9,  #1
-+        vext.16    q0,  q12, q13, #1
-+        vext.16    q1,  q13, q13, #1
-+        vadd.u16   d30, d16, d24        @ d30[0] = l[0] + 2ul + u[0]
-+        vadd.u16   q2,  q8
-+        vadd.u16   q3,  q9
-+        vadd.u16   q0,  q12
-+        vadd.u16   q1,  q13
-+
-+        vrshrn.u16 d4,  q2,  #2
-+        vrshrn.u16 d5,  q3,  #2
-+        vrshrn.u16 d0,  q0,  #2
-+        vrshrn.u16 d1,  q1,  #2
-+        vrshr.u16  d30, #2
-+        vmov.u8    d5[7], r3            @ Restore final pel
-+        vmov.u8    d1[7], r2            @ Restore final pel
-+        vdup.u8    d31, d30[0]          @ d31[3] = d30[0]
-+
-+10:
-+        vst1.8    {q2 }, [r1]           @ Up
-+        vst1.8    {d31[7]}, [r12]       @ Up-left
-+        vst1.8    {q0 }, [r0]           @ Left
-+        pop       {r4-r10, pc}
++        vaddl.u8    q8, d7, d5
++        vext.8      q1, q1, q0, #15
++        vaddl.u8    q2, d6, d4
++        vaddl.u8    q3, d3, d1
++        vaddl.u8    q9, d2, d0
++        vext.16     q10, q8, q8, #1
++        vext.16     q11, q3, q3, #1
++        vadd.u16    q10, q8
++        vadd.u16    q11, q3
++        vadd.u16    d2, d4, d18         @ d2[0] = l[0] + 2ul + u[0]
++        vmov.u8     r5, d1[7]           @ Save final pel
++        vext.16     q0, q2, q8, #1
++        vext.16     q3, q9, q3, #1
++        vadd.u16    q8, q0, q2
++        vadd.u16    q3, q9
++        vrshrn.u16  d5, q10, #2
++        vrshrn.u16  d1, q11, #2
++        vrshr.u16   d2, #2
++        vrshrn.u16  d4, q8, #2
++        vrshrn.u16  d0, q3, #2
++        vmov.8      d5[7], r4           @ Restore final pel
++        vmov.8      d1[7], r5           @ Restore final pel
++        vdup.8      d3, d2[0]
++1:
++        vst1.8      {d3[7]}, [r0]!
++        vst1.8      {q2}, [r1]
++        vst1.8      {q0}, [r0]
++        pop         {r4-r10, pc}
 +endfunc
 +
 +
@@ -12184,85 +12717,89 @@ index 0000000000..6ce3d3ca8d
 +.set    p_size,  (1 << log2_s)          @ size in pels
 +
 +function ff_hevc_rpi_intra_filter_8_neon_16, export=1
-+        push      {r4-r10, lr}
-+        load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d4[],d5[]", "d6[],d7[]"
++        push        {r4-r10, lr}
++        load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d5[3], "d16[],d17[]", "d18[],d19[]", 1, \
++            "ldr         r9, [sp, #ur_size]", \
++            "sub         r0, #pw"
 +
-+        it cs
-+        vldmcs     r6,  {d4, d5}
-+        ldr        r12, [sp, #ur_size]
-+        bpl        1f
-+        cmp        r12, #4
-+        vldm       r5,  {d6, d7}
-+        bgt        1f
-+        vdup.16    d7,  d6[3]
-+1:
-+        lsls       r12, r7,  #AVAIL_S_L_N_DL_C
-+        vdup.16    q1,  d0[0]
-+        bpl        1f
-+        vld1.16   {d0[0]}, [r10], r9
-+        vld1.16   {d0[1]}, [r3],  r9
-+        vld1.16   {d0[2]}, [r10], r9
-+        vld1.16   {d0[3]}, [r3],  r9
-+        vld1.16   {d1[0]}, [r10], r9
-+        vld1.16   {d1[1]}, [r3],  r9
-+        vld1.16   {d1[2]}, [r10]
-+        vld1.16   {d1[3]}, [r3]
++        vmov        q1, q0
++        ldrh        lr, [r4, #3*2]
++        it          mi
++        vldmmi      r4, {d18, d19}
++        it          cs
++        vldmcs      r7, {d16, d17}
++        itt         mi
++        cmpmi       r9, #p_size
++        vdupmi.16   d19, lr
++        lsls        lr, r3, #AVAIL_S_L_N_DL_C
++        bpl         1f
++        vld1.16     {d0[0]}, [r6], r12
++        vld1.16     {d2[0]}, [r10], r12
++        vld1.16     {d0[1]}, [r6], r12
++        vld1.16     {d2[1]}, [r10], r12
++        vld1.16     {d0[2]}, [r6], r12
++        vld1.16     {d2[2]}, [r10], r12
++        vld1.16     {d0[3]}, [r6]
++        vld1.16     {d2[3]}, [r10]
 +1:
-+        bcc        1f
-+        ldr        r12, [sp, #dl_size]
-+        vld1.16   {d2[1]}, [r4],  r9
-+        cmp        r12, #p_size
-+        vld1.16   {d2[2]}, [r8],  r9
-+        vld1.16   {d2[3]}, [r4],  r9
-+        blt        2f
-+        vld1.16   {d3[0]}, [r8],  r9
-+        vld1.16   {d3[1]}, [r4],  r9
-+        vld1.16   {d3[2]}, [r8]
-+        vld1.16   {d3[3]}, [r4]
-+        b          1f
++        ldr         lr, [sp, #dl_size]
++        bcc         2f
++        vld1.16     {d3[0]}, [r8], r12
++        vld1.16     {d1[1]}, [r5], r12
++        cmp         lr, #p_size
++        vld1.16     {d3[1]}, [r8], r12
++        bcc         10f
++        vld1.16     {d1[2]}, [r5], r12
++        vld1.16     {d3[2]}, [r8], r12
++        vld1.16     {d1[3]}, [r5]
++        vld1.16     {d3[3]}, [r8]
 +2:
-+        vdup.16    d3,  d2[3]
-+1:
-+        tst        r2,  #FILTER_LIGHT
-+        add        r12, r0,  #-pw
-+        beq        10f
++        vext.16     q3, q8, q9, #7
++        vext.16     q10, q2, q8, #7
++        tst         r2, #FILTER_LIGHT
++        vzip.16     q0, q1
++        beq         3f
 +
 +        @ Luma light filter
-+        vext.16    q9,  q2,  q3,  #7
-+        vext.16    q8,  q15, q2,  #7
-+        vext.16    q13, q0,  q1,  #7
-+        vext.16    q12, q15, q0,  #7
-+        vadd.u16   q9,  q3
-+        vadd.u16   q8,  q2
-+        vadd.u16   q13, q1
-+        vadd.u16   q12, q0
-+        vmov.u16   r3,  d7[3]           @ Save final pel
-+        vmov.u16   r2,  d3[3]           @ Save final pel
-+
-+        vext.16    q2,  q8,  q9,  #1
-+        vext.16    q3,  q9,  q9,  #1
-+        vext.16    q0,  q12, q13, #1
-+        vext.16    q1,  q13, q13, #1
-+        vadd.u16   d30, d16, d24        @ d30[0] = l[0] + 2ul + u[0]
-+        vadd.u16   q2,  q8
-+        vadd.u16   q3,  q9
-+        vadd.u16   q0,  q12
-+        vadd.u16   q1,  q13
-+
-+        vrshr.u16  q2,  #2
-+        vrshr.u16  q3,  #2
-+        vrshr.u16  q0,  #2
-+        vrshr.u16  q1,  #2
-+        vrshr.u16  d30, #2
-+        vmov.u16   d7[3], r3            @ Restore final pel
-+        vmov.u16   d3[3], r2            @ Restore final pel
-+        vdup.u16   d31, d30[0]          @ d31[3] = d30[0]
++        vadd.i16    q3, q9
++        vext.16     q11, q0, q1, #7
++        vext.16     q2, q2, q0, #7
++        vadd.i16    q8, q10
++        vadd.i16    q10, q11, q1
++        vadd.i16    q0, q2
++        vext.16     q11, q3, q3, #1
++        vadd.i16    d4, d16, d0         @ d4[0] = l[0] + 2ul + u[0]
++        vmov.u16    r4, d19[3]          @ Save final pel
++        vext.16     q9, q10, q10, #1
++        vext.16     q12, q8, q3, #1
++        vext.16     q13, q0, q10, #1
++        vadd.i16    q3, q11
++        vadd.i16    q10, q9
++        vadd.i16    q8, q12
++        vadd.i16    q0, q13
++        vmov.u16    r5, d3[3]           @ Save final pel
++        vrshr.u16   d4, d4, #2
++        vrshr.u16   q9, q3, #2
++        vrshr.u16   q1, q10, #2
++        vrshr.u16   q8, #2
++        vrshr.u16   q0, #2
++        vmov.16     d19[3], r4          @ Restore final pel
++        vmov.16     d3[3], r5           @ Restore final pel
++        vdup.16     d5, d4[0]
++3:
++        vst1.16     {d5[3]}, [r0]!
++        vst1.16     {q8-q9}, [r1]
++        vst1.16     {q0-q1}, [r0]
++        pop         {r4-r10, pc}
 +
 +10:
-+        vst1.16   {q2,  q3}, [r1]       @ Up
-+        vst1.16   {d31[3]}, [r12]       @ Up-left
-+        vst1.16   {q0,  q1}, [r0]       @ Left
-+        pop       {r4-r10, pc}
++A       ldrh        r9, [r8, -r12]
++T       sub         r9, r8, r12
++T       ldrh        r9, [r9]
++        orr         r9, r9, r9, lsl #16
++        vmov.32     d1[1], r9
++        vmov.32     d3[1], r9
++        b           2b
 +endfunc
 +
 +@ int ff_hevc_rpi_intra_filter_16_neon_16(
@@ -12286,152 +12823,163 @@ index 0000000000..6ce3d3ca8d
 +.set    p_size,  (1 << log2_s)          @ size in pels
 +
 +function ff_hevc_rpi_intra_filter_16_neon_16, export=1
-+        push      {r4-r10, lr}
-+        load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d16[],d17[]", "d20[],d21[]"
-+
-+        vdup.16    q9,  d16[0]
-+        vdup.16    q11, d20[0]
-+
-+        it cs
-+        vldmcs     r6,  {d16-d19}
-+        ldr        r12, [sp, #ur_size]
-+        bpl        1f
-+        cmp        r12, #12
++        push        {r4-r10, lr}
++        load_pointers pw_s, log2_s, sp_base, 16, "d4[],d5[]", d17[3], "d18[],d19[]", "d22[],d23[]", 1, \
++            "ldr         r9, [sp, #ur_size]", \
++            "sub         r0, #pw"
++
++        vmov       q10, q9
++        ldr        lr, [sp, #dl_size]
++        vmov       q12, q11
++        it         cs
++        vldmcs     r7, {q9-q10}
 +        @ Given chroma frame layout, if UR exists then it is always legit to
 +        @ load all of it even if most of it is outside the frame.
-+        vldm       r5,  {d20-d23}
-+        bgt        1f
-+        bge        4f
-+        cmp        r5,  #8
-+        bge        3f
-+        vdup.16    d21, d20[3]
-+3:      vdup.16    d22, d21[3]
-+4:      vdup.16    d23, d22[3]
-+
-+1:
-+        lsls       r7,  #AVAIL_S_L_N_DL_C
-+        ldr        r12, [sp, #dl_size]
-+        vdup.16    q1,  d0[0]
-+        vdup.16    q2,  d0[0]
-+        vdup.16    q3,  d0[0]
-+        bpl        1f
-+        vld1.16   {d0[0]}, [r10], r9
-+        vld1.16   {d0[1]}, [r3],  r9
-+        vld1.16   {d0[2]}, [r10], r9
-+        vld1.16   {d0[3]}, [r3],  r9
-+        vld1.16   {d1[0]}, [r10], r9
-+        vld1.16   {d1[1]}, [r3],  r9
-+        vld1.16   {d1[2]}, [r10], r9
-+        vld1.16   {d1[3]}, [r3],  r9
-+        vld1.16   {d2[0]}, [r10], r9
-+        vld1.16   {d2[1]}, [r3],  r9
-+        vld1.16   {d2[2]}, [r10], r9
-+        vld1.16   {d2[3]}, [r3],  r9
-+        vld1.16   {d3[0]}, [r10], r9
-+        vld1.16   {d3[1]}, [r3],  r9
-+        vld1.16   {d3[2]}, [r10]
-+        vld1.16   {d3[3]}, [r3]
++        itt        mi
++        vldmmi     r4, {q11-q12}
++        cmpmi      r9, #p_size
++        bmi        10f
 +1:
-+        bcc        1f
-+        vld1.16   {d4[1]}, [r4],  r9
-+        cmp        r12, #4
-+        vld1.16   {d4[2]}, [r8],  r9
-+        vld1.16   {d4[3]}, [r4],  r9
-+        ble        2f
-+        vld1.16   {d5[0]}, [r8],  r9
-+        vld1.16   {d5[1]}, [r4],  r9
-+        cmp        r12, #12
-+        vld1.16   {d5[2]}, [r8],  r9
-+        vld1.16   {d5[3]}, [r4],  r9
-+        blt        3f
-+        vld1.16   {d6[0]}, [r8],  r9
-+        vld1.16   {d6[1]}, [r4],  r9
-+        vld1.16   {d6[2]}, [r8],  r9
-+        vld1.16   {d6[3]}, [r4],  r9
-+        ble        4f
-+        vld1.16   {d7[0]}, [r8],  r9
-+        vld1.16   {d7[1]}, [r4],  r9
-+        vld1.16   {d7[2]}, [r8]
-+        vld1.16   {d7[3]}, [r4]
-+        b          1f
-+2:      vdup.16    d5,  d4[3]
-+3:      vdup.16    d6,  d5[3]
-+4:      vdup.16    d7,  d6[3]
-+1:
-+        tst        r2,  #FILTER_LIGHT
-+        add        r12, r0,  #-pw
-+        beq        10f
-+
-+        vpush     {q5}
-+        @ Luma light filter
-+        @ Left
-+        vext.16    q5,  q2,  q3,  #7
-+        vext.16    q14, q1,  q2,  #7
-+        vext.16    q13, q0,  q1,  #7
-+        vext.16    q12, q15, q0,  #7
-+
-+        vadd.u16   q5,  q3
-+        vadd.u16   q14, q2
-+        vadd.u16   q13, q1
-+        vadd.u16   q12, q0
-+        vmov.u16   r2,  d7[3]           @ Save final pel
-+
-+        vext.16    q0,  q12, q13, #1
-+        vext.16    q1,  q13, q14, #1
-+        vext.16    q2,  q14, q5,  #1
-+        vext.16    q3,  q5,  q5,  #1
-+
-+        vmov       d30, d24             @ d30[0] = l[0] + ul
-+        vadd.u16   q0,  q12
-+        vadd.u16   q1,  q13
-+        vadd.u16   q2,  q14
-+        vadd.u16   q3,  q5
-+
-+        vrshr.u16  q0,  #2
-+        vrshr.u16  q1,  #2
-+        vrshr.u16  q2,  #2
-+        vrshr.u16  q3,  #2
-+
-+        @ Up
-+        vext.16    q5,  q10, q11, #7
-+        vext.16    q14, q9,  q10, #7
-+        vext.16    q13, q8,  q9,  #7
-+        vext.16    q12, q15, q8,  #7
-+
-+        vadd.u16   q5,  q11
-+        vadd.u16   q14, q10
-+        vadd.u16   q13, q9
-+        vadd.u16   q12, q8
-+        vmov.u16   r3,  d23[3]          @ Save final pel
-+
-+        vext.16    q8,  q12, q13, #1
-+        vext.16    q9,  q13, q14, #1
-+        vext.16    q10, q14, q5,  #1
-+        vext.16    q11, q5,  q5,  #1
-+
-+        vadd.u16   d30, d24             @ d30[0] = l[0] + 2ul + u[0]
-+        vadd.u16   q8,  q12
-+        vadd.u16   q9,  q13
-+        vadd.u16   q10, q14
-+        vadd.u16   q11, q5
-+
-+        vrshr.u16  q8,  #2
-+        vrshr.u16  q9,  #2
++        lsls       r3, #AVAIL_S_L_N_DL_C
++        bpl        20f
++        vld1.16    {d0[0]}, [r6], r12
++        vld1.16    {d2[0]}, [r10], r12
++        vld1.16    {d0[1]}, [r6], r12
++        vld1.16    {d2[1]}, [r10], r12
++        vld1.16    {d0[2]}, [r6], r12
++        vld1.16    {d2[2]}, [r10], r12
++        vld1.16    {d0[3]}, [r6], r12
++        vld1.16    {d2[3]}, [r10], r12
++        vld1.16    {d1[0]}, [r6], r12
++        vld1.16    {d3[0]}, [r10], r12
++        vld1.16    {d1[1]}, [r6], r12
++        vld1.16    {d3[1]}, [r10], r12
++        vld1.16    {d1[2]}, [r6], r12
++        vld1.16    {d3[2]}, [r10], r12
++        vld1.16    {d1[3]}, [r6]
++        vld1.16    {d3[3]}, [r10]
++2:      bcc        30f
++        vld1.16    {d6[0]}, [r8], r12
++        vld1.16    {d4[1]}, [r5], r12
++        cmp        lr, #p_size
++        vld1.16    {d6[1]}, [r8], r12
++        bcc        40f
++        vld1.16    {d4[2]}, [r5], r12
++        vld1.16    {d6[2]}, [r8], r12
++        vld1.16    {d4[3]}, [r5], r12
++        vld1.16    {d6[3]}, [r8], r12
++        vld1.16    {d5[0]}, [r5], r12
++        vld1.16    {d7[0]}, [r8], r12
++        vld1.16    {d5[1]}, [r5], r12
++        vld1.16    {d7[1]}, [r8], r12
++        vld1.16    {d5[2]}, [r5], r12
++        vld1.16    {d7[2]}, [r8], r12
++        vld1.16    {d5[3]}, [r5]
++        vld1.16    {d7[3]}, [r8]
++3:
++        vzip.16    q0, q1
++        tst        r2, #FILTER_LIGHT
++        vzip.16    q2, q3
++        beq        4f
++
++        vext.16    q13, q8, q0, #7
++        vadd.i16   q13, q0
++        vext.16    q0, q0, q1, #7
++        vadd.i16   q0, q1
++        vext.16    q1, q1, q2, #7
++        vadd.i16   q1, q2
++        vext.16    q2, q2, q3, #7
++        vadd.i16   q2, q3
++        vext.16    q14, q8, q9, #7
++        vadd.i16   q14, q9
++        vext.16    q9, q9, q10, #7
++        vadd.i16   q9, q10
++        vext.16    q10, q10, q11, #7
++        vadd.i16   q10, q11
++        vext.16    q11, q11, q12, #7
++        vadd.i16   q11, q12
++        vadd.i16   d17, d26, d28        @ d17[0] = l[0] + 2ul + u[0]
++        vmov.u16   r4, d7[3]            @ Save final pel
++        vext.16    q3, q2, q2, #1
++        vadd.i16   q3, q2
++        vext.16    q2, q1, q2, #1
++        vadd.i16   q2, q1
++        vext.16    q1, q0, q1, #1
++        vadd.i16   q1, q0
++        vext.16    q0, q13, q0, #1
++        vadd.i16   q0, q13
++        vext.16    q13, q11, q11, #1
++        vadd.i16   q13, q11
++        vext.16    q11, q10, q11, #1
++        vadd.i16   q11, q10
++        vext.16    q10, q9, q10, #1
++        vadd.i16   q10, q9
++        vext.16    q9, q14, q9, #1
++        vadd.i16   q9, q14
++        vrshr.u16  d17, #2
++        vmov.u16   r5, d25[3]           @ Save final pel
++        vrshr.u16  q3, #2
++        vrshr.u16  q12, q13, #2
++        vrshr.u16  q0, #2
++        vrshr.u16  q1, #2
++        vrshr.u16  q2, #2
++        vrshr.u16  q9, #2
 +        vrshr.u16  q10, #2
 +        vrshr.u16  q11, #2
++        vdup.16    d17, d17[0]
++        vmov.16    d7[3], r4            @ Restore final pel
++        vmov.16    d25[3], r5           @ Restore final pel
++4:
++        vst1.16    {d17[3]}, [r0]!
++        vst1.16    {q9-q10}, [r1]!
++        vst1.16    {q0-q1}, [r0]!
++        vst1.16    {q11-q12}, [r1]
++        vst1.16    {q2-q3}, [r0]
++        pop        {r4-r10, pc}
 +
-+        @ Misc
-+        vrshr.u16  d30, #2
-+        vmov.u16   d7[3], r2            @ Restore final pel
-+        vmov.u16   d23[3], r3           @ Restore final pel
-+        vdup.u16   d31, d30[0]          @ d31[3] = d30[0]
-+        vpop      {q5}
-+
-+10:
-+        vstm       r1, {d16-d23}        @ Up
-+        vst1.16   {d31[3]}, [r12]       @ Up-left
-+        vstm       r0, { d0-d7 }        @ Left
-+        pop       {r4-r10, pc}
++10:     cmp        r9, #8
++        bhi        12f
++        beq        11f
++        vdup.16    d21, d20[3]
++11:     vdup.16    d22, d21[3]
++12:     vdup.16    d23, d22[3]
++        b          1b
++
++20:     vmov       q0, q2
++        vmov       q1, q2
++        b          2b
++
++30:     vmov       q3, q2
++        b          3b
++
++40:     cmp        lr, #8
++        bhi        42f
++        beq        41f
++        vdup.16    d5, d6[1]
++        vdup.16    d7, d6[1]
++        vmov.f32   s9, s10
++        vmov.f32   s13, s10
++        b          3b
++41:     vld1.16    {d4[2]}, [r5], r12
++        vld1.16    {d6[2]}, [r8], r12
++        vld1.16    {d4[3]}, [r5]
++        vld1.16    {d6[3]}, [r8]
++        vdup.16    d5, d6[3]
++        vdup.16    d7, d6[3]
++        b          3b
++42:     vld1.16    {d4[2]}, [r5], r12
++        vld1.16    {d6[2]}, [r8], r12
++        vld1.16    {d4[3]}, [r5], r12
++        vld1.16    {d6[3]}, [r8], r12
++        vld1.16    {d5[0]}, [r5], r12
++        ldrh       lr, [r8, r12]
++        vld1.16    {d7[0]}, [r8], r12
++        vld1.16    {d5[1]}, [r5]
++        vld1.16    {d7[1]}, [r8]
++        orr        lr, lr, lr, lsl #16
++        vmov       s11, lr
++        vmov       s15, lr
++        b          3b
 +endfunc
 +
 +@ int ff_hevc_rpi_intra_filter_4_neon_32(
@@ -12452,31 +13000,31 @@ index 0000000000..6ce3d3ca8d
 +.set    log2_s,  2
 +
 +function ff_hevc_rpi_intra_filter_4_neon_32, export=1
-+        push       {r4-r10, lr}
-+        load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d4[],d5[]", "d6[],d7[]"
-+
-+        it cs
-+        vldmcs     r6,  {d4, d5}
-+        it mi
-+        vldmmi     r5,  {d6, d7}
-+        lsls       r7,  #AVAIL_S_L_N_DL_C
-+        vdup.32    q1,  d0[0]
-+        add        r12, r0,  #-pw
-+        bpl        1f
-+        vld1.32   {d0[0]}, [r10], r9
-+        vld1.32   {d0[1]}, [r3],  r9
-+        vld1.32   {d1[0]}, [r10]
-+        vld1.32   {d1[1]}, [r3]
++        push        {r4-r10, lr}
++        load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d16[0], "d4[],d5[]", "d6[],d7[]", 0, \
++            "vmov        q1, q0"
++
++        sub         r3, r0, #pw
++        it          mi
++        vldmmi      r4, {d6, d7}
++        it          cs
++        vldmcs      r7, {d4, d5}
++        lsls        lr, r2, #AVAIL_S_L_N_DL_C
++        bpl         1f
++        vld1.32     {d0[0]}, [r6], r12
++        vld1.32     {d0[1]}, [r10], r12
++        vld1.32     {d1[0]}, [r6]
++        vld1.32     {d1[1]}, [r10]
 +1:
-+        bcc        1f
-+        vld1.32   {d2[1]}, [r4],  r9
-+        vld1.32   {d3[0]}, [r8]
-+        vld1.32   {d3[1]}, [r4]
++        bcc         1f
++        vld1.32     {d2[1]}, [r8], r12
++        vld1.32     {d3[0]}, [r5]
++        vld1.32     {d3[1]}, [r8]
 +1:
-+        vst1.32    {q2,  q3 }, [r1]     @ Up
-+        vst1.32    {d31[1]}, [r12]
-+        vst1.32    {q0,  q1 }, [r0]     @ Left
-+        pop        {r4-r10, pc}
++        vst1.32     {d16[0]}, [r3]
++        vst1.32     {q2, q3}, [r1]
++        vst1.32     {q0, q1}, [r0]
++        pop         {r4-r10, pc}
 +endfunc
 +
 +
@@ -12502,54 +13050,57 @@ index 0000000000..6ce3d3ca8d
 +
 +function ff_hevc_rpi_intra_filter_8_neon_32, export=1
 +        push       {r4-r10, lr}
-+        load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]"
-+
-+        vdup.32    q9,  d16[0]
-+        vdup.32    q11, d20[0]
-+
-+        it cs
-+        vldmcs     r6,  {q8,  q9 }
-+        ldr        r12, [sp, #ur_size]
++        load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]", 0, \
++            "vmov       r3, s0"
++
++        vmov       q9, q8
++        ldr        r9, [r4, #3*4]
++        vmov       q11, q10
++        ldr        lr, [sp, #ur_size]
++        it         cs
++        vldmcs     r7, {q8, q9}
++        ittt       mi
++        vldmmi     r4, {q10, q11}
++        cmpmi      lr, #p_size
++        vdupmi.32  q11, r9
++        lsls       lr, r2, #AVAIL_S_L_N_DL_C
++        vdup.32    q1, r3
++        vdup.32    q2, r3
++        vdup.32    q3, r3
++        it         cs
++        ldrcs      r9, [r8, r12]
 +        bpl        1f
-+        cmp        r12, #p_size
-+        vldm       r5,  {q10, q11}
-+        bge        1f
-+        vdup.32    q11, d21[1]
-+1:
-+        lsls       r7,  #AVAIL_S_L_N_DL_C
-+        vdup.32    q1,  d0[0]
-+        vdup.32    q2,  d0[0]
-+        vdup.32    q3,  d0[0]
-+        bpl        1f
-+        vld1.32   {d0[0]}, [r10], r9
-+        vld1.32   {d0[1]}, [r3],  r9
-+        vld1.32   {d1[0]}, [r10], r9
-+        vld1.32   {d1[1]}, [r3],  r9
-+        vld1.32   {d2[0]}, [r10], r9
-+        vld1.32   {d2[1]}, [r3],  r9
-+        vld1.32   {d3[0]}, [r10]
-+        vld1.32   {d3[1]}, [r3]
++        vld1.32    {d0[0]}, [r6], r12
++        vld1.32    {d0[1]}, [r10], r12
++        vld1.32    {d1[0]}, [r6], r12
++        vld1.32    {d1[1]}, [r10], r12
++        vld1.32    {d2[0]}, [r6], r12
++        vld1.32    {d2[1]}, [r10], r12
++        vld1.32    {d3[0]}, [r6]
++        vld1.32    {d3[1]}, [r10]
 +1:
++        ldr        lr, [sp, #dl_size]
++        bcc        2f
++        vld1.32    {d4[1]}, [r8], r12
++        vld1.32    {d5[0]}, [r5], r12
++        cmp        lr, #p_size
++        vld1.32    {d5[1]}, [r8], r12
 +        bcc        1f
-+        ldr        r12, [sp, #dl_size]
-+        vld1.32   {d4[1]}, [r4],  r9
-+        cmp        r12, #p_size
-+        vld1.32   {d5[0]}, [r8],  r9
-+        vld1.32   {d5[1]}, [r4],  r9
-+        blt        2f
-+        vld1.32   {d6[0]}, [r8],  r9
-+        vld1.32   {d6[1]}, [r4],  r9
-+        vld1.32   {d7[0]}, [r8]
-+        vld1.32   {d7[1]}, [r4]
-+        b          1f
-+2:
-+        vdup.32    q3,  d5[1]
++        vld1.32    {d6[0]}, [r5], r12
++        vld1.32    {d6[1]}, [r8], r12
++        vld1.32    {d7[0]}, [r5]
++        vld1.32    {d7[1]}, [r8]
 +1:
-+        add        r12, r0,  #-pw
-+        vstm       r1,  { q8-q11}       @ Up
-+        vst1.32   {d31[1]}, [r12]
-+        vstm       r0,  { q0-q3 }       @ Left
-+        pop       {r4-r10, pc}
++        it         cc
++        vdupcc.32  q3, r9
++2:
++        vst1.32    {q8-q9}, [r1]!
++        sub        r3, r0, #pw
++        vst1.32    {q0-q1}, [r0]!
++        vst1.32    {q10-q11}, [r1]
++        vst1.32    {q2-q3}, [r0]
++        vst1.32    {d31[1]}, [r3]
++        pop        {r4-r10, pc}
 +endfunc
 +
 +
@@ -12574,116 +13125,131 @@ index 0000000000..6ce3d3ca8d
 +.set    p_size,  (1 << log2_s)          @ size in pels
 +
 +function ff_hevc_rpi_intra_filter_16_neon_32, export=1
-+        push       {r4-r10, lr}
-+        load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1]
++        push        {r4-r10, lr}
++        load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1], 1, \
++            "ldr         r9, [sp, #ur_size]", \
++            "sub         r0, #pw"
 +
 +        @ Once we get this big we have run out of neon regs to store
 +        @ everything at once so do in pieces
 +
-+        @ Up (have)
-+        it cs
-+        vldmcs     r6,  { q0-q3 }
-+        ldr        r12, [sp, #ur_size]
-+        it mi
-+        vldmmi     r5,  { q8-q11}
-+        it cs
-+        vstmcs     r1,  { q0-q3 }
-+        bpl        1f
-+        cmp        r12, #12
-+        add        lr,  r1,  #(pw << log2_s)
-+        bgt        2f
-+        cmp        r12, #8
-+        bge        3f
-+        vdup.16    q9,  d17[1]
-+4:      vdup.16    d10, d19[1]
-+3:      vdup.16    q11, d21[1]
-+2:      vstm       lr, { q8-q11}
++        @ Up and/or up-right (have)
++        add         lr, r1, #(pw << log2_s)
++        bcc         1f
++        vldm        r7, {q0-q3}
++        vstm        r1, {q0-q3}
 +1:
++        bpl         3f
++        vldm        r4, {q8-q11}
++        cmp         r9, #16
++        blo         10f
++2:      vstm        lr, {q8-q11}
++3:
++        @ Up-left
++        vst1.32     {d30[1]}, [r0]!
++
++        @ Left and/or down-left (have)
++        lsls        lr, r3, #AVAIL_S_L_N_DL_C
++        ldr         r9, [sp, #dl_size]
++        bpl         4f
++        vld1.32     {d0[0]}, [r6], r12
++        vld1.32     {d0[1]}, [r10], r12
++        vld1.32     {d1[0]}, [r6], r12
++        vld1.32     {d1[1]}, [r10], r12
++        vld1.32     {d2[0]}, [r6], r12
++        vld1.32     {d2[1]}, [r10], r12
++        vld1.32     {d3[0]}, [r6], r12
++        vld1.32     {d3[1]}, [r10], r12
++        vld1.32     {d4[0]}, [r6], r12
++        vld1.32     {d4[1]}, [r10], r12
++        vld1.32     {d5[0]}, [r6], r12
++        vld1.32     {d5[1]}, [r10], r12
++        vld1.32     {d6[0]}, [r6], r12
++        vld1.32     {d6[1]}, [r10], r12
++        vld1.32     {d7[0]}, [r6]
++        vld1.32     {d7[1]}, [r10]
++        vstm        r0, {q0-q3}
++4:      add         lr, r0, #(pw << log2_s)
++        bcc         6f
++        vdup.32     d16, d30[0]
++        vld1.32     {d16[1]}, [r8], r12
++        vld1.32     {d17[0]}, [r5], r12
++        cmp         r9, #16
++        vld1.32     {d17[1]}, [r8], r12
++        blo         20f
++        vld1.32     {d18[0]}, [r5], r12
++        vld1.32     {d18[1]}, [r8], r12
++        vld1.32     {d19[0]}, [r5], r12
++        vld1.32     {d19[1]}, [r8], r12
++        vld1.32     {d20[0]}, [r5], r12
++        vld1.32     {d20[1]}, [r8], r12
++        vld1.32     {d21[0]}, [r5], r12
++        vld1.32     {d21[1]}, [r8], r12
++        vld1.32     {d22[0]}, [r5], r12
++        vld1.32     {d22[1]}, [r8], r12
++        vld1.32     {d23[0]}, [r5]
++        vld1.32     {d23[1]}, [r8]
++5:      vstm        lr, {q8-q11}
++6:
++        eors        r3, r2          @ (req & avail) ^ req = (req & ~avail)
++        bne         7f
++        pop         {r4-r10, pc}
++7:
++        @ Up and/or up-right (don't have)
++        vdup.32     q0, d31[0]
++        lsls        lr, r3, #AVAIL_S_UR_N_U_C
++        vdup.32     q1, d31[0]
++        add         lr, r1,  #(pw << log2_s)
++        vdup.32     q8, d31[1]
++        vdup.32     q9, d31[1]
++        it          cs
++        vstmcs      r1!, {q0-q1}
++        it          mi
++        vstmmi      lr!, {q8-q9}
++        it          cs
++        vstmcs      r1, {q0-q1}
++        it          mi
++        vstmmi      lr, {q8-q9}
 +
-+        @ Left (have)
-+        add        lr,  r0,  #-pw
-+        lsls       r12, r7,  #AVAIL_S_L_N_DL_C
-+        vst1.32   {d30[1]}, [lr]        @ UL
-+        bpl        1f
-+        vld1.32   { d0[0]}, [r10], r9
-+        vld1.32   { d0[1]}, [r3],  r9
-+        vld1.32   { d1[0]}, [r10], r9
-+        vld1.32   { d1[1]}, [r3],  r9
-+        vld1.32   { d2[0]}, [r10], r9
-+        vld1.32   { d2[1]}, [r3],  r9
-+        vld1.32   { d3[0]}, [r10], r9
-+        vld1.32   { d3[1]}, [r3],  r9
-+        vld1.32   { d4[0]}, [r10], r9
-+        vld1.32   { d4[1]}, [r3],  r9
-+        vld1.32   { d5[0]}, [r10], r9
-+        vld1.32   { d5[1]}, [r3],  r9
-+        vld1.32   { d6[0]}, [r10], r9
-+        vld1.32   { d6[1]}, [r3],  r9
-+        vld1.32   { d7[0]}, [r10]
-+        vld1.32   { d7[1]}, [r3]
-+        vstm       r0,  { q0-q3 }
-+1:
-+        bcc        1f
-+        ldr        r12, [sp, #dl_size]
-+        vdup.32    d16, d30[0]          @ d16[0] = d30[0]
-+        add        lr,  r0,  #(pw << log2_s)
-+        vld1.32   {d16[1]}, [r4],  r9
-+        cmp        r12, #4
-+        vld1.32   {d17[0]}, [r8],  r9
-+        vld1.32   {d17[1]}, [r4],  r9
-+        ble        2f
-+        vld1.32   {d18[0]}, [r8],  r9
-+        vld1.32   {d18[1]}, [r4],  r9
-+        cmp        r12, #12
-+        vld1.32   {d19[0]}, [r8],  r9
-+        vld1.32   {d19[1]}, [r4],  r9
-+        blt        3f
-+        vld1.32   {d20[0]}, [r8],  r9
-+        vld1.32   {d20[1]}, [r4],  r9
-+        vld1.32   {d21[0]}, [r8],  r9
-+        vld1.32   {d21[1]}, [r4],  r9
-+        ble        4f
-+        vld1.32   {d22[0]}, [r8],  r9
-+        vld1.32   {d22[1]}, [r4],  r9
-+        vld1.32   {d23[0]}, [r8]
-+        vld1.32   {d23[1]}, [r4]
-+        b          5f
-+2:      vdup.32    q9,  d17[1]
-+3:      vdup.32    q10, d19[1]
-+4:      vdup.32    q11, d21[1]
-+5:      vstm       lr,  { q8-q11}
-+1:
-+        eors       r7,  r2
-+        beq        99f
-+
-+        lsls       r12, r7,  #AVAIL_S_UR_N_U_C
-+        vdup.32    q0,  d31[0]
-+        vdup.32    q1,  d31[0]
-+        vdup.32    q2,  d31[0]
-+        vdup.32    q3,  d31[0]
-+        add        lr,  r1,  #(pw << log2_s)
-+        vdup.32    q8,  d31[1]
-+        vdup.32    q9,  d31[1]
-+        vdup.32    q10, d31[1]
-+        vdup.32    q11, d31[1]
-+        it cs
-+        vstmcs     r1,  { q0-q3 }
-+        it mi
-+        vstmmi     lr,  { q8-q11}
-+
-+        lsls       r7,  #AVAIL_S_L_N_DL_C
-+        vdup.32    q0,  d30[0]
-+        vdup.32    q1,  d30[0]
-+        vdup.32    q2,  d30[0]
-+        vdup.32    q3,  d30[0]
-+        add        lr,  r0,  #(pw << log2_s)
-+        it mi
-+        vstmmi     r0, { q0-q3 }
-+        it cs
-+        vstmcs     lr, { q0-q3 }
++        @ Left and/or down-left (don't have)
++        vdup.32     q0, d30[0]
++        lsls        lr, r3, #AVAIL_S_L_N_DL_C
++        vdup.32     q1, d30[0]
++        add         lr,  r0,  #(pw << log2_s)
++        it          mi
++        vstmmi      r0!, {q0-q1}
++        it          cs
++        vstmcs      lr!, {q0-q1}
++        it          mi
++        vstmmi      r0, {q0-q1}
++        it          cs
++        vstmcs      lr, {q0-q1}
++        pop         {r4-r10, pc}
 +
-+99:
-+        pop       {r4-r10, pc}
++10:     cmp         r9, #8
++        bhi         12f
++        beq         11f
++        vdup.32     q9, d17[1]
++11:     vdup.32     q10, d19[1]
++12:     vdup.32     q11, d21[1]
++        b           2b
++
++20:     cmp         r9, #8
++        blo         21f
++        vld1.32     {d18[0]}, [r5], r12
++        vld1.32     {d18[1]}, [r8], r12
++        vld1.32     {d19[0]}, [r5], r12
++        vld1.32     {d19[1]}, [r8], r12
++        beq         22f
++        vld1.32     {d20[0]}, [r5], r12
++        vld1.32     {d20[1]}, [r8], r12
++        vld1.32     {d21[0]}, [r5]
++        vld1.32     {d21[1]}, [r8]
++        b           23f
++21:     vdup.32     q9, d17[1]
++22:     vdup.32     q10, d19[1]
++23:     vdup.32     q11, d21[1]
++        b           5b
 +endfunc
 +
 +
@@ -14836,10 +15402,10 @@ index d181b74570..c52c450956 100644
      if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size,
 diff --git a/libavcodec/rpi_hevc_cabac.c b/libavcodec/rpi_hevc_cabac.c
 new file mode 100644
-index 0000000000..79549c411a
+index 0000000000..552c2e349e
 --- /dev/null
 +++ b/libavcodec/rpi_hevc_cabac.c
-@@ -0,0 +1,2253 @@
+@@ -0,0 +1,2255 @@
 +/*
 + * HEVC CABAC decoding
 + *
@@ -16843,11 +17409,15 @@ index 0000000000..79549c411a
 +                        const int res = trans_scale_sat(
 +                            (levels[m] ^ k) - k, scale, dc_scale, shift);
 +#if RPI_COMPRESS_COEFFS
-+                      if (use_compress)
-+                        coeffs32[num_nonzero++] = (res<<16) + (blk_coeffs - coeffs);
-+                      else
++                        if (use_compress)
++                        {
++                            coeffs32[num_nonzero++] = (res<<16) + (blk_coeffs - coeffs);
++                        }
++                        else
 +#endif
-+                        blk_coeffs[0] = res;
++                        {
++                            blk_coeffs[0] = res;
++                        }
 +                        --m;
 +                    }
 +
@@ -16957,7 +17527,7 @@ index 0000000000..79549c411a
 +
 +#if !USE_BY22
 +// Stores results to lc
-+void ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc)
++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc)
 +{
 +    int x = abs_mvd_greater0_flag_decode(lc);
 +    int y = abs_mvd_greater0_flag_decode(lc);
@@ -16968,28 +17538,26 @@ index 0000000000..79549c411a
 +        y += abs_mvd_greater1_flag_decode(lc);
 +
 +    switch (x) {
-+    case 2: lc->pu.mvd.x = mvd_decode(lc);           break;
-+    case 1: lc->pu.mvd.x = mvd_sign_flag_decode(lc); break;
-+    case 0: lc->pu.mvd.x = 0;                       break;
++    case 2: x = mvd_decode(lc);           break;
++    case 1: x = mvd_sign_flag_decode(lc); break;
++    case 0: x = 0;                       break;
 +    }
 +
 +    switch (y) {
-+    case 2: lc->pu.mvd.y = mvd_decode(lc);           break;
-+    case 1: lc->pu.mvd.y = mvd_sign_flag_decode(lc); break;
-+    case 0: lc->pu.mvd.y = 0;                       break;
++    case 2: y = mvd_decode(lc);           break;
++    case 1: y = mvd_sign_flag_decode(lc); break;
++    case 0: y = 0;                       break;
 +    }
++    return MV_XY(x,y);
 +}
 +#else
-+void ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc)
++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc)
 +{
 +    int x = abs_mvd_greater0_flag_decode(lc);
 +    int y = abs_mvd_greater0_flag_decode(lc);
 +
-+    lc->pu.mvd.x = 0;
-+    lc->pu.mvd.y = 0;
-+
 +    if ((x | y) == 0)
-+        return;
++        return 0;
 +
 +    if (x != 0)
 +        x += abs_mvd_greater1_flag_decode(lc);
@@ -17000,9 +17568,9 @@ index 0000000000..79549c411a
 +    {
 +        // Not worth starting BY22
 +        if (x != 0)
-+            lc->pu.mvd.x = mvd_sign_flag_decode(lc);
++            x = mvd_sign_flag_decode(lc);
 +        if (y != 0)
-+            lc->pu.mvd.y = mvd_sign_flag_decode(lc);
++            y = mvd_sign_flag_decode(lc);
 +    }
 +    else
 +    {
@@ -17015,7 +17583,7 @@ index 0000000000..79549c411a
 +        b = val = get_cabac_by22_peek(cc);
 +
 +        if (x == 1) {
-+            lc->pu.mvd.x = ((int32_t)b >> 31) | 1;
++            x = ((int32_t)b >> 31) | 1;
 +            n = 1;
 +            b <<= 1;
 +        }
@@ -17044,7 +17612,7 @@ index 0000000000..79549c411a
 +            x = (b >> (32 - k)) + (1 << k);
 +            b <<= k;
 +            s = (int32_t)b >> 31;
-+            lc->pu.mvd.x = (x ^ s) - s;
++            x = (x ^ s) - s;
 +            b <<= 1;
 +
 +            // Max abs value of an mv is 2^15 - 1 (i.e. a prefix len of 15 bits)
@@ -17057,7 +17625,7 @@ index 0000000000..79549c411a
 +        }
 +
 +        if (y == 1) {
-+            lc->pu.mvd.y = ((int32_t)b >> 31) | 1;
++            y = ((int32_t)b >> 31) | 1;
 +            ++n;
 +            // don't care about b anymore
 +        }
@@ -17082,7 +17650,7 @@ index 0000000000..79549c411a
 +
 +            y = (b >> (32 - k)) + (1 << k);
 +            s = (int32_t)(b << k) >> 31;
-+            lc->pu.mvd.y = (y ^ s) - s;
++            y = (y ^ s) - s;
 +            // don't care about b anymore
 +        }
 +
@@ -17090,12 +17658,12 @@ index 0000000000..79549c411a
 +        bypass_finish(cc);
 +    }
 +
-+//    printf("BY: X=%d,Y=%d\n", lc->pu.mvd.x, lc->pu.mvd.y);
++    return MV_XY(x, y);
 +}
 +#endif
 diff --git a/libavcodec/rpi_hevc_cabac_fns.h b/libavcodec/rpi_hevc_cabac_fns.h
 new file mode 100644
-index 0000000000..47c9c7029d
+index 0000000000..a6587616ae
 --- /dev/null
 +++ b/libavcodec/rpi_hevc_cabac_fns.h
 @@ -0,0 +1,191 @@
@@ -17130,7 +17698,7 @@ index 0000000000..47c9c7029d
 +                                const int log2_trafo_size, const enum ScanType scan_idx,
 +                                const int c_idx);
 +
-+void ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc);
++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc);
 +int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc);
 +
 +#define HEVC_BIN_SAO_MERGE_FLAG                         0
@@ -17410,10 +17978,10 @@ index 0000000000..0aee673d8b
 +#endif /* AVCODEC_RPI_HEVC_DATA_H */
 diff --git a/libavcodec/rpi_hevc_filter.c b/libavcodec/rpi_hevc_filter.c
 new file mode 100644
-index 0000000000..8e7695bcf9
+index 0000000000..dd5f65b5c4
 --- /dev/null
 +++ b/libavcodec/rpi_hevc_filter.c
-@@ -0,0 +1,1204 @@
+@@ -0,0 +1,1206 @@
 +/*
 + * HEVC video decoder
 + *
@@ -17447,7 +18015,6 @@ index 0000000000..8e7695bcf9
 +#include "libavutil/common.h"
 +#include "libavutil/internal.h"
 +
-+#include "cabac_functions.h"
 +#include "rpi_hevcdec.h"
 +
 +#include "bit_depth_template.c"
@@ -17551,37 +18118,6 @@ index 0000000000..8e7695bcf9
 +    }
 +}
 +
-+static void copy_vert(uint8_t *dst, const uint8_t *src,
-+                      int pixel_shift, int height,
-+                      ptrdiff_t stride_dst, ptrdiff_t stride_src)
-+{
-+    int i;
-+    switch (pixel_shift)
-+    {
-+        case 2:
-+            for (i = 0; i < height; i++) {
-+                *(uint32_t *)dst = *(uint32_t *)src;
-+                dst += stride_dst;
-+                src += stride_src;
-+            }
-+            break;
-+        case 1:
-+            for (i = 0; i < height; i++) {
-+                *(uint16_t *)dst = *(uint16_t *)src;
-+                dst += stride_dst;
-+                src += stride_src;
-+            }
-+            break;
-+        default:
-+            for (i = 0; i < height; i++) {
-+                *dst = *src;
-+                dst += stride_dst;
-+                src += stride_src;
-+            }
-+            break;
-+    }
-+}
-+
 +static void copy_CTB_to_hv(const HEVCRpiContext * const s, const uint8_t * const src,
 +                           ptrdiff_t stride_src, int x, int y, int width, int height,
 +                           int c_idx, int x_ctb, int y_ctb)
@@ -17597,9 +18133,9 @@ index 0000000000..8e7695bcf9
 +        src + stride_src * (height - 1), width << sh);
 +
 +    /* copy vertical edges */
-+    copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src);
++    ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src);
 +
-+    copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src);
++    ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src);
 +}
 +
 +// N.B. Src & dst are swapped as this is a restore!
@@ -17629,14 +18165,7 @@ index 0000000000..8e7695bcf9
 +            const uint8_t * bs = dst1;
 +            while (m != 0) {
 +                if ((m & 1) != 0) {
-+                    unsigned int i;
-+                    uint8_t * d = bd;
-+                    const uint8_t * s = bs;
-+                    for (i = 0; i != bheight; ++i) {
-+                        memcpy(d, s, bwidth);
-+                        d += stride_src;
-+                        s += stride_dst;
-+                    }
++                    s->hevcdsp.cpy_blk(bd, stride_src, bs, stride_dst, bwidth, bheight);
 +                }
 +                m >>= 1;
 +                bs += bwidth;
@@ -17866,22 +18395,22 @@ index 0000000000..8e7695bcf9
 +            }
 +            if (src_l != NULL) {
 +                if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
-+                    copy_vert(dst - (1 << sh),
++                    ff_hevc_rpi_copy_vert(dst - (1 << sh),
 +                              s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh),
 +                              sh, height, stride_dst, 1 << sh);
 +                } else {
-+                    copy_vert(dst - (1 << sh),
++                    ff_hevc_rpi_copy_vert(dst - (1 << sh),
 +                              src_l,
 +                              sh, height, stride_dst, stride_src);
 +                }
 +            }
 +            if (src_r != NULL) {
 +                if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
-+                    copy_vert(dst + (width << sh),
++                    ff_hevc_rpi_copy_vert(dst + (width << sh),
 +                              s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh),
 +                              sh, height, stride_dst, 1 << sh);
 +                } else {
-+                    copy_vert(dst + (width << sh),
++                    ff_hevc_rpi_copy_vert(dst + (width << sh),
 +                              src_r,
 +                              sh, height, stride_dst, stride_src);
 +                }
@@ -18001,19 +18530,59 @@ index 0000000000..8e7695bcf9
 +
 +// Get block strength
 +// Given how we call we will always get within the 32bit boundries
-+static inline uint32_t bs_get32(const uint8_t * bs, const unsigned int stride2,
-+                                const unsigned int xl, const unsigned int xr, const unsigned int y)
++static inline uint32_t bs_get32(const uint8_t * bs, unsigned int stride2,
++                                unsigned int xl, unsigned int xr, const unsigned int y)
 +{
 +    if (xr <= xl) {
 +        return 0;
 +    }
 +    else
 +    {
++#if HAVE_ARMV6T2_INLINE
++#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0
++#error This case not yet handled in bs_get32
++#elif HEVC_RPI_BS_STRIDE1_BYTES < 4
++#error Stride1 < return size
++#endif
++        uint32_t tmp; // scratch; the asm computes the same value as the C expression in the #else branch
++        __asm__ (
++            "lsr         %[tmp], %[xl], %[xl_shift]                  \n\t" // tmp = xl >> STRIDE1_PEL_SHIFT
++            "rsb         %[xr], %[xl], %[xr]                         \n\t" // xr = xr - xl
++            "mla         %[stride2], %[stride2], %[tmp], %[bs]       \n\t" // stride2 = bs + stride2 * tmp
++            "add         %[xr], %[xr], #7                            \n\t" // xr += 7 ...
++            "lsr         %[bs], %[y], %[y_shift1]                    \n\t" // bs = y >> BS_Y_SHR (row index)
++            "bic         %[xr], %[xr], #7                            \n\t" // ... xr &= ~7 (round width up to a multiple of 8)
++            "ubfx        %[xl], %[xl], #1, #5                        \n\t" // xl = (xl >> 1) & 31 (bit offset in the word)
++            "lsr         %[xr], %[xr], #1                            \n\t" // xr = n, the number of result bits
++            "cmp         %[xr], #32                                  \n\t" // flags: is n == 32?
++            "mvn         %[tmp], #0                                  \n\t" // tmp = ~0
++            "ldr         %[bs], [%[stride2], %[bs], lsl %[y_shift2]] \n\t" // bs = 32-bit word at stride2 + (row << STRIDE1_BYTE_SHIFT)
++            "lsl         %[tmp], %[tmp], %[xr]                       \n\t" // tmp = ~0 << n
++            "lsr         %[xl], %[bs], %[xl]                         \n\t" // xl = word >> bit offset
++            "it ne                                                   \n\t"
++            "bicne       %[bs], %[xl], %[tmp]                        \n\t" // n != 32: bs = shifted word & ~(~0 << n); n == 32 keeps the whole word
++            :  // Outputs
++                      [bs]"+r"(bs),
++                 [stride2]"+r"(stride2),
++                      [xl]"+r"(xl),
++                      [xr]"+r"(xr),
++                     [tmp]"=&r"(tmp)
++            :  // Inputs
++                       [y]"r"(y),
++                [xl_shift]"M"(HEVC_RPI_BS_STRIDE1_PEL_SHIFT),
++                [y_shift1]"M"(HEVC_RPI_BS_Y_SHR),
++                [y_shift2]"M"(HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
++            :  // Clobbers
++                "cc" // NOTE(review): the ldr reads memory but no "m" input or "memory" clobber is declared - confirm this is safe
++        );
++        return (uint32_t) bs; // bs was reused as the result register by the asm above
++#else
 +        const uint32_t a = *bs_ptr32(bs, stride2, xl, y);
 +        const unsigned int n = ((xr - xl + 7) & ~7) >> 1;
 +
 +        return n == 32 ? a :
 +            (a >> ((xl >> 1) & 31)) & ~(~0U << n);
++#endif
 +    }
 +}
 +
@@ -18335,14 +18904,15 @@ index 0000000000..8e7695bcf9
 +
 +static inline uint32_t bsf_mv(const HEVCRpiContext * const s,
 +                              const unsigned int rep, const unsigned int dup,
-+                              const unsigned int mvf_stride,
++                              const unsigned int mvf_stride0,
++                              const unsigned int mvf_stride1,
 +                              const RefPicList * const rpl_p, const RefPicList * const rpl_q,
-+                              const MvField * const mvf_p, const MvField * const mvf_q)
++                              const HEVCRpiMvField * const mvf_p, const HEVCRpiMvField * const mvf_q)
 +{
 +    return s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup,
 +            mvf_p, mvf_q,
 +            rpl_p[0].list, rpl_p[1].list, rpl_q[0].list, rpl_q[1].list,
-+            sizeof(MvField) * mvf_stride);
++            sizeof(HEVCRpiMvField) * mvf_stride0, sizeof(HEVCRpiMvField) * mvf_stride1);
 +}
 +
 +
@@ -18352,13 +18922,11 @@ index 0000000000..8e7695bcf9
 +                                               const unsigned int log2_trafo_size,
 +                                               const int is_coded_block)
 +{
-+    const MvField * const tab_mvf       = s->ref->tab_mvf;
-+    const unsigned int log2_min_pu_size = s->ps.sps->log2_min_pu_size;
-+    const unsigned int mvf_stride       = s->ps.sps->min_pu_width;  // width in pus; mvf stride
-+    const RefPicList * const rpl        = s->ref->refPicList;
++    const HEVCRpiMvField * const mvf_curr      = mvf_stash_ptr(s, lc, x0, y0);
++    const unsigned int log2_min_pu_size = LOG2_MIN_PU_SIZE;
++    const RefPicList * const rpl        = s->refPicList;
 +    // Rep count for bsf_mv when running with min_pu chuncks
 +    const unsigned int log2_rep_min_pu  = log2_trafo_size <= log2_min_pu_size ? 0 : log2_trafo_size - log2_min_pu_size;
-+    const MvField * const mvf_curr      = tab_mvf + (y0 >> log2_min_pu_size) * mvf_stride + (x0 >> log2_min_pu_size);
 +    const unsigned int boundary_flags   = s->sh.no_dblk_boundary_flags & lc->boundary_flags;
 +    const unsigned int trafo_size       = (1U << log2_trafo_size);
 +    const uint32_t bsf_mask             = log2_trafo_size > 5 ? ~0U : (1U << (trafo_size >> 1)) - 1;
@@ -18451,14 +19019,15 @@ index 0000000000..8e7695bcf9
 +                // If we aren't on the top boundary we must be in the middle
 +                // and in that case we know where mvf can change
 +                const unsigned int log2_rep = (y0 == lc->cu.y) ? log2_rep_min_pu : has_x_split ? 1 : 0;
-+                const RefPicList *const rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ?
-+                                      ff_hevc_rpi_get_ref_list(s, s->ref, x0, y0 - 1) :
-+                                      rpl;
++                const RefPicList *const rpl_top = !off_boundary(y0, s->ps.sps->log2_ctb_size) ?
++                      s->rpl_up[x0 >> s->ps.sps->log2_ctb_size] :
++                      rpl;
 +
 +                bsf_h |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
 +                    trafo_size >> (log2_min_pu_size + log2_rep),
++                    trafo_size >> (log2_min_pu_size + log2_rep),
 +                    rpl, rpl_top,
-+                    mvf_curr, mvf_curr - mvf_stride);
++                    mvf_curr, mvf_ptr(s, lc, x0, y0, x0, y0 - 1));
 +            }
 +
 +            // Finally put the results into bs
@@ -18468,16 +19037,16 @@ index 0000000000..8e7695bcf9
 +        // Max of 1 pu internal split - ignore if not on 8pel boundary
 +        if (has_y_split && !off_boundary(lc->cu.y_split, 3))
 +        {
-+            const MvField * const mvf = tab_mvf +
-+                (lc->cu.y_split >> log2_min_pu_size) * mvf_stride + (x0 >> log2_min_pu_size);
++            const HEVCRpiMvField * const mvf = mvf_stash_ptr(s, lc, x0, lc->cu.y_split);
 +            // If we have the x split as well then it must be in the middle
 +            const unsigned int log2_rep = has_x_split ? 1 : 0;
 +
 +            hbs_set(s, x0, lc->cu.y_split, bsf_mask,
 +                bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
 +                   trafo_size >> (log2_min_pu_size + log2_rep),
++                   trafo_size >> (log2_min_pu_size + log2_rep),
 +                   rpl, rpl,
-+                   mvf, mvf - mvf_stride));
++                   mvf, mvf - MVF_STASH_WIDTH_PU));
 +        }
 +    }
 +
@@ -18492,14 +19061,15 @@ index 0000000000..8e7695bcf9
 +            if ((~bsf_v & bsf_cbf) != 0 && (x0 == lc->cu.x || x0 == lc->cu.x_split))
 +            {
 +                const unsigned int log2_rep = (x0 == lc->cu.x) ? log2_rep_min_pu : has_y_split ? 1 : 0;
-+                const RefPicList *const rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ?
-+                                       ff_hevc_rpi_get_ref_list(s, s->ref, x0 - 1, y0) :
-+                                       rpl;
++                const RefPicList *const rpl_left = !off_boundary(x0, s->ps.sps->log2_ctb_size) ?
++                    s->rpl_left[y0 >> s->ps.sps->log2_ctb_size] :
++                    rpl;
 +
 +                bsf_v |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
-+                    (mvf_stride << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
++                    (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
++                    (mvf_left_stride(s, x0, x0 - 1) << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
 +                    rpl, rpl_left,
-+                    mvf_curr, mvf_curr - 1);
++                    mvf_curr, mvf_ptr(s, lc, x0, y0, x0 - 1, y0));
 +            }
 +
 +            vbs_set(s, x0, y0, bsf_mask, bsf_v);
@@ -18507,13 +19077,13 @@ index 0000000000..8e7695bcf9
 +
 +        if (has_x_split && !off_boundary(lc->cu.x_split, 3))
 +        {
-+            const MvField * const mvf = tab_mvf +
-+                (y0 >> log2_min_pu_size) * mvf_stride + (lc->cu.x_split >> log2_min_pu_size);
++            const HEVCRpiMvField *const mvf = mvf_stash_ptr(s, lc, lc->cu.x_split, y0);
 +            const unsigned int log2_rep = has_y_split ? 1 : 0;
 +
 +            vbs_set(s, lc->cu.x_split, y0, bsf_mask,
 +                bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
-+                   (mvf_stride << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
++                   (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
++                   (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
 +                   rpl, rpl,
 +                   mvf, mvf - 1));
 +        }
@@ -18618,12 +19188,89 @@ index 0000000000..8e7695bcf9
 +    return y;
 +}
 +
+diff --git a/libavcodec/rpi_hevc_mv.h b/libavcodec/rpi_hevc_mv.h
+new file mode 100644
+index 0000000000..6b36f5e737
+--- /dev/null
++++ b/libavcodec/rpi_hevc_mv.h
+@@ -0,0 +1,71 @@
++#ifndef AVCODEC_RPI_HEVC_MV_H
++#define AVCODEC_RPI_HEVC_MV_H
++
++#include "config.h"
++
++typedef int32_t MvXY;
++
++typedef struct HEVCRpiMvField {
++    MvXY xy[2];        // Packed x/y motion vectors (see MV_X / MV_Y / MV_XY), one per list index 0/1
++    int8_t ref_idx[2]; // Reference index for each of the two lists
++    int8_t pred_flag;  // Presumably a PF_xx bit mask selecting which of the two entries are in use - TODO confirm
++    int8_t dummy; // To 12 bytes
++} HEVCRpiMvField;
++
++
++#define MV_X(xy) (((xy) << 16) >> 16) // x component: low 16 bits, sign-extended
++#define MV_Y(xy) ((xy) >> 16) // y component: high 16 bits
++#define MV_XY(x, y) ((int32_t)(((uint32_t)(x) & 0xffffU) | ((uint32_t)(y) << 16))) // pack x (low 16) and y (high 16); args parenthesised, unsigned math avoids UB on negative y
++
++#if ARCH_ARM
++#include "arm/rpi_hevc_mv_arm.h"
++#endif
++
++#ifndef mvxy_add
++static inline MvXY mvxy_add(const MvXY a, const MvXY b)
++{   // Generic fallback, used only when no mvxy_add macro has been defined before this point
++    return MV_XY(MV_X(a) + MV_X(b), MV_Y(a) + MV_Y(b)); // unpack, add per component, repack (MV_XY masks x to 16 bits)
++}
++#endif
++
++
++#ifndef mv_scale_xy
++static inline MvXY mv_scale_xy(const MvXY src, int td, int tb)
++{
++    int tx, scale_factor;
++    // Scales both packed components of src by roughly tb / td in fixed point; returns the repacked result
++    td = td == 0 ? 1 : av_clip_int8(td); // td == 0 is mapped to 1 so the division below cannot trap
++    tb = av_clip_int8(tb);
++    tx = (0x4000 + (abs(td) >> 1)) / td; // ~16384 / td, rounded
++    scale_factor = av_clip_intp2((tb * tx + 32) >> 6, 12); // ~256 * tb / td, clipped
++    return MV_XY(
++        av_clip_int16((scale_factor * MV_X(src) + 127 +
++                           (scale_factor * MV_X(src) < 0)) >> 8),
++        av_clip_int16((scale_factor * MV_Y(src) + 127 +
++                           (scale_factor * MV_Y(src) < 0)) >> 8));
++}
++#endif
++
++// 8.3.1 states that the bitstream may not contain poc diffs that do not
++// fit in 16 bits, so given that we don't care about the high bits we only
++// store the low 16 + LT & Inter flags
++
++#define COL_POC_INTRA   0
++#define COL_POC_INTER   (1 << 16)
++#define COL_POC_LT      (1 << 17)
++#define COL_POC_DIFF(x,y) ((int16_t)((x) - (y)))
++#define COL_POC_MAKE_INTER(lt,poc) (COL_POC_INTER | ((lt) ? COL_POC_LT : 0) | ((poc) & 0xffff))
++#define COL_POC_IS_LT(x) (((x) & COL_POC_LT) != 0)
++
++typedef struct ColMv_s {
++    int32_t poc; // COL_POC_INTRA, or flags plus low 16 POC bits (presumably built with COL_POC_MAKE_INTER - TODO confirm)
++    int32_t xy;  // Presumably a packed MvXY motion vector - TODO confirm
++} ColMv;
++
++typedef struct ColMvField_s {
++    ColMv L[2]; // One entry per list, indexed 0 / 1
++} ColMvField;
++
++
++
++#endif // AVCODEC_RPI_HEVC_MV_H
 diff --git a/libavcodec/rpi_hevc_mvs.c b/libavcodec/rpi_hevc_mvs.c
 new file mode 100644
-index 0000000000..163e2558dc
+index 0000000000..221755fb6e
 --- /dev/null
 +++ b/libavcodec/rpi_hevc_mvs.c
-@@ -0,0 +1,681 @@
+@@ -0,0 +1,486 @@
 +/*
 + * HEVC video decoder
 + *
@@ -18650,152 +19297,44 @@ index 0000000000..163e2558dc
 +#include "hevc.h"
 +#include "rpi_hevcdec.h"
 +
-+static const uint8_t l0_l1_cand_idx[12][2] = {
-+    { 0, 1, },
-+    { 1, 0, },
-+    { 0, 2, },
-+    { 2, 0, },
-+    { 1, 2, },
-+    { 2, 1, },
-+    { 0, 3, },
-+    { 3, 0, },
-+    { 1, 3, },
-+    { 3, 1, },
-+    { 2, 3, },
-+    { 3, 2, },
-+};
-+
-+
-+//check if the two luma locations belong to the same motion estimation region
-+static av_always_inline int is_diff_mer(const HEVCRpiContext * const s, int xN, int yN, int xP, int yP)
++static av_always_inline int
++is_eq_mer(const unsigned int plevel,
++    const unsigned int xN, const unsigned int yN,
++    const unsigned int xP, const unsigned int yP)
 +{
-+    uint8_t plevel = s->ps.pps->log2_parallel_merge_level;
-+
-+    return xN >> plevel == xP >> plevel &&
-+           yN >> plevel == yP >> plevel;
++    return (((xN ^ xP) | (yN ^ yP)) >> plevel) == 0; // non-zero iff (xN,yN) and (xP,yP) agree in every bit at or above plevel
 +}
 +
-+#define MATCH_MV(x) (AV_RN32A(&A.x) == AV_RN32A(&B.x))
-+#define MATCH(x) (A.x == B.x)
-+
 +// check if the mv's and refidx are the same between A and B
-+static av_always_inline int compare_mv_ref_idx(const struct MvField A, const struct MvField B)
++static av_always_inline int compare_mv_ref_idx(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b)
 +{
-+    int a_pf = A.pred_flag;
-+    int b_pf = B.pred_flag;
-+    if (a_pf == b_pf) {
-+        if (a_pf == PF_BI) {
-+            return MATCH(ref_idx[0]) && MATCH_MV(mv[0]) &&
-+                   MATCH(ref_idx[1]) && MATCH_MV(mv[1]);
-+        } else if (a_pf == PF_L0) {
-+            return MATCH(ref_idx[0]) && MATCH_MV(mv[0]);
-+        } else if (a_pf == PF_L1) {
-+            return MATCH(ref_idx[1]) && MATCH_MV(mv[1]);
-+        }
-+    }
++    return a->pred_flag == b->pred_flag &&
++        ((a->pred_flag & PF_L0) == 0 || (a->ref_idx[0] == b->ref_idx[0] && a->xy[0] == b->xy[0])) &&
++        ((a->pred_flag & PF_L1) == 0 || (a->ref_idx[1] == b->ref_idx[1] && a->xy[1] == b->xy[1]));
 +    return 0;
 +}
 +
-+static av_always_inline void mv_scale(Mv * const dst, const Mv * const src, int td, int tb)
-+{
-+    int tx, scale_factor;
-+
-+    td = av_clip_int8(td);
-+    tb = av_clip_int8(tb);
-+    tx = (0x4000 + abs(td / 2)) / td;
-+    scale_factor = av_clip_intp2((tb * tx + 32) >> 6, 12);
-+    dst->x = av_clip_int16((scale_factor * src->x + 127 +
-+                           (scale_factor * src->x < 0)) >> 8);
-+    dst->y = av_clip_int16((scale_factor * src->y + 127 +
-+                           (scale_factor * src->y < 0)) >> 8);
-+}
-+
-+static int check_mvset(Mv * const mvLXCol, const Mv * const mvCol,
-+                       const int colPic, const int poc,
-+                       const RefPicList * const refPicList, const int X, const int refIdxLx,
-+                       const RefPicList * const refPicList_col, const int listCol, const int refidxCol)
-+{
-+    int cur_lt = refPicList[X].isLongTerm[refIdxLx];
-+    int col_lt = refPicList_col[listCol].isLongTerm[refidxCol];
-+    int col_poc_diff, cur_poc_diff;
-+
-+    if (cur_lt != col_lt) {
-+        mvLXCol->x = 0;
-+        mvLXCol->y = 0;
-+        return 0;
-+    }
-+
-+    col_poc_diff = colPic - refPicList_col[listCol].list[refidxCol];
-+    cur_poc_diff = poc    - refPicList[X].list[refIdxLx];
-+
-+    if (cur_lt || col_poc_diff == cur_poc_diff || !col_poc_diff) {
-+        mvLXCol->x = mvCol->x;
-+        mvLXCol->y = mvCol->y;
-+    } else {
-+        mv_scale(mvLXCol, mvCol, col_poc_diff, cur_poc_diff);
-+    }
-+    return 1;
-+}
-+
-+#define CHECK_MVSET(l)                                          \
-+    check_mvset(mvLXCol, temp_col.mv + l,                       \
-+                colPic, s->poc,                                 \
-+                refPicList, X, refIdxLx,                        \
-+                refPicList_col, L ## l, temp_col.ref_idx[l])
-+
-+// derive the motion vectors section 8.5.3.2.8
-+static int derive_temporal_colocated_mvs(const HEVCRpiContext * const s, const MvField temp_col,
-+                                         const int refIdxLx, Mv * const mvLXCol, const int X,
-+                                         const int colPic, const RefPicList * const refPicList_col)
-+{
-+    const RefPicList * const refPicList = s->ref->refPicList;
-+
-+    if (temp_col.pred_flag == PF_INTRA)
-+        return 0;
-+
-+    if (temp_col.pred_flag == PF_L0 ||
-+        (temp_col.pred_flag == PF_BI && (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0)))
-+    {
-+        return CHECK_MVSET(0);
-+    }
-+    return CHECK_MVSET(1);
-+}
-+
-+#define TAB_MVF(x, y)                                                   \
-+    tab_mvf[(y) * min_pu_width + x]
-+
-+#define TAB_MVF_PU(v)                                                   \
-+    TAB_MVF(((x ## v) >> s->ps.sps->log2_min_pu_size),                     \
-+            ((y ## v) >> s->ps.sps->log2_min_pu_size))
-+
-+#define DERIVE_TEMPORAL_COLOCATED_MVS                                   \
-+    derive_temporal_colocated_mvs(s, temp_col,                          \
-+                                  refIdxLx, mvLXCol, X, colPic,         \
-+                                  ff_hevc_rpi_get_ref_list(s, ref, x, y))
-+
 +/*
 + * 8.5.3.1.7  temporal luma motion vector prediction
 + */
-+static int temporal_luma_motion_vector(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0,
++static int temporal_luma_motion_vector(const HEVCRpiContext * const s,
++                                       const HEVCRpiLocalContext * const lc, const int x0, const int y0,
 +                                       const int nPbW, const int nPbH, const int refIdxLx,
-+                                       Mv * const mvLXCol, const int X)
++                                       MvXY * const mvLXCol, const int X)
 +{
-+    MvField *tab_mvf;
-+    MvField temp_col;
-+    int x, y, x_pu, y_pu;
-+    const int min_pu_width = s->ps.sps->min_pu_width;
-+    int availableFlagLXCol = 0;
-+    int colPic;
++    int x, y;
++    const ColMv * cmv = NULL;
 +
-+    HEVCFrame * const ref = s->ref->collocated_ref;
++    HEVCRpiFrame * const col_ref = s->ref->collocated_ref;
++    const RefPicList * const refPicList = s->refPicList + X;
++    const int cur_lt = refPicList->isLongTerm[refIdxLx];
 +
-+    if (ref == NULL || ref->tab_mvf == NULL) {
-+        memset(mvLXCol, 0, sizeof(*mvLXCol));
++    *mvLXCol = 0;
++    // Unlikely but we might have a col_ref IDR frame!
++    if (col_ref->col_mvf == NULL)
 +        return 0;
-+    }
 +
-+    tab_mvf = ref->tab_mvf;
-+    colPic  = ref->poc;
++    ff_hevc_rpi_progress_wait_mv(s, lc->jb0, col_ref, y0 + nPbH);
 +
 +    //bottom right collocated motion vector
 +    x = x0 + nPbW;
@@ -18803,508 +19342,421 @@ index 0000000000..163e2558dc
 +
 +    if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) &&
 +        y < s->ps.sps->height &&
-+        x < s->ps.sps->width) {
-+        x                 &= ~15;
-+        y                 &= ~15;
-+        ff_hevc_rpi_progress_wait_mv(s, lc->jb0, ref, y);
-+        x_pu               = x >> s->ps.sps->log2_min_pu_size;
-+        y_pu               = y >> s->ps.sps->log2_min_pu_size;
-+        temp_col           = TAB_MVF(x_pu, y_pu);
-+        availableFlagLXCol = DERIVE_TEMPORAL_COLOCATED_MVS;
++        x < s->ps.sps->width)
++    {
++        const ColMvField * const col = col_ref->col_mvf + (x >> 4) +
++            (y >> 4) * s->col_mvf_stride;
++
++        if (col->L[0].poc != COL_POC_INTRA &&
++            (col->L[1].poc == COL_POC_INTRA ||
++             (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0)))
++        {
++            cmv = col->L + 0;
++        }
++        else if (col->L[1].poc != COL_POC_INTRA)
++        {
++            cmv = col->L + 1;
++        }
 +    }
 +
 +    // derive center collocated motion vector
-+    if (!availableFlagLXCol) {
++    if (cmv == NULL || COL_POC_IS_LT(cmv->poc) != cur_lt)
++    {
++        cmv = NULL;
 +        x                  = x0 + (nPbW >> 1);
 +        y                  = y0 + (nPbH >> 1);
-+        x                 &= ~15;
-+        y                 &= ~15;
-+        ff_hevc_rpi_progress_wait_mv(s, lc->jb0, ref, y);
-+        x_pu               = x >> s->ps.sps->log2_min_pu_size;
-+        y_pu               = y >> s->ps.sps->log2_min_pu_size;
-+        temp_col           = TAB_MVF(x_pu, y_pu);
-+        availableFlagLXCol = DERIVE_TEMPORAL_COLOCATED_MVS;
-+    }
-+    return availableFlagLXCol;
-+}
-+
-+#define AVAILABLE(cand, v)                                      \
-+    (cand && !(TAB_MVF_PU(v).pred_flag == PF_INTRA))
-+
-+#define COMPARE_MV_REFIDX(a, b)                                 \
-+    compare_mv_ref_idx(TAB_MVF_PU(a), TAB_MVF_PU(b))
 +
-+/*
-+ * 8.5.3.1.2  Derivation process for spatial merging candidates
-+ */
-+static void derive_spatial_merge_candidates(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0,
-+                                            int nPbW, int nPbH,
-+                                            int log2_cb_size, const unsigned int avail,
-+                                            int singleMCLFlag, int part_idx,
-+                                            int merge_idx,
-+                                            struct MvField mergecandlist[])
-+{
-+    const RefPicList * const refPicList = s->ref->refPicList;
-+    const MvField * const tab_mvf       = s->ref->tab_mvf;
-+
-+    const int min_pu_width = s->ps.sps->min_pu_width;
-+    const int xA1    = x0 - 1;
-+    const int yA1    = y0 + nPbH - 1;
-+
-+    const int xB1    = x0 + nPbW - 1;
-+    const int yB1    = y0 - 1;
++        {
++            const ColMvField * const col = col_ref->col_mvf + (x >> 4) +
++              (y >> 4) * s->col_mvf_stride;
 +
-+    const int xB0    = x0 + nPbW;
-+    const int yB0    = y0 - 1;
++            if (col->L[0].poc != COL_POC_INTRA &&
++              (col->L[1].poc == COL_POC_INTRA ||
++               (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0)))
++            {
++              cmv = col->L + 0;
++            }
++            else if (col->L[1].poc != COL_POC_INTRA)
++            {
++              cmv = col->L + 1;
++            }
++        }
++    }
 +
-+    const int xA0    = x0 - 1;
-+    const int yA0    = y0 + nPbH;
++    if (cmv == NULL || cur_lt != COL_POC_IS_LT(cmv->poc))
++        return 0;
 +
-+    const int xB2    = x0 - 1;
-+    const int yB2    = y0 - 1;
++    {
++        const int col_poc  = col_ref->poc;
++        const int ref_poc  = refPicList->list[refIdxLx];
 +
-+    const int nb_refs = (s->sh.slice_type == HEVC_SLICE_P) ?
-+                        s->sh.nb_refs[0] : FFMIN(s->sh.nb_refs[0], s->sh.nb_refs[1]);
++        *mvLXCol = (cur_lt ||
++                        cmv->poc == col_poc ||
++                        COL_POC_DIFF(col_poc, cmv->poc) == s->poc - ref_poc) ?
++                    cmv->xy :
++                    mv_scale_xy(cmv->xy, COL_POC_DIFF(col_poc, cmv->poc), s->poc - ref_poc);
++    }
 +
-+    int zero_idx = 0;
++    return cmv != NULL;
++}
 +
-+    int nb_merge_cand = 0;
-+    int nb_orig_merge_cand = 0;
++static inline int mvf_eq(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b)
++{
++    return b != NULL && compare_mv_ref_idx(a, b);
++}
 +
-+    int is_available_a0;
-+    int is_available_a1;
-+    int is_available_b0;
-+    int is_available_b1;
-+    int is_available_b2;
 +
 +
-+    if (!singleMCLFlag && part_idx == 1 &&
-+        (lc->cu.part_mode == PART_Nx2N ||
-+         lc->cu.part_mode == PART_nLx2N ||
-+         lc->cu.part_mode == PART_nRx2N) ||
-+        is_diff_mer(s, xA1, yA1, x0, y0)) {
-+        is_available_a1 = 0;
-+    } else {
-+        is_available_a1 = AVAILABLE((avail & AVAIL_L) != 0, A1);
-+        if (is_available_a1) {
-+            mergecandlist[nb_merge_cand] = TAB_MVF_PU(A1);
-+            if (merge_idx == 0)
-+                return;
-+            nb_merge_cand++;
-+        }
++/*
++ * 8.5.3.1.2  Derivation process for spatial merging candidates
++ */
++static inline const HEVCRpiMvField *
++derive_spatial_merge_candidates(
++    const HEVCRpiContext * const s,
++    const HEVCRpiLocalContext * const lc,
++    const unsigned int x0, const unsigned int y0,
++    const unsigned int nPbW, const unsigned int nPbH,
++    const unsigned int avail,
++    const unsigned int part_idx,
++    const unsigned int merge_idx,
++    HEVCRpiMvField * const mvf_t)
++{
++    const unsigned int parts_a1 = (1 << PART_Nx2N) | (1 << PART_nLx2N) | (1 << PART_nRx2N);
++    const unsigned int parts_b1 = (1 << PART_2NxN) | (1<< PART_2NxnU) | (1 << PART_2NxnD);
++
++    const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1);
++    const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1);
++    const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1);
++    const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1;
++    const unsigned int plevel = s->ps.pps->log2_parallel_merge_level;
++    const unsigned int part_mode = lc->cu.part_mode;
++
++    const HEVCRpiMvField * perm[4];
++    unsigned int nb_merge_cand = 0;
++
++    // singleMCLFlag => part_idx == 0 so no need to test for it
++    if ((avail & AVAIL_L) == 0 ||
++        (part_idx == 1 &&
++            ((parts_a1 >> part_mode) & 1) != 0 ||
++                is_eq_mer(plevel, x0 - 1, y0 + nPbH - 1, x0, y0)) ||
++        mvf_a1->pred_flag == PF_INTRA)
++    {
++        mvf_a1 = NULL;
++    }
++    else
++    {
++        if (merge_idx == nb_merge_cand)
++            return mvf_a1;
++        perm[nb_merge_cand++] = mvf_a1;
 +    }
 +
-+    if (!singleMCLFlag && part_idx == 1 &&
-+        (lc->cu.part_mode == PART_2NxN ||
-+         lc->cu.part_mode == PART_2NxnU ||
-+         lc->cu.part_mode == PART_2NxnD) ||
-+        is_diff_mer(s, xB1, yB1, x0, y0)) {
-+        is_available_b1 = 0;
-+    } else {
-+        is_available_b1 = AVAILABLE((avail & AVAIL_U) != 0, B1);
-+        if (is_available_b1 &&
-+            !(is_available_a1 && COMPARE_MV_REFIDX(B1, A1))) {
-+            mergecandlist[nb_merge_cand] = TAB_MVF_PU(B1);
-+            if (merge_idx == nb_merge_cand)
-+                return;
-+            nb_merge_cand++;
-+        }
++    if ((avail & AVAIL_U) == 0 ||
++            (part_idx == 1 &&
++               ((parts_b1 >> part_mode) & 1) != 0 ||
++                   is_eq_mer(plevel, x0 + nPbW - 1, y0 - 1, x0, y0)) ||
++            mvf_b1->pred_flag == PF_INTRA)
++    {
++        mvf_b1 = NULL;
++    }
++    else if (!mvf_eq(mvf_b1, mvf_a1))
++    {
++        if (merge_idx == nb_merge_cand)
++            return mvf_b1;
++        perm[nb_merge_cand++] = mvf_b1;
 +    }
 +
 +    // above right spatial merge candidate
-+    is_available_b0 = AVAILABLE((avail & AVAIL_UR) != 0, B0) &&
-+                      !is_diff_mer(s, xB0, yB0, x0, y0);
-+
-+    if (is_available_b0 &&
-+        !(is_available_b1 && COMPARE_MV_REFIDX(B0, B1))) {
-+        mergecandlist[nb_merge_cand] = TAB_MVF_PU(B0);
++    // Never need mvf_b0 again so don't bother zeroing if navail
++    if ((avail & AVAIL_UR) != 0 &&
++        !is_eq_mer(plevel, x0 + nPbW, y0 - 1, x0, y0) &&
++        mvf_b0->pred_flag != PF_INTRA &&
++        !mvf_eq(mvf_b0, mvf_b1))
++    {
 +        if (merge_idx == nb_merge_cand)
-+            return;
-+        nb_merge_cand++;
++            return mvf_b0;
++        perm[nb_merge_cand++] = mvf_b0;
 +    }
 +
 +    // left bottom spatial merge candidate
-+    is_available_a0 = AVAILABLE((avail & AVAIL_DL) != 0, A0) &&
-+                      !is_diff_mer(s, xA0, yA0, x0, y0);
-+
-+    if (is_available_a0 &&
-+        !(is_available_a1 && COMPARE_MV_REFIDX(A0, A1))) {
-+        mergecandlist[nb_merge_cand] = TAB_MVF_PU(A0);
++    // Never need mvf_a0 again so don't bother zeroing if navail
++    if ((avail & AVAIL_DL) != 0 &&
++        !is_eq_mer(plevel, x0 - 1, y0 + nPbH, x0, y0) &&
++        mvf_a0->pred_flag != PF_INTRA &&
++        !mvf_eq(mvf_a0, mvf_a1))
++    {
 +        if (merge_idx == nb_merge_cand)
-+            return;
-+        nb_merge_cand++;
++            return mvf_a0;
++        perm[nb_merge_cand++] = mvf_a0;
 +    }
 +
 +    // above left spatial merge candidate
-+    is_available_b2 = AVAILABLE((avail & AVAIL_UL) != 0, B2) &&
-+                      !is_diff_mer(s, xB2, yB2, x0, y0);
-+
-+    if (is_available_b2 &&
-+        !(is_available_a1 && COMPARE_MV_REFIDX(B2, A1)) &&
-+        !(is_available_b1 && COMPARE_MV_REFIDX(B2, B1)) &&
-+        nb_merge_cand != 4) {
-+        mergecandlist[nb_merge_cand] = TAB_MVF_PU(B2);
-+        if (merge_idx == nb_merge_cand)
-+            return;
-+        nb_merge_cand++;
++    if (nb_merge_cand != 4 &&
++        (avail & AVAIL_UL) != 0 &&
++        !is_eq_mer(plevel, x0 - 1, y0 - 1, x0, y0))
++    {
++        const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1);  // UL
++
++        if (mvf_b2->pred_flag != PF_INTRA &&
++            !mvf_eq(mvf_b2, mvf_a1) &&
++            !mvf_eq(mvf_b2, mvf_b1))
++        {
++            if (merge_idx == nb_merge_cand)
++                return mvf_b2;
++            perm[nb_merge_cand++] = mvf_b2;
++        }
 +    }
 +
 +    // temporal motion vector candidate
-+    if (s->sh.slice_temporal_mvp_enabled_flag &&
-+        nb_merge_cand < s->sh.max_num_merge_cand) {
-+        Mv mv_l0_col = { 0 }, mv_l1_col = { 0 };
-+        int available_l0 = temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH,
-+                                                       0, &mv_l0_col, 0);
-+        int available_l1 = (s->sh.slice_type == HEVC_SLICE_B) ?
-+                           temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH,
-+                                                       0, &mv_l1_col, 1) : 0;
-+
-+        if (available_l0 || available_l1) {
-+            mergecandlist[nb_merge_cand].pred_flag = available_l0 + (available_l1 << 1);
-+            AV_ZERO16(mergecandlist[nb_merge_cand].ref_idx);
-+            mergecandlist[nb_merge_cand].mv[0]      = mv_l0_col;
-+            mergecandlist[nb_merge_cand].mv[1]      = mv_l1_col;
++    if (s->sh.slice_temporal_mvp_enabled_flag)
++    {
++        static const HEVCRpiMvField mvf_z = {{0}};
++
++        *mvf_t = mvf_z;
++
++        if (temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH,
++                                        0, mvf_t->xy + 0, 0))
++            mvf_t->pred_flag = PF_L0;
++
++        if (s->sh.slice_type == HEVC_SLICE_B &&
++                temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH,
++                                            0, mvf_t->xy + 1, 1))
++            mvf_t->pred_flag |= PF_L1;
 +
++        if (mvf_t->pred_flag != 0)
++        {
 +            if (merge_idx == nb_merge_cand)
-+                return;
-+            nb_merge_cand++;
++                return mvf_t;
++            perm[nb_merge_cand++] = mvf_t;
 +        }
 +    }
 +
-+    nb_orig_merge_cand = nb_merge_cand;
-+
 +    // combined bi-predictive merge candidates  (applies for B slices)
-+    if (s->sh.slice_type == HEVC_SLICE_B && nb_orig_merge_cand > 1 &&
-+        nb_orig_merge_cand < s->sh.max_num_merge_cand) {
-+        int comb_idx = 0;
-+
-+        for (comb_idx = 0; nb_merge_cand < s->sh.max_num_merge_cand &&
-+                           comb_idx < nb_orig_merge_cand * (nb_orig_merge_cand - 1); comb_idx++) {
-+            int l0_cand_idx = l0_l1_cand_idx[comb_idx][0];
-+            int l1_cand_idx = l0_l1_cand_idx[comb_idx][1];
-+            MvField l0_cand = mergecandlist[l0_cand_idx];
-+            MvField l1_cand = mergecandlist[l1_cand_idx];
-+
-+            if ((l0_cand.pred_flag & PF_L0) && (l1_cand.pred_flag & PF_L1) &&
-+                (refPicList[0].list[l0_cand.ref_idx[0]] !=
-+                 refPicList[1].list[l1_cand.ref_idx[1]] ||
-+                 AV_RN32A(&l0_cand.mv[0]) != AV_RN32A(&l1_cand.mv[1]))) {
-+                mergecandlist[nb_merge_cand].ref_idx[0]   = l0_cand.ref_idx[0];
-+                mergecandlist[nb_merge_cand].ref_idx[1]   = l1_cand.ref_idx[1];
-+                mergecandlist[nb_merge_cand].pred_flag    = PF_BI;
-+                AV_COPY32(&mergecandlist[nb_merge_cand].mv[0], &l0_cand.mv[0]);
-+                AV_COPY32(&mergecandlist[nb_merge_cand].mv[1], &l1_cand.mv[1]);
-+                if (merge_idx == nb_merge_cand)
-+                    return;
-+                nb_merge_cand++;
++    if (s->sh.slice_type == HEVC_SLICE_B && nb_merge_cand > 1)
++    {
++        unsigned int comb_idx = 0;
++        const unsigned int cand_count = nb_merge_cand * (nb_merge_cand - 1);
++        const RefPicList * const refPicList = s->refPicList;
++
++        for (comb_idx = 0; comb_idx < cand_count; comb_idx++)
++        {
++            static const uint8_t l0_l1_cand_idx[12][2] = {
++                { 0, 1, },
++                { 1, 0, },
++                { 0, 2, },
++                { 2, 0, },
++                { 1, 2, },
++                { 2, 1, },
++                { 0, 3, },
++                { 3, 0, },
++                { 1, 3, },
++                { 3, 1, },
++                { 2, 3, },
++                { 3, 2, },
++            };
++
++            const unsigned int l0_cand_idx = l0_l1_cand_idx[comb_idx][0];
++            const unsigned int l1_cand_idx = l0_l1_cand_idx[comb_idx][1];
++            const HEVCRpiMvField * const mvf_c0 = perm[l0_cand_idx];
++            const HEVCRpiMvField * const mvf_c1 = perm[l1_cand_idx];
++
++            if ((mvf_c0->pred_flag & PF_L0) != 0 &&
++                (mvf_c1->pred_flag & PF_L1) != 0 &&
++                (refPicList[0].list[mvf_c0->ref_idx[0]] != refPicList[1].list[mvf_c1->ref_idx[1]] ||
++                 mvf_c0->xy[0] != mvf_c1->xy[1]))
++            {
++                if (merge_idx == nb_merge_cand++)
++                {
++                    // Need to be a bit careful as we will construct mvf_t and we
++                    // may already be using that as one of our candidates
++                    // so build & copy rather than build in place
++                    const HEVCRpiMvField mvf_m = {
++                        .xy = {
++                            mvf_c0->xy[0],
++                            mvf_c1->xy[1]},
++                        .ref_idx = {
++                            mvf_c0->ref_idx[0],
++                            mvf_c1->ref_idx[1]},
++                        .pred_flag = PF_BI
++                    };
++                    *mvf_t = mvf_m;
++                    return mvf_t;
++                }
 +            }
 +        }
 +    }
 +
-+    // append Zero motion vector candidates
-+    while (nb_merge_cand < s->sh.max_num_merge_cand) {
-+        mergecandlist[nb_merge_cand].pred_flag    = PF_L0 + ((s->sh.slice_type == HEVC_SLICE_B) << 1);
-+        AV_ZERO32(mergecandlist[nb_merge_cand].mv + 0);
-+        AV_ZERO32(mergecandlist[nb_merge_cand].mv + 1);
-+        mergecandlist[nb_merge_cand].ref_idx[0]   = zero_idx < nb_refs ? zero_idx : 0;
-+        mergecandlist[nb_merge_cand].ref_idx[1]   = zero_idx < nb_refs ? zero_idx : 0;
++    // "append" Zero motion vector candidates
++    {
++        const unsigned int nb_refs = (s->sh.slice_type == HEVC_SLICE_B) ?
++                            FFMIN(s->sh.nb_refs[0], s->sh.nb_refs[1]) : s->sh.nb_refs[0];
++        const unsigned int zero_idx = merge_idx - nb_merge_cand;
++
++        const HEVCRpiMvField mvf_m = {
++            .xy = {0, 0},
++            .ref_idx = {
++                zero_idx < nb_refs ? zero_idx : 0,
++                (s->sh.slice_type == HEVC_SLICE_B && zero_idx < nb_refs) ? zero_idx : 0},
++            .pred_flag = (s->sh.slice_type == HEVC_SLICE_B) ? PF_BI : PF_L0
++        };
 +
-+        if (merge_idx == nb_merge_cand)
-+            return;
-+        nb_merge_cand++;
-+        zero_idx++;
++        *mvf_t = mvf_m;
++        return mvf_t;
 +    }
 +}
 +
-+/*
-+ * 8.5.3.1.1 Derivation process of luma Mvs for merge mode
-+ */
++
++// 8.5.3.1.1 Derivation process of luma Mvs for merge mode
 +void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW,
 +                                int nPbH, int log2_cb_size, int part_idx,
-+                                int merge_idx, MvField * const mv)
++                                int merge_idx, HEVCRpiMvField * const mv)
 +{
-+    int singleMCLFlag = 0;
-+    int nCS = 1 << log2_cb_size;
-+    LOCAL_ALIGNED(4, MvField, mergecand_list, [MRG_MAX_NUM_CANDS]);
-+    int nPbW2 = nPbW;
-+    int nPbH2 = nPbH;
-+
-+    if (s->ps.pps->log2_parallel_merge_level > 2 && nCS == 8) {
-+        singleMCLFlag = 1;
-+        x0            = lc->cu.x;
-+        y0            = lc->cu.y;
-+        nPbW          = nCS;
-+        nPbH          = nCS;
-+        part_idx      = 0;
-+    }
++    const HEVCRpiMvField * mvf_m = (s->ps.pps->log2_parallel_merge_level > 2 && log2_cb_size == 3) ?
++        derive_spatial_merge_candidates(s, lc, lc->cu.x, lc->cu.y, 8, 8,
++                                        ff_hevc_rpi_tb_avail_flags(s, lc, lc->cu.x, lc->cu.y, 8, 8),
++                                        0, merge_idx, mv) :
++        derive_spatial_merge_candidates(s, lc, x0, y0, nPbW, nPbH,
++                                        ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH),
++                                        part_idx, merge_idx, mv);
 +
-+    derive_spatial_merge_candidates(s, lc, x0, y0, nPbW, nPbH, log2_cb_size,
-+                                    ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH),
-+                                    singleMCLFlag, part_idx,
-+                                    merge_idx, mergecand_list);
++    if (mvf_m != mv)
++        *mv = *mvf_m;
 +
-+    if (mergecand_list[merge_idx].pred_flag == PF_BI &&
-+        (nPbW2 + nPbH2) == 12) {
-+        mergecand_list[merge_idx].pred_flag = PF_L0;
-+    }
-+
-+    *mv = mergecand_list[merge_idx];
++    if (mv->pred_flag == PF_BI && (nPbW + nPbH) == 12)
++        mv->pred_flag = PF_L0;
 +}
 +
-+static av_always_inline void dist_scale(const HEVCRpiContext * const s, Mv * const mv,
-+                                        int min_pu_width, int x, int y,
-+                                        int elist, int ref_idx_curr, int ref_idx)
-+{
-+    const RefPicList * const refPicList = s->ref->refPicList;
-+    const MvField * const tab_mvf       = s->ref->tab_mvf;
-+    int ref_pic_elist      = refPicList[elist].list[TAB_MVF(x, y).ref_idx[elist]];
-+    int ref_pic_curr       = refPicList[ref_idx_curr].list[ref_idx];
 +
-+    if (ref_pic_elist != ref_pic_curr) {
-+        int poc_diff = s->poc - ref_pic_elist;
-+        if (!poc_diff)
-+            poc_diff = 1;
-+        mv_scale(mv, mv, poc_diff, s->poc - ref_pic_curr);
-+    }
-+}
-+
-+static int mv_mp_mode_mx(const HEVCRpiContext * const s, const int x, const int y, const int pred_flag_index,
-+                         Mv * const mv, const int ref_idx_curr, const int ref_idx)
++static av_always_inline const MvXY *
++mvf_same_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, const int poc0, const HEVCRpiMvField * const mvf)
 +{
-+    const MvField * const tab_mvf = s->ref->tab_mvf;
-+    const int min_pu_width = s->ps.sps->min_pu_width;
-+
-+    const RefPicList * const refPicList = s->ref->refPicList;
-+
-+    if (((TAB_MVF(x, y).pred_flag) & (1 << pred_flag_index)) &&
-+        refPicList[pred_flag_index].list[TAB_MVF(x, y).ref_idx[pred_flag_index]] == refPicList[ref_idx_curr].list[ref_idx]) {
-+        *mv = TAB_MVF(x, y).mv[pred_flag_index];
-+        return 1;
++    if (mvf != NULL)
++    {
++        if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].list[mvf->ref_idx[pfi0]] == poc0)
++            return mvf->xy + pfi0;
++        if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].list[mvf->ref_idx[pfi1]] == poc0)
++            return mvf->xy + pfi1;
 +    }
-+    return 0;
++    return NULL;
 +}
 +
-+static int mv_mp_mode_mx_lt(const HEVCRpiContext * const s, const int x, const int y, const int pred_flag_index,
-+                            Mv * const mv, const int ref_idx_curr, const int ref_idx)
++static av_always_inline const MvXY *
++mvf_other_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1,
++              const int islt0, const int poc0, const int poc_cur,
++              MvXY * const mv_t, const HEVCRpiMvField * const mvf)
 +{
-+    MvField *tab_mvf = s->ref->tab_mvf;
-+    int min_pu_width = s->ps.sps->min_pu_width;
-+
-+    RefPicList *refPicList = s->ref->refPicList;
-+
-+    if ((TAB_MVF(x, y).pred_flag) & (1 << pred_flag_index)) {
-+        int currIsLongTerm     = refPicList[ref_idx_curr].isLongTerm[ref_idx];
-+
-+        int colIsLongTerm =
-+            refPicList[pred_flag_index].isLongTerm[(TAB_MVF(x, y).ref_idx[pred_flag_index])];
-+
-+        if (colIsLongTerm == currIsLongTerm) {
-+            *mv = TAB_MVF(x, y).mv[pred_flag_index];
-+            if (!currIsLongTerm)
-+                dist_scale(s, mv, min_pu_width, x, y,
-+                           pred_flag_index, ref_idx_curr, ref_idx);
-+            return 1;
++    if (mvf != NULL)
++    {
++        if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].isLongTerm[mvf->ref_idx[pfi0]] == islt0)
++        {
++            const int poc1 = rpl[pfi0].list[mvf->ref_idx[pfi0]];
++            if (islt0 || poc1 == poc0) {
++                return mvf->xy + pfi0;
++            }
++            *mv_t = mv_scale_xy(mvf->xy[pfi0], poc_cur - poc1, poc_cur - poc0);
++            return mv_t;
++        }
++        if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].isLongTerm[mvf->ref_idx[pfi1]] == islt0)
++        {
++            const int poc1 = rpl[pfi1].list[mvf->ref_idx[pfi1]];
++            if (islt0 || poc1 == poc0) {
++                return mvf->xy + pfi1;
++            }
++            *mv_t = mv_scale_xy(mvf->xy[pfi1], poc_cur - poc1, poc_cur - poc0);
++            return mv_t;
 +        }
 +    }
-+    return 0;
++    return NULL;
 +}
 +
-+#define MP_MX(v, pred, mx)                                      \
-+    mv_mp_mode_mx(s,                                            \
-+                  (x ## v) >> s->ps.sps->log2_min_pu_size,         \
-+                  (y ## v) >> s->ps.sps->log2_min_pu_size,         \
-+                  pred, &mx, ref_idx_curr, ref_idx)
-+
-+#define MP_MX_LT(v, pred, mx)                                   \
-+    mv_mp_mode_mx_lt(s,                                         \
-+                     (x ## v) >> s->ps.sps->log2_min_pu_size,      \
-+                     (y ## v) >> s->ps.sps->log2_min_pu_size,      \
-+                     pred, &mx, ref_idx_curr, ref_idx)
-+
-+void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext *lc,
-+                              int x0, int y0, int nPbW, int nPbH,
-+                              int log2_cb_size, const unsigned int avail, int part_idx,
-+                              int merge_idx, MvField * const mv,
-+                              int mvp_lx_flag, int LX)
++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++    const unsigned int x0, const unsigned int y0,
++    const unsigned int nPbW, const unsigned int nPbH,
++    const unsigned int avail,
++    HEVCRpiMvField * const mv,
++    const unsigned int mvp_lx_flag, const unsigned int LX)
 +{
-+    const MvField *tab_mvf = s->ref->tab_mvf;
-+    int isScaledFlag_L0 = 0;
-+    int availableFlagLXA0 = 1;
-+    int availableFlagLXB0 = 1;
-+    int numMVPCandLX = 0;
-+    int min_pu_width = s->ps.sps->min_pu_width;
++    const unsigned int pfi0 = LX;
++    const unsigned int pfi1 = LX == 0 ? 1 : 0;
++    const RefPicList * const rpl = s->refPicList;
++    const int poc0 = rpl[LX].list[mv->ref_idx[LX]];
++    const int poc_cur = s->poc;
++    const int islt0 = rpl[LX].isLongTerm[mv->ref_idx[LX]];
 +
-+    int xA0, yA0;
-+    int is_available_a0;
-+    int xA1, yA1;
-+    int is_available_a1;
-+    int xB0, yB0;
-+    int is_available_b0;
-+    int xB1, yB1;
-+    int is_available_b1;
-+    int xB2, yB2;
-+    int is_available_b2;
++    const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1);
++    const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1);
++    const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1);  // UL
++    const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1);
++    const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1;
++    const MvXY * mva = NULL;
++    const MvXY * mvb;
++    MvXY * const mv_rv = mv->xy + LX;
++    MvXY mvt_a, mvt_b;
 +
-+    Mv mvpcand_list[2] = { { 0 } };
-+    Mv mxA;
-+    Mv mxB;
-+    int ref_idx_curr;
-+    int ref_idx = 0;
-+    int pred_flag_index_l0;
-+    int pred_flag_index_l1;
++    *mv_rv = 0;
 +
-+    ref_idx_curr       = LX;
-+    ref_idx            = mv->ref_idx[LX];
-+    pred_flag_index_l0 = LX;
-+    pred_flag_index_l1 = !LX;
++    if ((avail & AVAIL_DL) == 0 || mvf_a0->pred_flag == PF_INTRA)
++        mvf_a0 = NULL;
++    else if ((mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a0)) != NULL && mvp_lx_flag == 0)
++        goto use_mva;
 +
-+    // left bottom spatial candidate
-+    xA0 = x0 - 1;
-+    yA0 = y0 + nPbH;
++    if ((avail & AVAIL_L) == 0 || mvf_a1->pred_flag == PF_INTRA)
++        mvf_a1 = NULL;
 +
-+    is_available_a0 = AVAILABLE((avail & AVAIL_DL) != 0, A0);
++    if (mva == NULL &&
++        (mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a1)) == NULL &&
++        (mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a0)) == NULL)
++        mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a1);
 +
-+    //left spatial merge candidate
-+    xA1    = x0 - 1;
-+    yA1    = y0 + nPbH - 1;
++    if (mvp_lx_flag == 0 && mva != NULL)
++        goto use_mva;
 +
-+    is_available_a1 = AVAILABLE((avail & AVAIL_L), A1);
-+    if (is_available_a0 || is_available_a1)
-+        isScaledFlag_L0 = 1;
++    if ((avail & AVAIL_UR) == 0 || mvf_b0->pred_flag == PF_INTRA)
++        mvf_b0 = NULL;
++    if ((avail & AVAIL_U) == 0 || mvf_b1->pred_flag == PF_INTRA)
++        mvf_b1 = NULL;
++    if ((avail & AVAIL_UL) == 0 || mvf_b2->pred_flag == PF_INTRA)
++        mvf_b2 = NULL;
 +
-+    if (is_available_a0) {
-+        if (MP_MX(A0, pred_flag_index_l0, mxA)) {
-+            goto b_candidates;
-+        }
-+        if (MP_MX(A0, pred_flag_index_l1, mxA)) {
-+            goto b_candidates;
-+        }
-+    }
++    if ((mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b0)) == NULL &&
++        (mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b1)) == NULL)
++        mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b2);
 +
-+    if (is_available_a1) {
-+        if (MP_MX(A1, pred_flag_index_l0, mxA)) {
-+            goto b_candidates;
-+        }
-+        if (MP_MX(A1, pred_flag_index_l1, mxA)) {
-+            goto b_candidates;
-+        }
-+    }
++    if (mvf_a0 == NULL && mvf_a1 == NULL) {
++        mva = mvb;
++        if (mvp_lx_flag == 0 && mva != NULL)
++            goto use_mva;
 +
-+    if (is_available_a0) {
-+        if (MP_MX_LT(A0, pred_flag_index_l0, mxA)) {
-+            goto b_candidates;
-+        }
-+        if (MP_MX_LT(A0, pred_flag_index_l1, mxA)) {
-+            goto b_candidates;
-+        }
++        if ((mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b0)) == NULL &&
++            (mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b1)) == NULL)
++            mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b2);
 +    }
 +
-+    if (is_available_a1) {
-+        if (MP_MX_LT(A1, pred_flag_index_l0, mxA)) {
-+            goto b_candidates;
-+        }
-+        if (MP_MX_LT(A1, pred_flag_index_l1, mxA)) {
-+            goto b_candidates;
-+        }
++    if (mva == NULL) {
++        mva = mvb;
++        mvb = NULL;
 +    }
-+    availableFlagLXA0 = 0;
-+
-+b_candidates:
-+    // B candidates
-+    // above right spatial merge candidate
-+    xB0    = x0 + nPbW;
-+    yB0    = y0 - 1;
-+
-+    is_available_b0 =  AVAILABLE((avail & AVAIL_UR) != 0, B0);
 +
-+    // above spatial merge candidate
-+    xB1    = x0 + nPbW - 1;
-+    yB1    = y0 - 1;
-+    is_available_b1 = AVAILABLE((avail & AVAIL_U) != 0, B1);
++    if (mvb != NULL && *mva == *mvb)  // If A == B then ignore B
++        mvb = NULL;
 +
-+    // above left spatial merge candidate
-+    xB2 = x0 - 1;
-+    yB2 = y0 - 1;
-+    is_available_b2 = AVAILABLE((avail & AVAIL_UL) != 0, B2);
-+
-+    // above right spatial merge candidate
-+    if (is_available_b0) {
-+        if (MP_MX(B0, pred_flag_index_l0, mxB)) {
-+            goto scalef;
-+        }
-+        if (MP_MX(B0, pred_flag_index_l1, mxB)) {
-+            goto scalef;
-+        }
++    if (mvp_lx_flag == 0 && mva != NULL) {
++        goto use_mva;
 +    }
-+
-+    // above spatial merge candidate
-+    if (is_available_b1) {
-+        if (MP_MX(B1, pred_flag_index_l0, mxB)) {
-+            goto scalef;
-+        }
-+        if (MP_MX(B1, pred_flag_index_l1, mxB)) {
-+            goto scalef;
-+        }
++    else if (mvp_lx_flag != 0 && mvb != NULL) {
++        *mv_rv = *mvb;
 +    }
-+
-+    // above left spatial merge candidate
-+    if (is_available_b2) {
-+        if (MP_MX(B2, pred_flag_index_l0, mxB)) {
-+            goto scalef;
-+        }
-+        if (MP_MX(B2, pred_flag_index_l1, mxB)) {
-+            goto scalef;
-+        }
-+    }
-+    availableFlagLXB0 = 0;
-+
-+scalef:
-+    if (!isScaledFlag_L0) {
-+        if (availableFlagLXB0) {
-+            availableFlagLXA0 = 1;
-+            mxA = mxB;
-+        }
-+        availableFlagLXB0 = 0;
-+
-+        // XB0 and L1
-+        if (is_available_b0) {
-+            availableFlagLXB0 = MP_MX_LT(B0, pred_flag_index_l0, mxB);
-+            if (!availableFlagLXB0)
-+                availableFlagLXB0 = MP_MX_LT(B0, pred_flag_index_l1, mxB);
-+        }
-+
-+        if (is_available_b1 && !availableFlagLXB0) {
-+            availableFlagLXB0 = MP_MX_LT(B1, pred_flag_index_l0, mxB);
-+            if (!availableFlagLXB0)
-+                availableFlagLXB0 = MP_MX_LT(B1, pred_flag_index_l1, mxB);
-+        }
-+
-+        if (is_available_b2 && !availableFlagLXB0) {
-+            availableFlagLXB0 = MP_MX_LT(B2, pred_flag_index_l0, mxB);
-+            if (!availableFlagLXB0)
-+                availableFlagLXB0 = MP_MX_LT(B2, pred_flag_index_l1, mxB);
-+        }
-+    }
-+
-+    if (availableFlagLXA0)
-+        mvpcand_list[numMVPCandLX++] = mxA;
-+
-+    if (availableFlagLXB0 && (!availableFlagLXA0 || mxA.x != mxB.x || mxA.y != mxB.y))
-+        mvpcand_list[numMVPCandLX++] = mxB;
-+
-+    //temporal motion vector prediction candidate
-+    if (numMVPCandLX < 2 && s->sh.slice_temporal_mvp_enabled_flag &&
-+        mvp_lx_flag == numMVPCandLX) {
-+        Mv mv_col;
-+        int available_col = temporal_luma_motion_vector(s, lc, x0, y0, nPbW,
-+                                                        nPbH, ref_idx,
-+                                                        &mv_col, LX);
-+        if (available_col)
-+            mvpcand_list[numMVPCandLX++] = mv_col;
++    else if (s->sh.slice_temporal_mvp_enabled_flag && ((mvp_lx_flag == 0 && mva == NULL) || (mvp_lx_flag != 0 && mva != NULL))) {
++        temporal_luma_motion_vector(s, lc, x0, y0, nPbW,
++                                    nPbH, mv->ref_idx[LX],
++                                    mv_rv, LX);
 +    }
++    return;
 +
-+    mv->mv[LX] = mvpcand_list[mvp_lx_flag];
++use_mva:
++    *mv_rv = *mva;
++    return;
 +}
++
 diff --git a/libavcodec/rpi_hevc_parse.c b/libavcodec/rpi_hevc_parse.c
 new file mode 100644
 index 0000000000..04f9231acc
@@ -19497,7 +19949,7 @@ index 0000000000..4b4d032a16
 +#endif /* AVCODEC_RPI_HEVC_PARSE_H */
 diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c
 new file mode 100644
-index 0000000000..98e2fd7009
+index 0000000000..0866a26702
 --- /dev/null
 +++ b/libavcodec/rpi_hevc_ps.c
 @@ -0,0 +1,1940 @@
@@ -20613,7 +21065,7 @@ index 0000000000..98e2fd7009
 +
 +        // Inferred parameters
 +        sps->log2_ctb_size = CtbLog2SizeY;
-+        sps->log2_min_pu_size = sps->log2_min_cb_size - 1;
++//        sps->log2_min_pu_size = sps->log2_min_cb_size - 1;
 +    }
 +
 +    sps->max_transform_hierarchy_depth_inter = get_ue_golomb_long(gb);
@@ -20767,8 +21219,8 @@ index 0000000000..98e2fd7009
 +    sps->min_cb_height = sps->height >> sps->log2_min_cb_size;
 +    sps->min_tb_width  = sps->width  >> sps->log2_min_tb_size;
 +    sps->min_tb_height = sps->height >> sps->log2_min_tb_size;
-+    sps->min_pu_width  = sps->width  >> sps->log2_min_pu_size;
-+    sps->min_pu_height = sps->height >> sps->log2_min_pu_size;
++    sps->min_pu_width  = sps->width  >> LOG2_MIN_PU_SIZE;
++    sps->min_pu_height = sps->height >> LOG2_MIN_PU_SIZE;
 +    sps->tb_mask       = (1 << (sps->log2_ctb_size - sps->log2_min_tb_size)) - 1;
 +
 +    sps->qp_bd_offset = 6 * (sps->bit_depth - 8);
@@ -21043,7 +21495,7 @@ index 0000000000..98e2fd7009
 +    /**
 +     * 6.5
 +     */
-+    pic_area_in_ctbs     = sps->ctb_width    * sps->ctb_height;
++    pic_area_in_ctbs     = sps->ctb_size;
 +
 +    pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_addr_rs_to_ts));
 +    pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_addr_ts_to_rs));
@@ -21443,10 +21895,10 @@ index 0000000000..98e2fd7009
 +}
 diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h
 new file mode 100644
-index 0000000000..77af463e31
+index 0000000000..11d9e26853
 --- /dev/null
 +++ b/libavcodec/rpi_hevc_ps.h
-@@ -0,0 +1,442 @@
+@@ -0,0 +1,444 @@
 +/*
 + * HEVC parameter set parsing
 + *
@@ -21728,7 +22180,9 @@ index 0000000000..77af463e31
 +    unsigned int log2_min_tb_size;  // 2..5
 +    unsigned int log2_max_trafo_size;
 +    unsigned int log2_ctb_size;     // 4..6
-+    unsigned int log2_min_pu_size;  // 2..5 (min_cb_size - 1)
++//    unsigned int log2_min_pu_size;  // 2..5 (min_cb_size - 1)
++#define LOG2_MIN_PU_SIZE 2
++#define LOG2_MIN_CU_SIZE 3
 +
 +    int max_transform_hierarchy_depth_inter;
 +    int max_transform_hierarchy_depth_intra;
@@ -21891,10 +22345,10 @@ index 0000000000..77af463e31
 +#endif /* AVCODEC_RPI_HEVC_PS_H */
 diff --git a/libavcodec/rpi_hevc_refs.c b/libavcodec/rpi_hevc_refs.c
 new file mode 100644
-index 0000000000..d7745711ab
+index 0000000000..8cc5796cf0
 --- /dev/null
 +++ b/libavcodec/rpi_hevc_refs.c
-@@ -0,0 +1,515 @@
+@@ -0,0 +1,485 @@
 +/*
 + * HEVC video decoder
 + *
@@ -21926,7 +22380,7 @@ index 0000000000..d7745711ab
 +#include "hevc.h"
 +#include "rpi_hevcdec.h"
 +
-+void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCFrame *frame, int flags)
++void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags)
 +{
 +    /* frame->frame can be NULL if context init failed */
 +    if (!frame->frame || !frame->frame->buf[0])
@@ -21936,27 +22390,13 @@ index 0000000000..d7745711ab
 +    if (!frame->flags) {
 +        ff_thread_release_buffer(s->avctx, &frame->tf);
 +
-+        av_buffer_unref(&frame->tab_mvf_buf);
-+        frame->tab_mvf = NULL;
-+
-+        av_buffer_unref(&frame->rpl_buf);
-+        av_buffer_unref(&frame->rpl_tab_buf);
-+        frame->rpl_tab    = NULL;
-+        frame->refPicList = NULL;
++        av_buffer_unref(&frame->col_mvf_buf);  // OK if already NULL
++        frame->col_mvf = NULL;
 +
 +        frame->collocated_ref = NULL;
 +    }
 +}
 +
-+const RefPicList *ff_hevc_rpi_get_ref_list(const HEVCRpiContext * const s, const HEVCFrame * const ref, int x0, int y0)
-+{
-+    int x_cb         = x0 >> s->ps.sps->log2_ctb_size;
-+    int y_cb         = y0 >> s->ps.sps->log2_ctb_size;
-+    int pic_width_cb = s->ps.sps->ctb_width;
-+    int ctb_addr_ts  = s->ps.pps->ctb_addr_rs_to_ts[y_cb * pic_width_cb + x_cb];
-+    return (const RefPicList *)ref->rpl_tab[ctb_addr_ts];
-+}
-+
 +void ff_hevc_rpi_clear_refs(HEVCRpiContext *s)
 +{
 +    int i;
@@ -21973,11 +22413,11 @@ index 0000000000..d7745711ab
 +        ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
 +}
 +
-+static HEVCFrame *alloc_frame(HEVCRpiContext *s)
++static HEVCRpiFrame *alloc_frame(HEVCRpiContext * const s)
 +{
-+    int i, j, ret;
++    int i, ret;
 +    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+        HEVCFrame *frame = &s->DPB[i];
++        HEVCRpiFrame * const frame = &s->DPB[i];
 +        if (frame->frame->buf[0])
 +            continue;
 +
@@ -21986,22 +22426,15 @@ index 0000000000..d7745711ab
 +        if (ret < 0)
 +            return NULL;
 +
-+        frame->rpl_buf = av_buffer_allocz(s->pkt.nb_nals * sizeof(RefPicListTab));
-+        if (!frame->rpl_buf)
-+            goto fail;
-+
-+        frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool);
-+        if (!frame->tab_mvf_buf)
-+            goto fail;
-+        frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data;
-+
-+        frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool);
-+        if (!frame->rpl_tab_buf)
-+            goto fail;
-+        frame->rpl_tab   = (RefPicListTab **)frame->rpl_tab_buf->data;
-+        frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height;
-+        for (j = 0; j < frame->ctb_count; j++)
-+            frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
++        frame->col_mvf = NULL;
++        frame->col_mvf_buf = NULL;
++        if (s->used_for_ref && !s->is_irap)
++        {
++            frame->col_mvf_buf = av_buffer_pool_get(s->col_mvf_pool);
++            if (!frame->col_mvf_buf)
++                goto fail;
++            frame->col_mvf = (ColMvField *)frame->col_mvf_buf->data;
++        }
 +
 +        frame->frame->top_field_first  = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD;
 +        frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD);
@@ -22018,12 +22451,12 @@ index 0000000000..d7745711ab
 +
 +int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc)
 +{
-+    HEVCFrame *ref;
++    HEVCRpiFrame *ref;
 +    int i;
 +
 +    /* check that this POC doesn't already exist */
 +    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+        HEVCFrame *frame = &s->DPB[i];
++        HEVCRpiFrame *frame = &s->DPB[i];
 +
 +        if (frame->frame->buf[0] && frame->sequence == s->seq_decode &&
 +            frame->poc == poc) {
@@ -22064,7 +22497,7 @@ index 0000000000..d7745711ab
 +
 +        if (s->sh.no_output_of_prior_pics_flag == 1 && s->no_rasl_output_flag == 1) {
 +            for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+                HEVCFrame *frame = &s->DPB[i];
++                HEVCRpiFrame *frame = &s->DPB[i];
 +                if (!(frame->flags & HEVC_FRAME_FLAG_BUMPING) && frame->poc != s->poc &&
 +                        frame->sequence == s->seq_output) {
 +                    ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
@@ -22073,7 +22506,7 @@ index 0000000000..d7745711ab
 +        }
 +
 +        for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+            HEVCFrame *frame = &s->DPB[i];
++            HEVCRpiFrame *frame = &s->DPB[i];
 +            if ((frame->flags & HEVC_FRAME_FLAG_OUTPUT) &&
 +                frame->sequence == s->seq_output) {
 +                nb_output++;
@@ -22090,7 +22523,7 @@ index 0000000000..d7745711ab
 +            return 0;
 +
 +        if (nb_output) {
-+            HEVCFrame *frame = &s->DPB[min_idx];
++            HEVCRpiFrame *frame = &s->DPB[min_idx];
 +            if (frame->frame->format == AV_PIX_FMT_VIDEOTOOLBOX && frame->frame->buf[0]->size == 1)
 +                return 0;
 +
@@ -22122,7 +22555,7 @@ index 0000000000..d7745711ab
 +    int i;
 +
 +    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+        HEVCFrame *frame = &s->DPB[i];
++        HEVCRpiFrame *frame = &s->DPB[i];
 +        if ((frame->flags) &&
 +            frame->sequence == s->seq_output &&
 +            frame->poc != s->poc) {
@@ -22132,7 +22565,7 @@ index 0000000000..d7745711ab
 +
 +    if (s->ps.sps && dpb >= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering) {
 +        for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+            HEVCFrame *frame = &s->DPB[i];
++            HEVCRpiFrame *frame = &s->DPB[i];
 +            if ((frame->flags) &&
 +                frame->sequence == s->seq_output &&
 +                frame->poc != s->poc) {
@@ -22143,7 +22576,7 @@ index 0000000000..d7745711ab
 +        }
 +
 +        for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+            HEVCFrame *frame = &s->DPB[i];
++            HEVCRpiFrame *frame = &s->DPB[i];
 +            if (frame->flags & HEVC_FRAME_FLAG_OUTPUT &&
 +                frame->sequence == s->seq_output &&
 +                frame->poc <= min_poc) {
@@ -22157,19 +22590,10 @@ index 0000000000..d7745711ab
 +
 +static int init_slice_rpl(HEVCRpiContext *s)
 +{
-+    HEVCFrame *frame = s->ref;
-+    int ctb_count    = frame->ctb_count;
-+    int ctb_addr_ts  = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
-+    int i;
-+
-+    if (s->slice_idx >= frame->rpl_buf->size / sizeof(RefPicListTab))
++    if (s->slice_idx >= s->rpl_tab_size)
 +        return AVERROR_INVALIDDATA;
 +
-+    for (i = ctb_addr_ts; i < ctb_count; i++)
-+        frame->rpl_tab[i] = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx;
-+
-+    frame->refPicList = (RefPicList *)frame->rpl_tab[ctb_addr_ts];
-+
++    s->refPicList = s->rpl_tab[s->slice_idx].refPicList + 0;
 +    return 0;
 +}
 +
@@ -22193,7 +22617,7 @@ index 0000000000..d7745711ab
 +
 +    for (list_idx = 0; list_idx < nb_list; list_idx++) {
 +        RefPicList  rpl_tmp = { { 0 } };
-+        RefPicList *rpl     = &s->ref->refPicList[list_idx];
++        RefPicList *rpl     = &s->refPicList[list_idx];
 +
 +        /* The order of the elements is
 +         * ST_CURR_BEF - ST_CURR_AFT - LT_CURR for the L0 and
@@ -22243,13 +22667,13 @@ index 0000000000..d7745711ab
 +    return 0;
 +}
 +
-+static HEVCFrame *find_ref_idx(HEVCRpiContext *s, int poc)
++static HEVCRpiFrame *find_ref_idx(HEVCRpiContext *s, int poc)
 +{
 +    int i;
 +    int LtMask = (1 << s->ps.sps->log2_max_poc_lsb) - 1;
 +
 +    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+        HEVCFrame *ref = &s->DPB[i];
++        HEVCRpiFrame *ref = &s->DPB[i];
 +        if (ref->frame->buf[0] && (ref->sequence == s->seq_decode)) {
 +            if ((ref->poc & LtMask) == poc)
 +                return ref;
@@ -22257,7 +22681,7 @@ index 0000000000..d7745711ab
 +    }
 +
 +    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+        HEVCFrame *ref = &s->DPB[i];
++        HEVCRpiFrame *ref = &s->DPB[i];
 +        if (ref->frame->buf[0] && ref->sequence == s->seq_decode) {
 +            if (ref->poc == poc || (ref->poc & LtMask) == poc)
 +                return ref;
@@ -22270,15 +22694,15 @@ index 0000000000..d7745711ab
 +    return NULL;
 +}
 +
-+static void mark_ref(HEVCFrame *frame, int flag)
++static void mark_ref(HEVCRpiFrame *frame, int flag)
 +{
 +    frame->flags &= ~(HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF);
 +    frame->flags |= flag;
 +}
 +
-+static HEVCFrame *generate_missing_ref(HEVCRpiContext *s, int poc)
++static HEVCRpiFrame *generate_missing_ref(HEVCRpiContext *s, int poc)
 +{
-+    HEVCFrame *frame;
++    HEVCRpiFrame *frame;
 +    int i, x, y;
 +
 +    frame = alloc_frame(s);
@@ -22311,7 +22735,7 @@ index 0000000000..d7745711ab
 +static int add_candidate_ref(HEVCRpiContext *s, RefPicList *list,
 +                             int poc, int ref_flag)
 +{
-+    HEVCFrame *ref = find_ref_idx(s, poc);
++    HEVCRpiFrame *ref = find_ref_idx(s, poc);
 +
 +    if (ref == s->ref || list->nb_refs >= HEVC_MAX_REFS)
 +        return AVERROR_INVALIDDATA;
@@ -22344,7 +22768,7 @@ index 0000000000..d7745711ab
 +
 +    /* clear the reference flags on all frames except the current one */
 +    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+        HEVCFrame *frame = &s->DPB[i];
++        HEVCRpiFrame *frame = &s->DPB[i];
 +
 +        if (frame == s->ref)
 +            continue;
@@ -27482,234 +27906,210 @@ index 0000000000..3caef20137
 +
 diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h
 new file mode 100644
-index 0000000000..18128f4311
+index 0000000000..1c364492d0
 --- /dev/null
 +++ b/libavcodec/rpi_hevc_transform10.h
-@@ -0,0 +1,106 @@
+@@ -0,0 +1,94 @@
 +static const unsigned char rpi_hevc_transform10 [] = {
-+0xa9,  0x03,  0x3e,  0x40,  0x4f,  0x40,  0x03,  0xe8,   // 0000
-+0x20,  0x00,  0x00,  0x00,  0x0c,  0xf8,  0x00,  0x88,   // 0008
-+0x00,  0x00,  0xc0,  0xf8,  0x00,  0x00,  0x40,  0xe8,   // 0010
-+0x00,  0x02,  0x00,  0x00,  0x0c,  0xf8,  0x00,  0xa8,   // 0018
-+0x00,  0x00,  0xc0,  0xf8,  0x00,  0x00,  0x00,  0x60,   // 0020
-+0x03,  0xe8,  0x20,  0x00,  0x00,  0x00,  0x07,  0xe8,   // 0028
-+0x00,  0x02,  0x00,  0x00,  0x08,  0xe8,  0x00,  0x04,   // 0030
-+0x00,  0x00,  0x04,  0xe8,  0x40,  0x00,  0x00,  0x00,   // 0038
-+0x05,  0xe8,  0x00,  0x02,  0x00,  0x00,  0x39,  0xef,   // 0040
-+0xc0,  0xfd,  0xff,  0xff,  0x2b,  0xef,  0x40,  0x00,   // 0048
-+0x00,  0x00,  0x5b,  0x7a,  0x5b,  0x7c,  0x4a,  0xc3,   // 0050
-+0x50,  0x17,  0x02,  0x6f,  0x02,  0x6a,  0x32,  0x18,   // 0058
-+0x0a,  0x6a,  0x16,  0x40,  0x04,  0x18,  0x1a,  0x66,   // 0060
-+0x80,  0x90,  0x33,  0x00,  0x0c,  0xf8,  0x00,  0x80,   // 0068
-+0x00,  0x00,  0xc0,  0x08,  0x18,  0x00,  0x80,  0x90,   // 0070
-+0x5e,  0x00,  0x04,  0xff,  0x30,  0xc0,  0x80,  0x03,   // 0078
-+0x20,  0x08,  0x10,  0x00,  0x4c,  0xfe,  0x30,  0xc0,   // 0080
-+0x09,  0x04,  0x20,  0x08,  0x00,  0x00,  0x04,  0xfe,   // 0088
-+0x00,  0x90,  0x80,  0x02,  0x00,  0x08,  0x02,  0x00,   // 0090
-+0x80,  0x90,  0x4d,  0x00,  0x04,  0xff,  0x30,  0xc0,   // 0098
-+0x80,  0x03,  0x20,  0x08,  0x14,  0x00,  0x4c,  0xfe,   // 00a0
-+0x30,  0xc0,  0x06,  0x04,  0x20,  0x08,  0x00,  0x00,   // 00a8
-+0x8c,  0xf8,  0x2c,  0x00,  0x00,  0x00,  0x20,  0x30,   // 00b0
-+0x04,  0x00,  0x80,  0x45,  0x71,  0x42,  0xf2,  0x8c,   // 00b8
-+0xd1,  0xc0,  0x39,  0xef,  0x40,  0x02,  0x00,  0x00,   // 00c0
-+0x00,  0x9e,  0x7f,  0x00,  0x29,  0x03,  0x00,  0xfe,   // 00c8
-+0x00,  0x80,  0x00,  0x04,  0x00,  0x00,  0x00,  0x00,   // 00d0
-+0xb6,  0x40,  0x8c,  0xf8,  0x20,  0x00,  0x00,  0x00,   // 00d8
-+0x00,  0x30,  0x18,  0x00,  0x15,  0x40,  0x08,  0xf8,   // 00e0
-+0x00,  0x80,  0x00,  0x00,  0xc0,  0x03,  0x14,  0x00,   // 00e8
-+0x66,  0xed,  0xe0,  0xff,  0xff,  0xff,  0x88,  0xf8,   // 00f0
-+0x20,  0x00,  0x00,  0x00,  0x00,  0xf0,  0x18,  0x00,   // 00f8
-+0x0c,  0x60,  0x64,  0x08,  0x46,  0xc0,  0x44,  0x37,   // 0100
-+0x49,  0xc3,  0x50,  0x27,  0x04,  0x6a,  0x84,  0x6e,   // 0108
-+0x09,  0x18,  0x69,  0xa0,  0x04,  0x5f,  0x1c,  0x8b,   // 0110
-+0xf6,  0xc8,  0x45,  0xe8,  0x20,  0x00,  0x00,  0x00,   // 0118
-+0x63,  0x1f,  0xb6,  0x40,  0x04,  0xe8,  0x40,  0x00,   // 0120
-+0x00,  0x00,  0x05,  0xe8,  0x00,  0x02,  0x00,  0x00,   // 0128
-+0x5a,  0x00,  0x46,  0xc0,  0x50,  0x07,  0xa4,  0xff,   // 0130
-+0x24,  0xcc,  0x60,  0x02,  0x00,  0xf8,  0x3e,  0x00,   // 0138
-+0x03,  0xff,  0x37,  0xd0,  0x78,  0x03,  0xe0,  0x03,   // 0140
-+0xbe,  0x0b,  0x10,  0x8b,  0xf6,  0x5b,  0x00,  0x67,   // 0148
-+0x5a,  0x00,  0x46,  0xc0,  0x50,  0x07,  0xa4,  0xff,   // 0150
-+0x24,  0xcc,  0xe0,  0x02,  0x00,  0xf8,  0x3e,  0x00,   // 0158
-+0x03,  0xff,  0x37,  0xd0,  0x78,  0x03,  0xe0,  0x03,   // 0160
-+0xbe,  0x0b,  0x10,  0x8b,  0xf6,  0x5b,  0x00,  0x67,   // 0168
-+0x5a,  0x00,  0x00,  0xf6,  0x00,  0x80,  0x00,  0x04,   // 0170
-+0x20,  0xed,  0x00,  0x08,  0x00,  0x00,  0x04,  0xe8,   // 0178
-+0x20,  0x00,  0x00,  0x00,  0x8e,  0xf8,  0x20,  0x00,   // 0180
-+0x00,  0x00,  0xc0,  0x43,  0x00,  0x00,  0x08,  0xf0,   // 0188
-+0x00,  0x80,  0x81,  0x03,  0x26,  0xed,  0xe0,  0xff,   // 0190
-+0xff,  0xff,  0x88,  0xf0,  0x20,  0x00,  0x86,  0x03,   // 0198
-+0x08,  0x60,  0x64,  0x08,  0x46,  0xc0,  0x44,  0x37,   // 01a0
-+0x45,  0xc3,  0x50,  0x27,  0x04,  0x6a,  0xa4,  0x6e,   // 01a8
-+0x7f,  0x90,  0xb9,  0xff,  0x65,  0xa0,  0x04,  0x07,   // 01b0
-+0x18,  0x8b,  0xf5,  0xc8,  0x41,  0xe8,  0x20,  0x00,   // 01b8
-+0x00,  0x00,  0x66,  0x1f,  0x5a,  0x00,  0xe1,  0x40,   // 01c0
-+0xf2,  0x40,  0x4f,  0xc3,  0x50,  0x7f,  0x02,  0x6f,   // 01c8
-+0x03,  0xe8,  0x80,  0x00,  0x00,  0x00,  0x07,  0xe8,   // 01d0
-+0x00,  0x02,  0x00,  0x00,  0xe8,  0x00,  0x08,  0x6d,   // 01d8
-+0xe8,  0xbf,  0x80,  0x01,  0x04,  0x18,  0x08,  0xed,   // 01e0
-+0x20,  0x10,  0x00,  0x00,  0x89,  0x40,  0x1a,  0x40,   // 01e8
-+0x02,  0x6a,  0x2e,  0x18,  0xa1,  0x40,  0x98,  0x40,   // 01f0
-+0xf2,  0x4a,  0x07,  0x1e,  0xff,  0x9f,  0xbb,  0xff,   // 01f8
-+0x21,  0xed,  0x00,  0x08,  0x00,  0x00,  0x98,  0x40,   // 0200
-+0x04,  0xe8,  0x40,  0x00,  0x00,  0x00,  0x95,  0x60,   // 0208
-+0x80,  0x90,  0x20,  0x00,  0x48,  0xe8,  0x00,  0x04,   // 0210
-+0x00,  0x00,  0x41,  0xe8,  0x20,  0x00,  0x00,  0x00,   // 0218
-+0x80,  0x90,  0x18,  0x00,  0x04,  0xe8,  0x00,  0x02,   // 0220
-+0x00,  0x00,  0x65,  0x60,  0x91,  0x40,  0xa8,  0x40,   // 0228
-+0x80,  0x90,  0x10,  0x00,  0x48,  0xe8,  0x00,  0x04,   // 0230
-+0x00,  0x00,  0x41,  0xe8,  0x20,  0x00,  0x00,  0x00,   // 0238
-+0x80,  0x90,  0x08,  0x00,  0x4a,  0xe8,  0x00,  0x08,   // 0240
-+0x00,  0x00,  0xf2,  0x8c,  0xd5,  0xc0,  0x29,  0x03,   // 0248
-+0xef,  0x03,  0x0c,  0xf8,  0x00,  0x80,  0x00,  0x00,   // 0250
-+0xc0,  0xf8,  0x04,  0x00,  0x0c,  0xf8,  0x00,  0x84,   // 0258
-+0x40,  0x00,  0xc0,  0xf8,  0x04,  0x00,  0x00,  0x60,   // 0260
-+0xff,  0x9f,  0x65,  0xff,  0x00,  0xe8,  0x00,  0x04,   // 0268
-+0x00,  0x00,  0xff,  0x9f,  0x70,  0xff,  0x04,  0xff,   // 0270
-+0x30,  0xcc,  0x10,  0x03,  0xe0,  0xfb,  0x3e,  0x00,   // 0278
-+0x04,  0xff,  0x33,  0xcc,  0x80,  0x03,  0xe0,  0xfb,   // 0280
-+0x10,  0x00,  0x4c,  0xfe,  0x33,  0xcc,  0x80,  0x03,   // 0288
-+0xe0,  0xfb,  0x14,  0x00,  0x80,  0x40,  0x06,  0xe8,   // 0290
-+0x40,  0x00,  0x00,  0x00,  0x8c,  0xf8,  0x2f,  0x00,   // 0298
-+0x00,  0x00,  0xe0,  0x63,  0x00,  0x00,  0x20,  0xf7,   // 02a0
-+0xf0,  0xcf,  0x10,  0x03,  0x20,  0xf7,  0xb0,  0xcf,   // 02a8
-+0x11,  0x13,  0x20,  0xf7,  0x70,  0xcf,  0x12,  0x23,   // 02b0
-+0x20,  0xf7,  0x30,  0xcf,  0x13,  0x33,  0x20,  0xf7,   // 02b8
-+0xf0,  0xce,  0x14,  0x43,  0x20,  0xf7,  0xb0,  0xce,   // 02c0
-+0x15,  0x53,  0x20,  0xf7,  0x70,  0xce,  0x16,  0x63,   // 02c8
-+0x20,  0xf7,  0x30,  0xce,  0x17,  0x73,  0x20,  0xf7,   // 02d0
-+0xf0,  0xcd,  0x18,  0x83,  0x20,  0xf7,  0xb0,  0xcd,   // 02d8
-+0x19,  0x93,  0x20,  0xf7,  0x70,  0xcd,  0x1a,  0xa3,   // 02e0
-+0x20,  0xf7,  0x30,  0xcd,  0x1b,  0xb3,  0x20,  0xf7,   // 02e8
-+0xf0,  0xcc,  0x1c,  0xc3,  0x20,  0xf7,  0xb0,  0xcc,   // 02f0
-+0x1d,  0xd3,  0x20,  0xf7,  0x70,  0xcc,  0x1e,  0xe3,   // 02f8
-+0x20,  0xf7,  0x30,  0xcc,  0x1f,  0xf3,  0x04,  0xff,   // 0300
-+0x33,  0xcc,  0x80,  0x03,  0xe0,  0xfb,  0x10,  0x00,   // 0308
-+0x4c,  0xfe,  0x33,  0xcc,  0x80,  0x03,  0xe0,  0xfb,   // 0310
-+0x14,  0x00,  0x00,  0xed,  0x20,  0x00,  0x00,  0x00,   // 0318
-+0x8c,  0xf8,  0x2f,  0x00,  0x00,  0x00,  0xe0,  0x63,   // 0320
-+0x00,  0x00,  0x6f,  0x03,  0x00,  0x00,  0x00,  0x00,   // 0328
-+0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,   // 0330
-+0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,   // 0338
++0xa9,  0x03,  0x3e,  0x40,  0x4f,  0x40,  0x03,  0xb0,   // 0000
++0x20,  0x00,  0x0c,  0xf8,  0x38,  0x88,  0x80,  0x03,   // 0008
++0xc0,  0xf8,  0x00,  0x00,  0x40,  0xb0,  0x00,  0x02,   // 0010
++0x0c,  0xf8,  0x38,  0xa8,  0x80,  0x03,  0xc0,  0xf8,   // 0018
++0x00,  0x00,  0x00,  0x60,  0x03,  0xb0,  0x20,  0x00,   // 0020
++0x07,  0xb0,  0x00,  0x02,  0x08,  0xb0,  0x00,  0x04,   // 0028
++0x04,  0xb0,  0x40,  0x00,  0x05,  0xb0,  0x00,  0x02,   // 0030
++0x59,  0xb0,  0xc0,  0xfd,  0x0b,  0x12,  0x5b,  0x7a,   // 0038
++0x5b,  0x7c,  0x4a,  0xc3,  0x50,  0x17,  0x02,  0x6f,   // 0040
++0x02,  0x6a,  0x32,  0x18,  0x0a,  0x6a,  0x16,  0x40,   // 0048
++0x04,  0x18,  0x1a,  0x66,  0x80,  0x90,  0x32,  0x00,   // 0050
++0x0c,  0xf8,  0x38,  0x80,  0x80,  0x03,  0xc0,  0x08,   // 0058
++0x18,  0x00,  0x80,  0x90,  0x51,  0x00,  0x04,  0xff,   // 0060
++0x30,  0xc0,  0x80,  0x03,  0x20,  0x08,  0x10,  0x00,   // 0068
++0x4c,  0xfe,  0x30,  0xc0,  0x09,  0x04,  0x20,  0x08,   // 0070
++0x00,  0x00,  0x04,  0xfc,  0x38,  0x90,  0x80,  0x02,   // 0078
++0xc0,  0x0b,  0x02,  0x00,  0x80,  0x90,  0x40,  0x00,   // 0080
++0x04,  0xff,  0x30,  0xc0,  0x80,  0x03,  0x20,  0x08,   // 0088
++0x14,  0x00,  0x4c,  0xfe,  0x30,  0xc0,  0x06,  0x04,   // 0090
++0x20,  0x08,  0x00,  0x00,  0x8c,  0xf8,  0x2c,  0xe0,   // 0098
++0x80,  0x03,  0x20,  0x30,  0x04,  0x00,  0x80,  0x45,   // 00a0
++0x71,  0x42,  0xf2,  0x8c,  0xd1,  0xc0,  0x59,  0xb0,   // 00a8
++0x40,  0x02,  0x00,  0x9e,  0x6d,  0x00,  0x29,  0x03,   // 00b0
++0x00,  0xf4,  0x38,  0x80,  0x00,  0x0c,  0xb6,  0x40,   // 00b8
++0x8c,  0xf8,  0x20,  0xe0,  0x80,  0x03,  0x00,  0x30,   // 00c0
++0x18,  0x00,  0x15,  0x40,  0x08,  0xf0,  0x38,  0x80,   // 00c8
++0x85,  0x0b,  0x66,  0xb5,  0xe0,  0xff,  0x88,  0xf0,   // 00d0
++0x24,  0xe0,  0x86,  0x03,  0x0c,  0x60,  0x64,  0x08,   // 00d8
++0x46,  0x62,  0x49,  0xc3,  0x50,  0x27,  0x04,  0x6a,   // 00e0
++0x84,  0x6e,  0x07,  0x18,  0x69,  0xa0,  0x04,  0x5f,   // 00e8
++0x1c,  0x8b,  0xf7,  0xc8,  0x45,  0x76,  0x6b,  0x1f,   // 00f0
++0xb6,  0x40,  0x04,  0xb0,  0x40,  0x00,  0x05,  0xb0,   // 00f8
++0x00,  0x02,  0x5a,  0x00,  0x06,  0xb4,  0x10,  0x00,   // 0100
++0xa4,  0xff,  0x24,  0xcc,  0x60,  0x02,  0x00,  0xf8,   // 0108
++0x3e,  0x00,  0x03,  0xff,  0x37,  0xd0,  0x78,  0x03,   // 0110
++0xe0,  0x03,  0xbe,  0x0b,  0x10,  0x8b,  0xf6,  0x5b,   // 0118
++0x00,  0x67,  0x5a,  0x00,  0x06,  0xb4,  0x10,  0x00,   // 0120
++0xa4,  0xff,  0x24,  0xcc,  0xe0,  0x02,  0x00,  0xf8,   // 0128
++0x3e,  0x00,  0x03,  0xff,  0x37,  0xd0,  0x78,  0x03,   // 0130
++0xe0,  0x03,  0xbe,  0x0b,  0x10,  0x8b,  0xf6,  0x5b,   // 0138
++0x00,  0x67,  0x5a,  0x00,  0x00,  0xf4,  0x38,  0x80,   // 0140
++0x00,  0x04,  0x20,  0xb5,  0x00,  0x08,  0x04,  0xb0,   // 0148
++0x20,  0x00,  0x8e,  0xf8,  0x20,  0xe0,  0x80,  0x03,   // 0150
++0xc0,  0x43,  0x00,  0x00,  0x08,  0xf0,  0x38,  0x80,   // 0158
++0x81,  0x03,  0x26,  0xb5,  0xe0,  0xff,  0x88,  0xf0,   // 0160
++0x20,  0xe0,  0x86,  0x03,  0x08,  0x60,  0x64,  0x08,   // 0168
++0x46,  0x62,  0x45,  0xc3,  0x50,  0x27,  0x04,  0x6a,   // 0170
++0xa4,  0x6e,  0x7f,  0x90,  0xbf,  0xff,  0x65,  0xa0,   // 0178
++0x04,  0x07,  0x18,  0x8b,  0xf6,  0xc8,  0x41,  0x76,   // 0180
++0x6a,  0x1f,  0x5a,  0x00,  0xe1,  0x40,  0xf2,  0x40,   // 0188
++0x0f,  0x7b,  0x02,  0x6f,  0x03,  0xb0,  0x80,  0x00,   // 0190
++0x07,  0xb0,  0x00,  0x02,  0xe8,  0x00,  0x08,  0x6d,   // 0198
++0xe8,  0xbf,  0x60,  0x01,  0x03,  0x18,  0x48,  0xb0,   // 01a0
++0x20,  0x10,  0x89,  0x40,  0x1a,  0x40,  0x02,  0x6a,   // 01a8
++0x24,  0x18,  0xa1,  0x40,  0x98,  0x40,  0xf2,  0x4a,   // 01b0
++0x06,  0x1e,  0xff,  0x9f,  0xc5,  0xff,  0x21,  0xb5,   // 01b8
++0x00,  0x08,  0x98,  0x40,  0x04,  0xb0,  0x40,  0x00,   // 01c0
++0x95,  0x60,  0x80,  0x90,  0x18,  0x00,  0x48,  0xb0,   // 01c8
++0x00,  0x04,  0x41,  0x76,  0x80,  0x90,  0x13,  0x00,   // 01d0
++0x04,  0xb0,  0x00,  0x02,  0x65,  0x60,  0x91,  0x40,   // 01d8
++0xa8,  0x40,  0x80,  0x90,  0x0c,  0x00,  0x48,  0xb0,   // 01e0
++0x00,  0x04,  0x41,  0x76,  0x80,  0x90,  0x07,  0x00,   // 01e8
++0x4a,  0xb0,  0x00,  0x08,  0xf2,  0x8c,  0xdf,  0xc0,   // 01f0
++0x29,  0x03,  0xef,  0x03,  0x0c,  0xf8,  0x38,  0x80,   // 01f8
++0x80,  0x03,  0xc0,  0xf8,  0x04,  0x00,  0x0c,  0xf8,   // 0200
++0x38,  0x84,  0xc0,  0x03,  0xc0,  0xf8,  0x04,  0x00,   // 0208
++0x00,  0x60,  0xff,  0x9f,  0x79,  0xff,  0x00,  0xb0,   // 0210
++0x00,  0x04,  0xff,  0x9f,  0x85,  0xff,  0x04,  0xff,   // 0218
++0x30,  0xcc,  0x10,  0x03,  0xe0,  0xfb,  0x3e,  0x00,   // 0220
++0x04,  0xff,  0x33,  0xcc,  0x80,  0x03,  0xe0,  0xfb,   // 0228
++0x10,  0x00,  0x4c,  0xfe,  0x33,  0xcc,  0x80,  0x03,   // 0230
++0xe0,  0xfb,  0x14,  0x00,  0x80,  0x40,  0x06,  0xb0,   // 0238
++0x40,  0x00,  0x8c,  0xf8,  0x2f,  0xe0,  0x80,  0x03,   // 0240
++0xe0,  0x63,  0x00,  0x00,  0x20,  0xf7,  0xf0,  0xcf,   // 0248
++0x10,  0x03,  0x20,  0xf7,  0xb0,  0xcf,  0x11,  0x13,   // 0250
++0x20,  0xf7,  0x70,  0xcf,  0x12,  0x23,  0x20,  0xf7,   // 0258
++0x30,  0xcf,  0x13,  0x33,  0x20,  0xf7,  0xf0,  0xce,   // 0260
++0x14,  0x43,  0x20,  0xf7,  0xb0,  0xce,  0x15,  0x53,   // 0268
++0x20,  0xf7,  0x70,  0xce,  0x16,  0x63,  0x20,  0xf7,   // 0270
++0x30,  0xce,  0x17,  0x73,  0x20,  0xf7,  0xf0,  0xcd,   // 0278
++0x18,  0x83,  0x20,  0xf7,  0xb0,  0xcd,  0x19,  0x93,   // 0280
++0x20,  0xf7,  0x70,  0xcd,  0x1a,  0xa3,  0x20,  0xf7,   // 0288
++0x30,  0xcd,  0x1b,  0xb3,  0x20,  0xf7,  0xf0,  0xcc,   // 0290
++0x1c,  0xc3,  0x20,  0xf7,  0xb0,  0xcc,  0x1d,  0xd3,   // 0298
++0x20,  0xf7,  0x70,  0xcc,  0x1e,  0xe3,  0x20,  0xf7,   // 02a0
++0x30,  0xcc,  0x1f,  0xf3,  0x04,  0xff,  0x33,  0xcc,   // 02a8
++0x80,  0x03,  0xe0,  0xfb,  0x10,  0x00,  0x4c,  0xfe,   // 02b0
++0x33,  0xcc,  0x80,  0x03,  0xe0,  0xfb,  0x14,  0x00,   // 02b8
++0x00,  0xb5,  0x20,  0x00,  0x8c,  0xf8,  0x2f,  0xe0,   // 02c0
++0x80,  0x03,  0xe0,  0x63,  0x00,  0x00,  0x6f,  0x03,   // 02c8
++0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,   // 02d0
++0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,   // 02d8
 +};
 diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h
 new file mode 100644
-index 0000000000..3557348e30
+index 0000000000..1128a2c054
 --- /dev/null
 +++ b/libavcodec/rpi_hevc_transform8.h
-@@ -0,0 +1,106 @@
+@@ -0,0 +1,94 @@
 +static const unsigned char rpi_hevc_transform8 [] = {
-+0xa9,  0x03,  0x3e,  0x40,  0x4f,  0x40,  0x03,  0xe8,   // 0000
-+0x20,  0x00,  0x00,  0x00,  0x0c,  0xf8,  0x00,  0x88,   // 0008
-+0x00,  0x00,  0xc0,  0xf8,  0x00,  0x00,  0x40,  0xe8,   // 0010
-+0x00,  0x02,  0x00,  0x00,  0x0c,  0xf8,  0x00,  0xa8,   // 0018
-+0x00,  0x00,  0xc0,  0xf8,  0x00,  0x00,  0x00,  0x60,   // 0020
-+0x03,  0xe8,  0x20,  0x00,  0x00,  0x00,  0x07,  0xe8,   // 0028
-+0x00,  0x02,  0x00,  0x00,  0x08,  0xe8,  0x00,  0x04,   // 0030
-+0x00,  0x00,  0x04,  0xe8,  0x40,  0x00,  0x00,  0x00,   // 0038
-+0x05,  0xe8,  0x00,  0x08,  0x00,  0x00,  0x39,  0xef,   // 0040
-+0xc0,  0xfd,  0xff,  0xff,  0x2b,  0xef,  0x40,  0x00,   // 0048
-+0x00,  0x00,  0x5b,  0x7a,  0x5b,  0x7c,  0x4a,  0xc3,   // 0050
-+0x50,  0x17,  0x02,  0x6f,  0x02,  0x6a,  0x32,  0x18,   // 0058
-+0x0a,  0x6a,  0x16,  0x40,  0x04,  0x18,  0x1a,  0x66,   // 0060
-+0x80,  0x90,  0x33,  0x00,  0x0c,  0xf8,  0x00,  0x80,   // 0068
-+0x00,  0x00,  0xc0,  0x08,  0x18,  0x00,  0x80,  0x90,   // 0070
-+0x5e,  0x00,  0x04,  0xff,  0x30,  0xc0,  0x80,  0x03,   // 0078
-+0x20,  0x08,  0x10,  0x00,  0x4c,  0xfe,  0x30,  0xc0,   // 0080
-+0x09,  0x04,  0x20,  0x08,  0x00,  0x00,  0x04,  0xfe,   // 0088
-+0x00,  0x90,  0x80,  0x02,  0x00,  0x08,  0x02,  0x00,   // 0090
-+0x80,  0x90,  0x4d,  0x00,  0x04,  0xff,  0x30,  0xc0,   // 0098
-+0x80,  0x03,  0x20,  0x08,  0x14,  0x00,  0x4c,  0xfe,   // 00a0
-+0x30,  0xc0,  0x04,  0x04,  0x20,  0x08,  0x00,  0x00,   // 00a8
-+0x8c,  0xf8,  0x2c,  0x00,  0x00,  0x00,  0x20,  0x30,   // 00b0
-+0x04,  0x00,  0x80,  0x45,  0x71,  0x42,  0xf2,  0x8c,   // 00b8
-+0xd1,  0xc0,  0x39,  0xef,  0x40,  0x02,  0x00,  0x00,   // 00c0
-+0x00,  0x9e,  0x7f,  0x00,  0x29,  0x03,  0x00,  0xfe,   // 00c8
-+0x00,  0x80,  0x00,  0x04,  0x00,  0x00,  0x00,  0x00,   // 00d0
-+0xb6,  0x40,  0x8c,  0xf8,  0x20,  0x00,  0x00,  0x00,   // 00d8
-+0x00,  0x30,  0x18,  0x00,  0x15,  0x40,  0x08,  0xf8,   // 00e0
-+0x00,  0x80,  0x00,  0x00,  0xc0,  0x03,  0x14,  0x00,   // 00e8
-+0x66,  0xed,  0xe0,  0xff,  0xff,  0xff,  0x88,  0xf8,   // 00f0
-+0x20,  0x00,  0x00,  0x00,  0x00,  0xf0,  0x18,  0x00,   // 00f8
-+0x0c,  0x60,  0x64,  0x08,  0x46,  0xc0,  0x44,  0x37,   // 0100
-+0x49,  0xc3,  0x50,  0x27,  0x04,  0x6a,  0x84,  0x6e,   // 0108
-+0x09,  0x18,  0x69,  0xa0,  0x04,  0x5f,  0x1c,  0x8b,   // 0110
-+0xf6,  0xc8,  0x45,  0xe8,  0x20,  0x00,  0x00,  0x00,   // 0118
-+0x63,  0x1f,  0xb6,  0x40,  0x04,  0xe8,  0x40,  0x00,   // 0120
-+0x00,  0x00,  0x05,  0xe8,  0x00,  0x08,  0x00,  0x00,   // 0128
-+0x5a,  0x00,  0x46,  0xc0,  0x50,  0x07,  0xa4,  0xff,   // 0130
-+0x24,  0xcc,  0x60,  0x02,  0x00,  0xf8,  0x3e,  0x00,   // 0138
-+0x03,  0xff,  0x37,  0xd0,  0x78,  0x03,  0xe0,  0x03,   // 0140
-+0xbe,  0x0b,  0x10,  0x8b,  0xf6,  0x5b,  0x00,  0x67,   // 0148
-+0x5a,  0x00,  0x46,  0xc0,  0x50,  0x07,  0xa4,  0xff,   // 0150
-+0x24,  0xcc,  0xe0,  0x02,  0x00,  0xf8,  0x3e,  0x00,   // 0158
-+0x03,  0xff,  0x37,  0xd0,  0x78,  0x03,  0xe0,  0x03,   // 0160
-+0xbe,  0x0b,  0x10,  0x8b,  0xf6,  0x5b,  0x00,  0x67,   // 0168
-+0x5a,  0x00,  0x00,  0xf6,  0x00,  0x80,  0x00,  0x04,   // 0170
-+0x20,  0xed,  0x00,  0x08,  0x00,  0x00,  0x04,  0xe8,   // 0178
-+0x20,  0x00,  0x00,  0x00,  0x8e,  0xf8,  0x20,  0x00,   // 0180
-+0x00,  0x00,  0xc0,  0x43,  0x00,  0x00,  0x08,  0xf0,   // 0188
-+0x00,  0x80,  0x81,  0x03,  0x26,  0xed,  0xe0,  0xff,   // 0190
-+0xff,  0xff,  0x88,  0xf0,  0x20,  0x00,  0x86,  0x03,   // 0198
-+0x08,  0x60,  0x64,  0x08,  0x46,  0xc0,  0x44,  0x37,   // 01a0
-+0x45,  0xc3,  0x50,  0x27,  0x04,  0x6a,  0xa4,  0x6e,   // 01a8
-+0x7f,  0x90,  0xb9,  0xff,  0x65,  0xa0,  0x04,  0x07,   // 01b0
-+0x18,  0x8b,  0xf5,  0xc8,  0x41,  0xe8,  0x20,  0x00,   // 01b8
-+0x00,  0x00,  0x66,  0x1f,  0x5a,  0x00,  0xe1,  0x40,   // 01c0
-+0xf2,  0x40,  0x4f,  0xc3,  0x50,  0x7f,  0x02,  0x6f,   // 01c8
-+0x03,  0xe8,  0x80,  0x00,  0x00,  0x00,  0x07,  0xe8,   // 01d0
-+0x00,  0x02,  0x00,  0x00,  0xe8,  0x00,  0x08,  0x6d,   // 01d8
-+0xe8,  0xbf,  0x80,  0x01,  0x04,  0x18,  0x08,  0xed,   // 01e0
-+0x20,  0x10,  0x00,  0x00,  0x89,  0x40,  0x1a,  0x40,   // 01e8
-+0x02,  0x6a,  0x2e,  0x18,  0xa1,  0x40,  0x98,  0x40,   // 01f0
-+0xf2,  0x4a,  0x07,  0x1e,  0xff,  0x9f,  0xbb,  0xff,   // 01f8
-+0x21,  0xed,  0x00,  0x08,  0x00,  0x00,  0x98,  0x40,   // 0200
-+0x04,  0xe8,  0x40,  0x00,  0x00,  0x00,  0x95,  0x60,   // 0208
-+0x80,  0x90,  0x20,  0x00,  0x48,  0xe8,  0x00,  0x04,   // 0210
-+0x00,  0x00,  0x41,  0xe8,  0x20,  0x00,  0x00,  0x00,   // 0218
-+0x80,  0x90,  0x18,  0x00,  0x04,  0xe8,  0x00,  0x08,   // 0220
-+0x00,  0x00,  0x45,  0x60,  0x91,  0x40,  0xa8,  0x40,   // 0228
-+0x80,  0x90,  0x10,  0x00,  0x48,  0xe8,  0x00,  0x04,   // 0230
-+0x00,  0x00,  0x41,  0xe8,  0x20,  0x00,  0x00,  0x00,   // 0238
-+0x80,  0x90,  0x08,  0x00,  0x4a,  0xe8,  0x00,  0x08,   // 0240
-+0x00,  0x00,  0xf2,  0x8c,  0xd5,  0xc0,  0x29,  0x03,   // 0248
-+0xef,  0x03,  0x0c,  0xf8,  0x00,  0x80,  0x00,  0x00,   // 0250
-+0xc0,  0xf8,  0x04,  0x00,  0x0c,  0xf8,  0x00,  0x84,   // 0258
-+0x40,  0x00,  0xc0,  0xf8,  0x04,  0x00,  0x00,  0x60,   // 0260
-+0xff,  0x9f,  0x65,  0xff,  0x00,  0xe8,  0x00,  0x04,   // 0268
-+0x00,  0x00,  0xff,  0x9f,  0x70,  0xff,  0x04,  0xff,   // 0270
-+0x30,  0xcc,  0x10,  0x03,  0xe0,  0xfb,  0x3e,  0x00,   // 0278
-+0x04,  0xff,  0x33,  0xcc,  0x80,  0x03,  0xe0,  0xfb,   // 0280
-+0x10,  0x00,  0x4c,  0xfe,  0x33,  0xcc,  0x80,  0x03,   // 0288
-+0xe0,  0xfb,  0x14,  0x00,  0x80,  0x40,  0x06,  0xe8,   // 0290
-+0x40,  0x00,  0x00,  0x00,  0x8c,  0xf8,  0x2f,  0x00,   // 0298
-+0x00,  0x00,  0xe0,  0x63,  0x00,  0x00,  0x20,  0xf7,   // 02a0
-+0xf0,  0xcf,  0x10,  0x03,  0x20,  0xf7,  0xb0,  0xcf,   // 02a8
-+0x11,  0x13,  0x20,  0xf7,  0x70,  0xcf,  0x12,  0x23,   // 02b0
-+0x20,  0xf7,  0x30,  0xcf,  0x13,  0x33,  0x20,  0xf7,   // 02b8
-+0xf0,  0xce,  0x14,  0x43,  0x20,  0xf7,  0xb0,  0xce,   // 02c0
-+0x15,  0x53,  0x20,  0xf7,  0x70,  0xce,  0x16,  0x63,   // 02c8
-+0x20,  0xf7,  0x30,  0xce,  0x17,  0x73,  0x20,  0xf7,   // 02d0
-+0xf0,  0xcd,  0x18,  0x83,  0x20,  0xf7,  0xb0,  0xcd,   // 02d8
-+0x19,  0x93,  0x20,  0xf7,  0x70,  0xcd,  0x1a,  0xa3,   // 02e0
-+0x20,  0xf7,  0x30,  0xcd,  0x1b,  0xb3,  0x20,  0xf7,   // 02e8
-+0xf0,  0xcc,  0x1c,  0xc3,  0x20,  0xf7,  0xb0,  0xcc,   // 02f0
-+0x1d,  0xd3,  0x20,  0xf7,  0x70,  0xcc,  0x1e,  0xe3,   // 02f8
-+0x20,  0xf7,  0x30,  0xcc,  0x1f,  0xf3,  0x04,  0xff,   // 0300
-+0x33,  0xcc,  0x80,  0x03,  0xe0,  0xfb,  0x10,  0x00,   // 0308
-+0x4c,  0xfe,  0x33,  0xcc,  0x80,  0x03,  0xe0,  0xfb,   // 0310
-+0x14,  0x00,  0x00,  0xed,  0x20,  0x00,  0x00,  0x00,   // 0318
-+0x8c,  0xf8,  0x2f,  0x00,  0x00,  0x00,  0xe0,  0x63,   // 0320
-+0x00,  0x00,  0x6f,  0x03,  0x00,  0x00,  0x00,  0x00,   // 0328
-+0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,   // 0330
-+0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,   // 0338
++0xa9,  0x03,  0x3e,  0x40,  0x4f,  0x40,  0x03,  0xb0,   // 0000
++0x20,  0x00,  0x0c,  0xf8,  0x38,  0x88,  0x80,  0x03,   // 0008
++0xc0,  0xf8,  0x00,  0x00,  0x40,  0xb0,  0x00,  0x02,   // 0010
++0x0c,  0xf8,  0x38,  0xa8,  0x80,  0x03,  0xc0,  0xf8,   // 0018
++0x00,  0x00,  0x00,  0x60,  0x03,  0xb0,  0x20,  0x00,   // 0020
++0x07,  0xb0,  0x00,  0x02,  0x08,  0xb0,  0x00,  0x04,   // 0028
++0x04,  0xb0,  0x40,  0x00,  0x05,  0xb0,  0x00,  0x08,   // 0030
++0x59,  0xb0,  0xc0,  0xfd,  0x0b,  0x12,  0x5b,  0x7a,   // 0038
++0x5b,  0x7c,  0x4a,  0xc3,  0x50,  0x17,  0x02,  0x6f,   // 0040
++0x02,  0x6a,  0x32,  0x18,  0x0a,  0x6a,  0x16,  0x40,   // 0048
++0x04,  0x18,  0x1a,  0x66,  0x80,  0x90,  0x32,  0x00,   // 0050
++0x0c,  0xf8,  0x38,  0x80,  0x80,  0x03,  0xc0,  0x08,   // 0058
++0x18,  0x00,  0x80,  0x90,  0x51,  0x00,  0x04,  0xff,   // 0060
++0x30,  0xc0,  0x80,  0x03,  0x20,  0x08,  0x10,  0x00,   // 0068
++0x4c,  0xfe,  0x30,  0xc0,  0x09,  0x04,  0x20,  0x08,   // 0070
++0x00,  0x00,  0x04,  0xfc,  0x38,  0x90,  0x80,  0x02,   // 0078
++0xc0,  0x0b,  0x02,  0x00,  0x80,  0x90,  0x40,  0x00,   // 0080
++0x04,  0xff,  0x30,  0xc0,  0x80,  0x03,  0x20,  0x08,   // 0088
++0x14,  0x00,  0x4c,  0xfe,  0x30,  0xc0,  0x04,  0x04,   // 0090
++0x20,  0x08,  0x00,  0x00,  0x8c,  0xf8,  0x2c,  0xe0,   // 0098
++0x80,  0x03,  0x20,  0x30,  0x04,  0x00,  0x80,  0x45,   // 00a0
++0x71,  0x42,  0xf2,  0x8c,  0xd1,  0xc0,  0x59,  0xb0,   // 00a8
++0x40,  0x02,  0x00,  0x9e,  0x6d,  0x00,  0x29,  0x03,   // 00b0
++0x00,  0xf4,  0x38,  0x80,  0x00,  0x0c,  0xb6,  0x40,   // 00b8
++0x8c,  0xf8,  0x20,  0xe0,  0x80,  0x03,  0x00,  0x30,   // 00c0
++0x18,  0x00,  0x15,  0x40,  0x08,  0xf0,  0x38,  0x80,   // 00c8
++0x85,  0x0b,  0x66,  0xb5,  0xe0,  0xff,  0x88,  0xf0,   // 00d0
++0x24,  0xe0,  0x86,  0x03,  0x0c,  0x60,  0x64,  0x08,   // 00d8
++0x46,  0x62,  0x49,  0xc3,  0x50,  0x27,  0x04,  0x6a,   // 00e0
++0x84,  0x6e,  0x07,  0x18,  0x69,  0xa0,  0x04,  0x5f,   // 00e8
++0x1c,  0x8b,  0xf7,  0xc8,  0x45,  0x76,  0x6b,  0x1f,   // 00f0
++0xb6,  0x40,  0x04,  0xb0,  0x40,  0x00,  0x05,  0xb0,   // 00f8
++0x00,  0x08,  0x5a,  0x00,  0x06,  0xb4,  0x10,  0x00,   // 0100
++0xa4,  0xff,  0x24,  0xcc,  0x60,  0x02,  0x00,  0xf8,   // 0108
++0x3e,  0x00,  0x03,  0xff,  0x37,  0xd0,  0x78,  0x03,   // 0110
++0xe0,  0x03,  0xbe,  0x0b,  0x10,  0x8b,  0xf6,  0x5b,   // 0118
++0x00,  0x67,  0x5a,  0x00,  0x06,  0xb4,  0x10,  0x00,   // 0120
++0xa4,  0xff,  0x24,  0xcc,  0xe0,  0x02,  0x00,  0xf8,   // 0128
++0x3e,  0x00,  0x03,  0xff,  0x37,  0xd0,  0x78,  0x03,   // 0130
++0xe0,  0x03,  0xbe,  0x0b,  0x10,  0x8b,  0xf6,  0x5b,   // 0138
++0x00,  0x67,  0x5a,  0x00,  0x00,  0xf4,  0x38,  0x80,   // 0140
++0x00,  0x04,  0x20,  0xb5,  0x00,  0x08,  0x04,  0xb0,   // 0148
++0x20,  0x00,  0x8e,  0xf8,  0x20,  0xe0,  0x80,  0x03,   // 0150
++0xc0,  0x43,  0x00,  0x00,  0x08,  0xf0,  0x38,  0x80,   // 0158
++0x81,  0x03,  0x26,  0xb5,  0xe0,  0xff,  0x88,  0xf0,   // 0160
++0x20,  0xe0,  0x86,  0x03,  0x08,  0x60,  0x64,  0x08,   // 0168
++0x46,  0x62,  0x45,  0xc3,  0x50,  0x27,  0x04,  0x6a,   // 0170
++0xa4,  0x6e,  0x7f,  0x90,  0xbf,  0xff,  0x65,  0xa0,   // 0178
++0x04,  0x07,  0x18,  0x8b,  0xf6,  0xc8,  0x41,  0x76,   // 0180
++0x6a,  0x1f,  0x5a,  0x00,  0xe1,  0x40,  0xf2,  0x40,   // 0188
++0x0f,  0x7b,  0x02,  0x6f,  0x03,  0xb0,  0x80,  0x00,   // 0190
++0x07,  0xb0,  0x00,  0x02,  0xe8,  0x00,  0x08,  0x6d,   // 0198
++0xe8,  0xbf,  0x60,  0x01,  0x03,  0x18,  0x48,  0xb0,   // 01a0
++0x20,  0x10,  0x89,  0x40,  0x1a,  0x40,  0x02,  0x6a,   // 01a8
++0x24,  0x18,  0xa1,  0x40,  0x98,  0x40,  0xf2,  0x4a,   // 01b0
++0x06,  0x1e,  0xff,  0x9f,  0xc5,  0xff,  0x21,  0xb5,   // 01b8
++0x00,  0x08,  0x98,  0x40,  0x04,  0xb0,  0x40,  0x00,   // 01c0
++0x95,  0x60,  0x80,  0x90,  0x18,  0x00,  0x48,  0xb0,   // 01c8
++0x00,  0x04,  0x41,  0x76,  0x80,  0x90,  0x13,  0x00,   // 01d0
++0x04,  0xb0,  0x00,  0x08,  0x45,  0x60,  0x91,  0x40,   // 01d8
++0xa8,  0x40,  0x80,  0x90,  0x0c,  0x00,  0x48,  0xb0,   // 01e0
++0x00,  0x04,  0x41,  0x76,  0x80,  0x90,  0x07,  0x00,   // 01e8
++0x4a,  0xb0,  0x00,  0x08,  0xf2,  0x8c,  0xdf,  0xc0,   // 01f0
++0x29,  0x03,  0xef,  0x03,  0x0c,  0xf8,  0x38,  0x80,   // 01f8
++0x80,  0x03,  0xc0,  0xf8,  0x04,  0x00,  0x0c,  0xf8,   // 0200
++0x38,  0x84,  0xc0,  0x03,  0xc0,  0xf8,  0x04,  0x00,   // 0208
++0x00,  0x60,  0xff,  0x9f,  0x79,  0xff,  0x00,  0xb0,   // 0210
++0x00,  0x04,  0xff,  0x9f,  0x85,  0xff,  0x04,  0xff,   // 0218
++0x30,  0xcc,  0x10,  0x03,  0xe0,  0xfb,  0x3e,  0x00,   // 0220
++0x04,  0xff,  0x33,  0xcc,  0x80,  0x03,  0xe0,  0xfb,   // 0228
++0x10,  0x00,  0x4c,  0xfe,  0x33,  0xcc,  0x80,  0x03,   // 0230
++0xe0,  0xfb,  0x14,  0x00,  0x80,  0x40,  0x06,  0xb0,   // 0238
++0x40,  0x00,  0x8c,  0xf8,  0x2f,  0xe0,  0x80,  0x03,   // 0240
++0xe0,  0x63,  0x00,  0x00,  0x20,  0xf7,  0xf0,  0xcf,   // 0248
++0x10,  0x03,  0x20,  0xf7,  0xb0,  0xcf,  0x11,  0x13,   // 0250
++0x20,  0xf7,  0x70,  0xcf,  0x12,  0x23,  0x20,  0xf7,   // 0258
++0x30,  0xcf,  0x13,  0x33,  0x20,  0xf7,  0xf0,  0xce,   // 0260
++0x14,  0x43,  0x20,  0xf7,  0xb0,  0xce,  0x15,  0x53,   // 0268
++0x20,  0xf7,  0x70,  0xce,  0x16,  0x63,  0x20,  0xf7,   // 0270
++0x30,  0xce,  0x17,  0x73,  0x20,  0xf7,  0xf0,  0xcd,   // 0278
++0x18,  0x83,  0x20,  0xf7,  0xb0,  0xcd,  0x19,  0x93,   // 0280
++0x20,  0xf7,  0x70,  0xcd,  0x1a,  0xa3,  0x20,  0xf7,   // 0288
++0x30,  0xcd,  0x1b,  0xb3,  0x20,  0xf7,  0xf0,  0xcc,   // 0290
++0x1c,  0xc3,  0x20,  0xf7,  0xb0,  0xcc,  0x1d,  0xd3,   // 0298
++0x20,  0xf7,  0x70,  0xcc,  0x1e,  0xe3,  0x20,  0xf7,   // 02a0
++0x30,  0xcc,  0x1f,  0xf3,  0x04,  0xff,  0x33,  0xcc,   // 02a8
++0x80,  0x03,  0xe0,  0xfb,  0x10,  0x00,  0x4c,  0xfe,   // 02b0
++0x33,  0xcc,  0x80,  0x03,  0xe0,  0xfb,  0x14,  0x00,   // 02b8
++0x00,  0xb5,  0x20,  0x00,  0x8c,  0xf8,  0x2f,  0xe0,   // 02c0
++0x80,  0x03,  0xe0,  0x63,  0x00,  0x00,  0x6f,  0x03,   // 02c8
++0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,   // 02d0
++0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,   // 02d8
 +};
 diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c
 new file mode 100644
-index 0000000000..eef98e5643
+index 0000000000..25ae294ff4
 --- /dev/null
 +++ b/libavcodec/rpi_hevcdec.c
-@@ -0,0 +1,5820 @@
+@@ -0,0 +1,6013 @@
 +/*
 + * HEVC video Decoder
 + *
@@ -27957,7 +28357,7 @@ index 0000000000..eef98e5643
 +
 +// We expect this to be called with ln = (log2_cb_size - 3) so range =  -1..3
 +// (4 not required)
-+static void set_cabac_stash(uint8_t * b_u, uint8_t * b_l, const int ln, unsigned int a)
++static void set_stash2(uint8_t * b_u, uint8_t * b_l, const int ln, unsigned int a)
 +{
 +    switch (ln)
 +    {
@@ -27984,6 +28384,18 @@ index 0000000000..eef98e5643
 +            *(uint32_t *)b_l = a;
 +            *(uint32_t *)(b_l + 4) = a;
 +            break;
++        case 4:
++            a |= a << 8;
++            a |= a << 16;
++            *(uint32_t *)b_u = a;
++            *(uint32_t *)(b_u + 4) = a;
++            *(uint32_t *)(b_u + 8) = a;
++            *(uint32_t *)(b_u + 12) = a;
++            *(uint32_t *)b_l = a;
++            *(uint32_t *)(b_l + 4) = a;
++            *(uint32_t *)(b_l + 8) = a;
++            *(uint32_t *)(b_l + 12) = a;
++            break;
 +    }
 +}
 +
@@ -28023,7 +28435,7 @@ index 0000000000..eef98e5643
 +
 +    switch (ln)
 +    {
-+        case 0:  // 1
++        default:  // 1
 +            f[0] |= 1 << sh;
 +            break;
 +        case 1:  // 3 * 2
@@ -28038,7 +28450,7 @@ index 0000000000..eef98e5643
 +            f[stride * 2] |= n;
 +            f[stride * 3] |= n;
 +            break;
-+        default:  // 0xff * 8
++        case 3:  // 0xff * 8
 +            for (n = 0; n != 8; ++n, f += stride)
 +                *f = 0xff;
 +            break;
@@ -28556,7 +28968,7 @@ index 0000000000..eef98e5643
 +}
 +
 +void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
-+                                     const HEVCFrame * const ref, const int val, const int field)
++                                     const HEVCRpiFrame * const ref, const int val, const int field)
 +{
 +    if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) {
 +        HEVCRpiContext *const fs = ref->tf.owner[field]->priv_data;
@@ -28692,34 +29104,41 @@ index 0000000000..eef98e5643
 +    av_freep(&s->cabac_stash_up);
 +    s->cabac_stash_left = NULL;  // freed with _up
 +
-+    av_freep(&s->tab_ipm);
++    av_freep(&s->mvf_up);
++    av_freep(&s->mvf_left);
++
 +    av_freep(&s->is_pcm);
++    av_freep(&s->is_intra_store);
++    s->is_intra = NULL;
++    av_freep(&s->rpl_tab);
++    s->rpl_tab_size = 0;
 +
 +    av_freep(&s->qp_y_tab);
 +    av_freep(&s->tab_slice_address);
 +    av_freep(&s->filter_slice_edges);
 +
 +    av_freep(&s->bs_horizontal);
-+    av_freep(&s->bs_vertical);
++    s->bs_vertical = NULL;  // freed with H
 +    av_freep(&s->bsf_stash_left);
 +    av_freep(&s->bsf_stash_up);
 +
++    av_freep(&s->rpl_up);
++    av_freep(&s->rpl_left);
++
 +    alloc_entry_points(&s->sh, 0);
 +
-+    av_buffer_pool_uninit(&s->tab_mvf_pool);
-+    av_buffer_pool_uninit(&s->rpl_tab_pool);
++    av_buffer_pool_uninit(&s->col_mvf_pool);
 +}
 +
 +/* allocate arrays that depend on frame dimensions */
-+static int pic_arrays_init(HEVCRpiContext *s, const HEVCRpiSPS *sps)
++static int pic_arrays_init(HEVCRpiContext * const s, const HEVCRpiSPS * const sps)
 +{
-+    int log2_min_cb_size = sps->log2_min_cb_size;
-+    int width            = sps->width;
-+    int height           = sps->height;
-+    int pic_size_in_ctb  = ((width  >> log2_min_cb_size) + 1) *
++    const unsigned int log2_min_cb_size = sps->log2_min_cb_size;
++    const unsigned int width            = sps->width;
++    const unsigned int height           = sps->height;
++    const unsigned int pic_size_in_cb   = ((width  >> log2_min_cb_size) + 1) *
 +                           ((height >> log2_min_cb_size) + 1);
-+    int ctb_count        = sps->ctb_width * sps->ctb_height;
-+    int min_pu_size      = sps->min_pu_width * sps->min_pu_height;
++    const unsigned int ctb_count        = sps->ctb_size;
 +
 +    {
 +        unsigned int w = ((width + HEVC_RPI_BS_STRIDE1_PEL_MASK) & ~HEVC_RPI_BS_STRIDE1_PEL_MASK);
@@ -28739,36 +29158,45 @@ index 0000000000..eef98e5643
 +    if (s->cabac_stash_up == NULL)
 +        goto fail;
 +
-+    s->tab_ipm  = av_mallocz(min_pu_size);
++    // Round width up to max ctb size
++    s->mvf_up = av_malloc((((width + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up));
++    // * Only needed if we have H tiles
++    s->mvf_left = av_malloc((((height + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up));
++
 +    // We can overread by 1 line & one byte in deblock so alloc & zero
 +    // We don't need to zero the extra @ start of frame as it will never be
 +    // written
 +    s->is_pcm   = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1);
-+    if (!s->tab_ipm || !s->is_pcm)
++    s->is_intra_store = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1);
++    if (s->is_pcm == NULL || s->is_intra_store == NULL)
 +        goto fail;
 +
 +    s->filter_slice_edges = av_mallocz(ctb_count);
-+    s->tab_slice_address  = av_malloc_array(pic_size_in_ctb,
++    s->tab_slice_address  = av_malloc_array(ctb_count,
 +                                      sizeof(*s->tab_slice_address));
-+    s->qp_y_tab           = av_malloc_array(pic_size_in_ctb,
++    s->qp_y_tab           = av_malloc_array(pic_size_in_cb,
 +                                      sizeof(*s->qp_y_tab));
 +    if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address)
 +        goto fail;
 +
-+    s->bs_horizontal = av_mallocz(s->bs_size);
-+    s->bs_vertical  = av_mallocz(s->bs_size);
-+    if (s->bs_horizontal == NULL || s->bs_vertical == NULL)
++    s->bs_horizontal = av_mallocz(s->bs_size * 2);
++    s->bs_vertical   = s->bs_horizontal + s->bs_size;
++    if (s->bs_horizontal == NULL)
++        goto fail;
++
++    s->rpl_up = av_mallocz(sps->ctb_width * sizeof(*s->rpl_up));
++    s->rpl_left = av_mallocz(sps->ctb_height * sizeof(*s->rpl_left));
++    if (s->rpl_left == NULL || s->rpl_up == NULL)
 +        goto fail;
 +
 +    if ((s->bsf_stash_left = av_mallocz(((height + 63) & ~63) >> 4)) == NULL ||
 +        (s->bsf_stash_up   = av_mallocz(((width + 63) & ~63) >> 4)) == NULL)
 +        goto fail;
 +
-+    s->tab_mvf_pool = av_buffer_pool_init(min_pu_size * sizeof(MvField),
-+                                          av_buffer_allocz);
-+    s->rpl_tab_pool = av_buffer_pool_init(ctb_count * sizeof(RefPicListTab),
++    s->col_mvf_stride = (width + 15) >> 4;
++    s->col_mvf_pool = av_buffer_pool_init(((height + 15) >> 4) * s->col_mvf_stride * sizeof(ColMvField),
 +                                          av_buffer_allocz);
-+    if (!s->tab_mvf_pool || !s->rpl_tab_pool)
++    if (s->col_mvf_pool == NULL)
 +        goto fail;
 +
 +    return 0;
@@ -29186,10 +29614,9 @@ index 0000000000..eef98e5643
 +        if (s->ps.pps->dependent_slice_segments_enabled_flag)
 +            sh->dependent_slice_segment_flag = get_bits1(gb);
 +
-+        slice_address_length = av_ceil_log2(s->ps.sps->ctb_width *
-+                                            s->ps.sps->ctb_height);
++        slice_address_length = av_ceil_log2(s->ps.sps->ctb_size);
 +        sh->slice_segment_addr = get_bitsz(gb, slice_address_length);
-+        if (sh->slice_segment_addr >= s->ps.sps->ctb_width * s->ps.sps->ctb_height) {
++        if (sh->slice_segment_addr >= s->ps.sps->ctb_size) {
 +            av_log(s->avctx, AV_LOG_ERROR,
 +                   "Invalid slice segment address: %u.\n",
 +                   sh->slice_segment_addr);
@@ -30043,11 +30470,11 @@ index 0000000000..eef98e5643
 +}
 +
 +
-+static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCFrame * const ref,
-+                                const Mv * const mv, const int y0, const int height)
++static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCRpiFrame * const ref,
++                                const MvXY xy, const int y0, const int height)
 +{
 +    if (s->threads_type != 0) {
-+        const int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9);
++        const int y = FFMAX(0, (MV_Y(xy) >> 2) + y0 + height + 9);
 +
 +        // Progress has to be attached to current job as the actual wait
 +        // is in worker_core which can't use lc
@@ -30060,8 +30487,8 @@ index 0000000000..eef98e5643
 +
 +static void hevc_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
 +                                  const int x0, const int y0, const int nPbW,
-+                                  const int nPbH, const int log2_cb_size, const int part_idx,
-+                                  const int merge_idx, MvField * const mv)
++                                  const int nPbH,
++                                  HEVCRpiMvField * const mv)
 +{
 +    enum InterPredIdc inter_pred_idc = PRED_L0;
 +    int mvp_flag;
@@ -30072,34 +30499,33 @@ index 0000000000..eef98e5643
 +        inter_pred_idc = ff_hevc_rpi_inter_pred_idc_decode(lc, nPbW, nPbH);
 +
 +    if (inter_pred_idc != PRED_L1) {
++        MvXY mvd;
++
 +        if (s->sh.nb_refs[L0])
 +            mv->ref_idx[0]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L0]);
 +
 +        mv->pred_flag = PF_L0;
-+        ff_hevc_rpi_hls_mvd_coding(lc);
++        mvd = ff_hevc_rpi_hls_mvd_coding(lc);
 +        mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc);
-+        ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, avail,
-+                                 part_idx, merge_idx, mv, mvp_flag, 0);
-+        mv->mv[0].x += lc->pu.mvd.x;
-+        mv->mv[0].y += lc->pu.mvd.y;
++        ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail,
++                                 mv, mvp_flag, 0);
++        mv->xy[0] = mvxy_add(mv->xy[0], mvd);
 +    }
 +
 +    if (inter_pred_idc != PRED_L0) {
++        MvXY mvd = 0;
++
 +        if (s->sh.nb_refs[L1])
-+            mv->ref_idx[1]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L1]);
++            mv->ref_idx[1] = ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L1]);
 +
-+        if (s->sh.mvd_l1_zero_flag == 1 && inter_pred_idc == PRED_BI) {
-+            AV_ZERO32(&lc->pu.mvd);
-+        } else {
-+            ff_hevc_rpi_hls_mvd_coding(lc);
-+        }
++        if (s->sh.mvd_l1_zero_flag != 1 || inter_pred_idc != PRED_BI)
++            mvd = ff_hevc_rpi_hls_mvd_coding(lc);
 +
 +        mv->pred_flag += PF_L1;
 +        mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc);
-+        ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, avail,
-+                                 part_idx, merge_idx, mv, mvp_flag, 1);
-+        mv->mv[1].x += lc->pu.mvd.x;
-+        mv->mv[1].y += lc->pu.mvd.y;
++        ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail,
++                                 mv, mvp_flag, 1);
++        mv->xy[1] = mvxy_add(mv->xy[1], mvd);
 +    }
 +}
 +
@@ -30217,14 +30643,14 @@ index 0000000000..eef98e5643
 +rpi_pred_y(const HEVCRpiContext *const s, HEVCRpiJob * const jb,
 +           const int x0, const int y0,
 +           const int nPbW, const int nPbH,
-+           const Mv *const mv,
++           const MvXY mv_xy,
 +           const int weight_mul,
 +           const int weight_offset,
 +           AVFrame *const src_frame)
 +{
 +    const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
-+    const unsigned int mx          = mv->x & 3;
-+    const unsigned int my          = mv->y & 3;
++    const unsigned int mx          = MV_X(mv_xy) & 3;
++    const unsigned int my          = MV_Y(mv_xy) & 3;
 +    const unsigned int my_mx       = (my << 8) | mx;
 +    const uint32_t     my2_mx2_my_mx = (my_mx << 16) | my_mx;
 +    const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame);
@@ -30235,8 +30661,8 @@ index 0000000000..eef98e5643
 +
 +    if (my_mx == 0)
 +    {
-+        const int x1 = x0 + (mv->x >> 2);
-+        const int y1 = y0 + (mv->y >> 2);
++        const int x1 = x0 + (MV_X(mv_xy) >> 2);
++        const int y1 = y0 + (MV_Y(mv_xy) >> 2);
 +        const int bh = nPbH;
 +
 +        for (int start_x = 0; start_x < nPbW; start_x += 16)
@@ -30276,8 +30702,8 @@ index 0000000000..eef98e5643
 +    }
 +    else
 +    {
-+        const int x1_m3 = x0 + (mv->x >> 2) - 3;
-+        const int y1_m3 = y0 + (mv->y >> 2) - 3;
++        const int x1_m3 = x0 + (MV_X(mv_xy) >> 2) - 3;
++        const int y1_m3 = y0 + (MV_Y(mv_xy) >> 2) - 3;
 +        const unsigned int bh = nPbH;
 +        int start_x = 0;
 +
@@ -30380,19 +30806,19 @@ index 0000000000..eef98e5643
 +rpi_pred_y_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
 +           const int x0, const int y0,
 +           const int nPbW, const int nPbH,
-+           const struct MvField *const mv_field,
++           const struct HEVCRpiMvField *const mv_field,
 +           const AVFrame *const src_frame,
 +           const AVFrame *const src_frame2)
 +{
 +    const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
-+    const Mv * const mv  = mv_field->mv + 0;
-+    const Mv * const mv2 = mv_field->mv + 1;
++    const MvXY const mv  = mv_field->xy[0];
++    const MvXY const mv2 = mv_field->xy[1];
 +
-+    const unsigned int mx          = mv->x & 3;
-+    const unsigned int my          = mv->y & 3;
++    const unsigned int mx          = MV_X(mv) & 3;
++    const unsigned int my          = MV_Y(mv) & 3;
 +    const unsigned int my_mx = (my<<8) | mx;
-+    const unsigned int mx2          = mv2->x & 3;
-+    const unsigned int my2          = mv2->y & 3;
++    const unsigned int mx2          = MV_X(mv2) & 3;
++    const unsigned int my2          = MV_Y(mv2) & 3;
 +    const unsigned int my2_mx2 = (my2<<8) | mx2;
 +    const uint32_t     my2_mx2_my_mx = (my2_mx2 << 16) | my_mx;
 +    const unsigned int ref_idx0 = mv_field->ref_idx[0];
@@ -30410,10 +30836,10 @@ index 0000000000..eef98e5643
 +
 +    if (my2_mx2_my_mx == 0)
 +    {
-+        const int x1 = x0 + (mv->x >> 2);
-+        const int y1 = y0 + (mv->y >> 2);
-+        const int x2 = x0 + (mv2->x >> 2);
-+        const int y2 = y0 + (mv2->y >> 2);
++        const int x1 = x0 + (MV_X(mv) >> 2);
++        const int y1 = y0 + (MV_Y(mv) >> 2);
++        const int x2 = x0 + (MV_X(mv2) >> 2);
++        const int y2 = y0 + (MV_Y(mv2) >> 2);
 +        const int bh = nPbH;
 +
 +        // Can do chunks a full 16 wide if we don't want the H filter
@@ -30454,10 +30880,10 @@ index 0000000000..eef98e5643
 +    else
 +    {
 +        // Filter requires a run-up of 3
-+        const int x1 = x0 + (mv->x >> 2) - 3;
-+        const int y1 = y0 + (mv->y >> 2) - 3;
-+        const int x2 = x0 + (mv2->x >> 2) - 3;
-+        const int y2 = y0 + (mv2->y >> 2) - 3;
++        const int x1 = x0 + (MV_X(mv) >> 2) - 3;
++        const int y1 = y0 + (MV_Y(mv) >> 2) - 3;
++        const int x2 = x0 + (MV_X(mv2) >> 2) - 3;
++        const int y2 = y0 + (MV_Y(mv2) >> 2) - 3;
 +        const int bh = nPbH;
 +
 +        for (int start_x=0; start_x < nPbW; start_x += 8)
@@ -30512,7 +30938,7 @@ index 0000000000..eef98e5643
 +rpi_pred_c(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
 +  const unsigned int lx, const int x0_c, const int y0_c,
 +  const int nPbW_c, const int nPbH_c,
-+  const Mv * const mv,
++  const MvXY const mv,
 +  const int16_t * const c_weights,
 +  const int16_t * const c_offsets,
 +  AVFrame * const src_frame)
@@ -30521,11 +30947,11 @@ index 0000000000..eef98e5643
 +    const int hshift = 1; // = s->ps.sps->hshift[1];
 +    const int vshift = 1; // = s->ps.sps->vshift[1];
 +
-+    const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1;
-+    const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1;
++    const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1;
++    const int y1_c = y0_c + (MV_Y(mv) >> (2 + hshift)) - 1;
 +    const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame);
-+    const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)];
-+    const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)];
++    const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_X(mv), 2 + hshift) << (1 - hshift)];
++    const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_Y(mv), 2 + vshift) << (1 - vshift)];
 +    const uint32_t wo_u = PACK2(offset_depth_adj(s, c_offsets[0]) * 2 + 1, c_weights[0]);
 +    const uint32_t wo_v = PACK2(offset_depth_adj(s, c_offsets[1]) * 2 + 1, c_weights[1]);
 +    qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
@@ -30563,7 +30989,7 @@ index 0000000000..eef98e5643
 +rpi_pred_c_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
 +  const int x0_c, const int y0_c,
 +  const int nPbW_c, const int nPbH_c,
-+  const struct MvField * const mv_field,
++  const struct HEVCRpiMvField * const mv_field,
 +  const int16_t * const c_weights,
 +  const int16_t * const c_offsets,
 +  const int16_t * const c_weights2,
@@ -30574,23 +31000,23 @@ index 0000000000..eef98e5643
 +    const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
 +    const int hshift = 1; // s->ps.sps->hshift[1];
 +    const int vshift = 1; // s->ps.sps->vshift[1];
-+    const Mv * const mv = mv_field->mv + 0;
-+    const Mv * const mv2 = mv_field->mv + 1;
++    const MvXY const mv = mv_field->xy[0];
++    const MvXY const mv2 = mv_field->xy[1];
 +
-+    const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift);
-+    const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift);
++    const unsigned int mx = av_mod_uintp2(MV_X(mv), 2 + hshift);
++    const unsigned int my = av_mod_uintp2(MV_Y(mv), 2 + vshift);
 +    const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)];
 +    const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector
-+    const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1;
-+    const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1;
++    const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1;
++    const int y1_c = y0_c + (MV_Y(mv) >> (2 + hshift)) - 1;
 +
-+    const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift);
-+    const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift);
++    const unsigned int mx2 = av_mod_uintp2(MV_X(mv2), 2 + hshift);
++    const unsigned int my2 = av_mod_uintp2(MV_Y(mv2), 2 + vshift);
 +    const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)];
 +    const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector
 +
-+    const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1;
-+    const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1;
++    const int x2_c = x0_c + (MV_X(mv2) >> (2 + hshift)) - 1;
++    const int y2_c = y0_c + (MV_Y(mv2) >> (2 + hshift)) - 1;
 +
 +    const uint32_t wo_u2 = PACK2(offset_depth_adj(s, c_offsets[0] + c_offsets2[0]) + 1, c_weights2[0]);
 +    const uint32_t wo_v2 = PACK2(offset_depth_adj(s, c_offsets[1] + c_offsets2[1]) + 1, c_weights2[1]);
@@ -30637,22 +31063,65 @@ index 0000000000..eef98e5643
 +}
 +
 +
++static inline void  // Copy one PU's motion info (mvf) into s->ref's col_mvf array, which is indexed on a 16x16 grid
++col_stash(const HEVCRpiContext * const s,
++          const unsigned int x0, const unsigned int y0, const unsigned int w0, const unsigned int h0,
++          const HEVCRpiMvField * const mvf)
++{
++    ColMvField * const col_mvf = s->ref->col_mvf;
++    const unsigned int x = (x0 + 15) >> 4;  // first 16-aligned grid column at or after x0 (x0 / 16 rounded up)
++    const unsigned int y = (y0 + 15) >> 4;  // first 16-aligned grid row at or after y0 (y0 / 16 rounded up)
++    const unsigned int w = ((x0 + 15 + w0) >> 4) - x;  // number of grid columns whose origin lies in [x0, x0 + w0)
++    const unsigned int h = ((y0 + 15 + h0) >> 4) - y;  // number of grid rows whose origin lies in [y0, y0 + h0)
++    // Store nothing if there is no col_mvf array or the block covers no 16x16 grid origin
++    if (col_mvf != NULL && w != 0 && h != 0)
++    {
++        // Only record MV from the top left of the 16x16 block
++        // Per list: poc is COL_POC_INTRA if that list's PF_Lx flag is clear, else COL_POC_MAKE_INTER(isLongTerm, list entry)
++        const RefPicList * const rpl = s->refPicList;
++        const ColMvField cmv = {
++            .L = {
++                {
++                    .poc = (mvf->pred_flag & PF_L0) == 0 ?
++                            COL_POC_INTRA :
++                            COL_POC_MAKE_INTER(rpl[0].isLongTerm[mvf->ref_idx[0]], rpl[0].list[mvf->ref_idx[0]]),
++                    .xy = mvf->xy[0]
++                },
++                {
++                    .poc = (mvf->pred_flag & PF_L1) == 0 ?
++                            COL_POC_INTRA :
++                            COL_POC_MAKE_INTER(rpl[1].isLongTerm[mvf->ref_idx[1]], rpl[1].list[mvf->ref_idx[1]]),
++                    .xy = mvf->xy[1]
++                }
++            }
++        };
++        // Fill the w x h rectangle of entries whose top-left entry is at grid position (x, y) with cmv
++        ColMvField * p = col_mvf + y * s->col_mvf_stride + x;
++        const unsigned int stride = s->col_mvf_stride - w;  // step from the end of one filled row to the start of the next
++        unsigned int j = h;
++        // do/while loops are safe here: the guard above ensures w and h are both non-zero
++        do
++        {
++            unsigned int k = w;
++            do
++            {
++                *p++ = cmv;
++            } while (--k != 0);
++            p += stride;
++        } while (--j != 0);
++    }
++}
++
 +static void hls_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+                                const int x0, const int y0,
-+                                const int nPbW, const int nPbH,
++                                const unsigned int x0, const unsigned int y0,
++                                const unsigned int nPbW, const unsigned int nPbH,
 +                                const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
 +{
 +    HEVCRpiJob * const jb = lc->jb0;
 +
-+    struct MvField current_mv = {{{ 0 }}};
-+
-+    int min_pu_width = s->ps.sps->min_pu_width;
-+
-+    MvField * const tab_mvf = s->ref->tab_mvf;
-+    const RefPicList  *const refPicList = s->ref->refPicList;
-+    const HEVCFrame *ref0 = NULL, *ref1 = NULL;
-+    int x_pu, y_pu;
-+    int i, j;
++    struct HEVCRpiMvField current_mv = {{0}};
++    const RefPicList  *const refPicList = s->refPicList;
++    const HEVCRpiFrame *ref0 = NULL, *ref1 = NULL;
 +
 +    if (lc->cu.pred_mode != MODE_SKIP)
 +        lc->pu.merge_flag = ff_hevc_rpi_merge_flag_decode(lc);
@@ -30664,28 +31133,34 @@ index 0000000000..eef98e5643
 +        ff_hevc_rpi_luma_mv_merge_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size,
 +                                   partIdx, merge_idx, &current_mv);
 +    } else {
-+        hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size,
-+                              partIdx, 0, &current_mv);
++        hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, &current_mv);
 +    }
 +
-+    x_pu = x0 >> s->ps.sps->log2_min_pu_size;
-+    y_pu = y0 >> s->ps.sps->log2_min_pu_size;
++    {
++        HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0);
++        unsigned int i, j;
 +
-+    for (j = 0; j < nPbH >> s->ps.sps->log2_min_pu_size; j++)
-+        for (i = 0; i < nPbW >> s->ps.sps->log2_min_pu_size; i++)
-+            tab_mvf[(y_pu + j) * min_pu_width + x_pu + i] = current_mv;
++        for (j = 0; j < nPbH >> LOG2_MIN_PU_SIZE; j++)
++        {
++            for (i = 0; i < nPbW >> LOG2_MIN_PU_SIZE; i++)
++                p[i] = current_mv;
++            p += MVF_STASH_WIDTH_PU;
++        }
++    }
++
++    col_stash(s, x0, y0, nPbW, nPbH, &current_mv);
 +
 +    if (current_mv.pred_flag & PF_L0) {
 +        ref0 = refPicList[0].ref[current_mv.ref_idx[0]];
 +        if (!ref0)
 +            return;
-+        hevc_await_progress(s, lc, ref0, &current_mv.mv[0], y0, nPbH);
++        hevc_await_progress(s, lc, ref0, current_mv.xy[0], y0, nPbH);
 +    }
 +    if (current_mv.pred_flag & PF_L1) {
 +        ref1 = refPicList[1].ref[current_mv.ref_idx[1]];
 +        if (!ref1)
 +            return;
-+        hevc_await_progress(s, lc, ref1, &current_mv.mv[1], y0, nPbH);
++        hevc_await_progress(s, lc, ref1, current_mv.xy[1], y0, nPbH);
 +    }
 +
 +    if (current_mv.pred_flag == PF_L0) {
@@ -30694,12 +31169,12 @@ index 0000000000..eef98e5643
 +        const int nPbW_c = nPbW >> ctx_hshift(s, 1);
 +        const int nPbH_c = nPbH >> ctx_vshift(s, 1);
 +
-+        rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.mv + 0,
++        rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[0],
 +          s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]],
 +          ref0->frame);
 +
 +        if (ctx_cfmt(s) != 0) {
-+            rpi_pred_c(s, jb, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0,
++            rpi_pred_c(s, jb, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[0],
 +              s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
 +              ref0->frame);
 +            return;
@@ -30710,12 +31185,12 @@ index 0000000000..eef98e5643
 +        const int nPbW_c = nPbW >> ctx_hshift(s, 1);
 +        const int nPbH_c = nPbH >> ctx_vshift(s, 1);
 +
-+        rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.mv + 1,
++        rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[1],
 +          s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]],
 +          ref1->frame);
 +
 +        if (ctx_cfmt(s) != 0) {
-+            rpi_pred_c(s, jb, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1,
++            rpi_pred_c(s, jb, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[1],
 +              s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
 +              ref1->frame);
 +            return;
@@ -30747,30 +31222,65 @@ index 0000000000..eef98e5643
 +                    const unsigned int log2_cb_size,
 +                    const unsigned int ipm)
 +{
-+    const unsigned int min_pu_width     = s->ps.sps->min_pu_width;
-+    const unsigned int x_pu             = x0 >> s->ps.sps->log2_min_pu_size;
-+    const unsigned int y_pu             = y0 >> s->ps.sps->log2_min_pu_size;
++    const unsigned int x_pu = x0 >> LOG2_MIN_PU_SIZE;
++    const unsigned int y_pu = y0 >> LOG2_MIN_PU_SIZE;
 +
-+    set_bytes(s->tab_ipm + y_pu * min_pu_width + x_pu, min_pu_width, log2_cb_size - s->ps.sps->log2_min_pu_size, ipm);
++    {
++        const unsigned int ctb_mask = ~(~0U << (s->ps.sps->log2_ctb_size - LOG2_MIN_PU_SIZE));
++        set_stash2(lc->ipm_left + (y_pu & ctb_mask), lc->ipm_up + (x_pu & ctb_mask), log2_cb_size - LOG2_MIN_PU_SIZE, ipm);
++    }
 +
-+    if (lc->cu.pred_mode == MODE_INTRA)
++    // If IRAP then everything is Intra & we avoid ever looking at these
++    // stashes so don't bother setting them
++    if (!s->is_irap && lc->cu.pred_mode == MODE_INTRA)
 +    {
-+        unsigned int j, k;
-+        MvField * tab_mvf     = s->ref->tab_mvf + y_pu * min_pu_width + x_pu;
-+        const unsigned int size_in_pus = (1 << log2_cb_size) >> s->ps.sps->log2_min_pu_size;
++        if (s->is_intra != NULL)
++        {
++            set_bits(s->is_intra + (y0 >> LOG2_MIN_CU_SIZE) * s->ps.sps->pcm_width, x0 >> LOG2_MIN_CU_SIZE, s->ps.sps->pcm_width, log2_cb_size - LOG2_MIN_CU_SIZE);
++        }
 +
-+        if (size_in_pus <= 1)
-+            tab_mvf[0].pred_flag = PF_INTRA;
-+        else
 +        {
-+            for (j = 0; j < size_in_pus; j++, tab_mvf += min_pu_width)
-+                for (k = 0; k < size_in_pus; k++)
-+                    tab_mvf[k].pred_flag = PF_INTRA;
++            HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0);
++            const unsigned int size_in_pus = (1 << log2_cb_size) >> LOG2_MIN_PU_SIZE; // min_pu <= log2_cb so >= 1
++            unsigned int n = size_in_pus;
++
++            do
++            {
++                memset(p, 0, size_in_pus * sizeof(*p));
++                p += MVF_STASH_WIDTH_PU;
++            } while (--n != 0);
++        }
++
++
++        if (s->ref->col_mvf != NULL && ((x0 | y0) & 0xf) == 0)
++        {
++            // Only record top left stuff
++            // Blocks should always be aligned on size boundaries
++            // so cannot have overflow from a small block
++
++            ColMvField * p = s->ref->col_mvf + (y0 >> 4) * s->col_mvf_stride + (x0 >> 4);
++            const unsigned int size_in_col = log2_cb_size < 4 ? 1 : (1 << (log2_cb_size - 4));
++            const unsigned int stride = s->col_mvf_stride - size_in_col;
++            unsigned int j = size_in_col;
++
++            do
++            {
++                unsigned int k = size_in_col;
++                do
++                {
++                    p->L[0].poc = COL_POC_INTRA;
++                    p->L[0].xy = 0;
++                    p->L[1].poc = COL_POC_INTRA;
++                    p->L[1].xy = 0;
++                    ++p;
++                } while (--k != 0);
++                p += stride;
++            } while (--j != 0);
 +        }
 +    }
 +}
 +
-+static void intra_prediction_unit_default_value(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++static inline void intra_prediction_unit_default_value(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
 +                                                const unsigned int x0, const unsigned int y0,
 +                                                const unsigned int log2_cb_size)
 +{
@@ -30786,17 +31296,14 @@ index 0000000000..eef98e5643
 +                                int prev_intra_luma_pred_flag,
 +                                const unsigned int idx)
 +{
-+    int x_pu             = x0 >> s->ps.sps->log2_min_pu_size;
-+    int y_pu             = y0 >> s->ps.sps->log2_min_pu_size;
-+    int min_pu_width     = s->ps.sps->min_pu_width;
-+    int x0b              = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size);
-+    int y0b              = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size);
++    const unsigned int ctb_mask = ~(~0U << s->ps.sps->log2_ctb_size);
++    int xb_pu             = (x0 & ctb_mask) >> LOG2_MIN_PU_SIZE;
++    int yb_pu             = (y0 & ctb_mask) >> LOG2_MIN_PU_SIZE;
 +
-+    // intra_pred_mode prediction does not cross vertical CTB boundaries
-+    const unsigned int cand_up   = y0b != 0 ?
-+                    s->tab_ipm[(y_pu - 1) * min_pu_width + x_pu] : INTRA_DC;
-+    const unsigned int cand_left = ((lc->ctb_avail & AVAIL_L) != 0  || x0b) ?
-+                    s->tab_ipm[y_pu * min_pu_width + x_pu - 1]   : INTRA_DC;
++    // Up does not cross boundaries so as we always scan 1 slice-tile-line in an
++    // lc we can just keep 1 CTB lR stashes
++    const unsigned int cand_up   = yb_pu == 0 ? INTRA_DC : lc->ipm_up[xb_pu];
++    const unsigned int cand_left = ((lc->ctb_avail & AVAIL_L) == 0  && xb_pu == 0) ? INTRA_DC : lc->ipm_left[yb_pu];
 +
 +    int intra_pred_mode;
 +    int a, b, c;
@@ -31075,7 +31582,7 @@ index 0000000000..eef98e5643
 +
 +    set_bytes(s->qp_y_tab + y_cb * min_cb_width + x_cb, min_cb_width, log2_cb_size - log2_min_cb_size, lc->qp_y & 0xff);
 +
-+    set_cabac_stash(s->cabac_stash_up + (x0 >> 3), s->cabac_stash_left + (y0 >> 3), log2_cb_size - 3, (lc->ct_depth << 1) | skip_flag);
++    set_stash2(s->cabac_stash_up + (x0 >> 3), s->cabac_stash_left + (y0 >> 3), log2_cb_size - 3, (lc->ct_depth << 1) | skip_flag);
 +
 +    return 0;
 +}
@@ -31959,11 +32466,27 @@ index 0000000000..eef98e5643
 +            zap_cabac_stash(s->cabac_stash_up + (x_ctb >> 3), s->ps.sps->log2_ctb_size - 3);
 +        if ((lc->ctb_avail & AVAIL_L) == 0)
 +            zap_cabac_stash(s->cabac_stash_left + (y_ctb >> 3), s->ps.sps->log2_ctb_size - 3);
++#if MVF_STASH_WIDTH > 64
++        // Restore left mvf stash at start of tile if not at start of line
++        if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0 && x_ctb != 0 && !s->is_irap)
++        {
++            unsigned int i;
++            HEVCRpiMvField * dst = mvf_stash_ptr(s, lc, x_ctb - 1, 0);
++            const HEVCRpiMvField * src = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE);
++            for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i)
++            {
++                *dst = *src++;
++                dst += MVF_STASH_WIDTH_PU;
++            }
++        }
++#endif
 +
 +        // Set initial tu states
 +        lc->tu.cu_qp_delta = 0;
 +        lc->tu.is_cu_qp_delta_wanted = 0;
 +        lc->tu.cu_chroma_qp_offset_wanted = 0;
++
++        // Decode
 +        more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
 +
 +        if (ff_hevc_rpi_cabac_overflow(lc))
@@ -31973,7 +32496,7 @@ index 0000000000..eef98e5643
 +        }
 +
 +        if (more_data < 0) {
-+            s->tab_slice_address[ctb_addr_rs] = -1;  // Mark slice as broken
++            s->tab_slice_address[ctb_addr_rs] = TAB_SLICE_ADDR_BROKEN;  // Mark slice as broken
 +            return more_data;
 +        }
 +
@@ -31988,6 +32511,41 @@ index 0000000000..eef98e5643
 +            }
 +        }
 +
++        // --- Post CTB processing
++
++        // Stash rpl top/left for deblock that needs to remember such things cross-slice
++        s->rpl_up[x_ctb >> s->ps.sps->log2_ctb_size] = s->refPicList;
++        s->rpl_left[y_ctb >> s->ps.sps->log2_ctb_size] = s->refPicList;
++
++        if (!s->is_irap)
++        {
++            // Copy MVF up to up-left & stash to up
++            {
++                const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb, ctb_size - 1);
++                HEVCRpiMvField * dst = s->mvf_up + (x_ctb >> LOG2_MIN_PU_SIZE);
++
++    //            printf("Stash: %d,%d, ctb_size=%d, %p->%p\n", x_ctb, y_ctb, ctb_size, src, dst);
++
++                lc->mvf_ul[0] = dst[(ctb_size - 1) >> LOG2_MIN_PU_SIZE];
++                memcpy(dst, src, (sizeof(*src)*ctb_size) >> LOG2_MIN_PU_SIZE);
++            }
++            // Stash sideways if end of tile line but not end of line (no point)
++            // ** Could/should do this @ end of fn
++#if MVF_STASH_WIDTH > 64
++            if ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOTL)
++#endif
++            {
++                unsigned int i;
++                const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb + ctb_size - 1, 0);
++                HEVCRpiMvField * dst = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE);
++                for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i)
++                {
++                    *dst++ = *src;
++                    src += MVF_STASH_WIDTH_PU;
++                }
++            }
++        }
++
 +        if ((ctb_flags & CTB_TS_FLAGS_CSAVE) != 0)
 +            ff_hevc_rpi_save_states(s, lc);
 +
@@ -32237,7 +32795,29 @@ index 0000000000..eef98e5643
 +    else
 +    {
 +        movlc(s->HEVClcList[0], lc, s->ps.pps->dependent_slice_segments_enabled_flag);
++#if MVF_STASH_WIDTH > 64
++        // Horrid calculations to work out what we want but luckily this should almost never execute
++        // **** Move to movlc
++        if (!s->is_irap)
++        {
++            const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[lc->ts];
++            if ((ctb_flags & CTB_TS_FLAGS_EOTL) == 0) // If EOTL then we have already stashed mvf
++            {
++                const unsigned int x_ctb = ((s->ps.pps->ctb_addr_ts_to_rs[lc->ts] % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size) - 1;
++                unsigned int i;
++                const HEVCRpiMvField *s_mvf = lc->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1));
++                HEVCRpiMvField *d_mvf = s->HEVClcList[0]->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1));
 +
++                for (i = 0; i != MVF_STASH_HEIGHT_PU; ++i)
++                {
++                    *d_mvf = *s_mvf;
++                    d_mvf += MVF_STASH_WIDTH_PU;
++                    s_mvf += MVF_STASH_WIDTH_PU;
++                }
++
++            }
++        }
++#endif
 +        // When all done poke the thread 0 sem_in one final time
 +#if TRACE_WPP
 +        printf("%s[%d]: Poke final %p\n", __func__, lc->lc_n, &s->HEVClcList[0]->bt_sem_in);
@@ -32541,7 +33121,7 @@ index 0000000000..eef98e5643
 +static void set_no_backward_pred(HEVCRpiContext * const s)
 +{
 +    int i, j;
-+    const RefPicList *const refPicList = s->ref->refPicList;
++    const RefPicList *const refPicList = s->refPicList;
 +
 +    s->no_backward_pred_flag = 0;
 +    if (s->sh.slice_type != HEVC_SLICE_B || !s->sh.slice_temporal_mvp_enabled_flag)
@@ -32714,24 +33294,43 @@ index 0000000000..eef98e5643
 +
 +static int hevc_frame_start(HEVCRpiContext * const s)
 +{
-+    int pic_size_in_ctb  = ((s->ps.sps->width  >> s->ps.sps->log2_min_cb_size) + 1) *
-+                           ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1);
 +    int ret;
 +
-+    memset(s->bs_horizontal, 0, s->bs_size);
-+    memset(s->bs_vertical, 0, s->bs_size);
++    memset(s->bs_horizontal, 0, s->bs_size * 2);  // Does V too
 +    memset(s->is_pcm,        0, s->ps.sps->pcm_width * s->ps.sps->pcm_height);
-+    memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
++    memset(s->tab_slice_address, -1, s->ps.sps->ctb_size * sizeof(*s->tab_slice_address));
++
++    // Only need to remember intra for CIP
++    if (!s->ps.pps->constrained_intra_pred_flag || s->is_irap)
++        s->is_intra = NULL;
++    else
++    {
++        s->is_intra = s->is_intra_store;
++        memset(s->is_intra, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height);
++    }
 +
 +    s->is_decoded        = 0;
 +    s->first_nal_type    = s->nal_unit_type;
 +
 +    s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos);
 +
++    if (s->pkt.nb_nals > s->rpl_tab_size)
++    {
++        // Free + malloc rather than realloc: realloc would do an unwanted copy
++        av_freep(&s->rpl_tab);
++        s->rpl_tab_size = 0;
++        ret = AVERROR(ENOMEM);  // ret has no value yet; set it for the goto below
++        if ((s->rpl_tab = av_malloc(s->pkt.nb_nals * sizeof(*s->rpl_tab))) == NULL)
++            goto fail;
++        s->rpl_tab_size = s->pkt.nb_nals;
++    }
++    memset(s->rpl_tab, 0, s->pkt.nb_nals * sizeof(*s->rpl_tab));
++
 +    ret = ff_hevc_rpi_set_new_ref(s, &s->frame, s->poc);
 +    if (ret < 0)
 +        goto fail;
 +
++    // Resize rpl_tab to max that we might want
 +    ret = ff_hevc_rpi_frame_rps(s);
 +    if (ret < 0) {
 +        av_log(s->avctx, AV_LOG_ERROR, "Error constructing the frame RPS.\n");
@@ -32827,6 +33426,7 @@ index 0000000000..eef98e5643
 +                        s->nal_unit_type == HEVC_NAL_RADL_N  ||
 +                        s->nal_unit_type == HEVC_NAL_RASL_N);
 +        s->offload_recon = s->threads_type != 0 && s->used_for_ref;
++        s->is_irap = IS_IRAP(s);
 +
 +#if DEBUG_DECODE_N
 +        {
@@ -32895,7 +33495,7 @@ index 0000000000..eef98e5643
 +        }
 +
 +        ctb_addr_ts = hls_slice_data(s, nal);
-+        if (ctb_addr_ts >= (s->ps.sps->ctb_width * s->ps.sps->ctb_height)) {
++        if (ctb_addr_ts >= s->ps.sps->ctb_size) {
 +            s->is_decoded = 1;
 +        }
 +
@@ -33142,7 +33742,7 @@ index 0000000000..eef98e5643
 +    return avpkt->size;
 +}
 +
-+static int hevc_ref_frame(HEVCRpiContext *s, HEVCFrame *dst, HEVCFrame *src)
++static int hevc_ref_frame(HEVCRpiContext *s, HEVCRpiFrame *dst, HEVCRpiFrame *src)
 +{
 +    int ret;
 +
@@ -33150,22 +33750,15 @@ index 0000000000..eef98e5643
 +    if (ret < 0)
 +        return ret;
 +
-+    dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
-+    if (!dst->tab_mvf_buf)
-+        goto fail;
-+    dst->tab_mvf = src->tab_mvf;
-+
-+    dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
-+    if (!dst->rpl_tab_buf)
-+        goto fail;
-+    dst->rpl_tab = src->rpl_tab;
-+
-+    dst->rpl_buf = av_buffer_ref(src->rpl_buf);
-+    if (!dst->rpl_buf)
-+        goto fail;
++    if (src->col_mvf_buf != NULL)
++    {
++        dst->col_mvf_buf = av_buffer_ref(src->col_mvf_buf);
++        if (!dst->col_mvf_buf)
++            goto fail;
++    }
++    dst->col_mvf = src->col_mvf;
 +
 +    dst->poc        = src->poc;
-+    dst->ctb_count  = src->ctb_count;
 +    dst->flags      = src->flags;
 +    dst->sequence   = src->sequence;
 +    return 0;
@@ -33532,10 +34125,10 @@ index 0000000000..eef98e5643
 +
 diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h
 new file mode 100644
-index 0000000000..ea08308be2
+index 0000000000..d324aa273c
 --- /dev/null
 +++ b/libavcodec/rpi_hevcdec.h
-@@ -0,0 +1,959 @@
+@@ -0,0 +1,1087 @@
 +/*
 + * HEVC video decoder
 + *
@@ -33574,6 +34167,7 @@ index 0000000000..ea08308be2
 +#include "rpi_hevcpred.h"
 +#include "h2645_parse.h"
 +#include "hevc.h"
++#include "rpi_hevc_mv.h"
 +#include "rpi_hevc_ps.h"
 +#include "rpi_hevc_sei.h"
 +#include "rpi_hevcdsp.h"
@@ -33581,6 +34175,10 @@ index 0000000000..ea08308be2
 +#include "thread.h"
 +#include "videodsp.h"
 +
++#if ARCH_ARM
++#include "arm/rpi_hevc_misc_neon.h"
++#endif
++
 +#define MAX_NB_THREADS 16
 +#define SHIFT_CTB_WPP 2
 +
@@ -33663,10 +34261,7 @@ index 0000000000..ea08308be2
 +
 +
 +// Min CTB size is 16
-+#if ((HEVC_RPI_MAX_WIDTH + 15) / 16) * ((HEVC_RPI_MAX_HEIGHT + 15) / 16) >= (1 << 16)
-+#error Check CTB translation array el sizes (currently uint16_t)
-+#endif
-+
++#define HEVC_RPI_MAX_CTBS (((HEVC_RPI_MAX_WIDTH + 15) / 16) * ((HEVC_RPI_MAX_HEIGHT + 15) / 16))
 +
 +/**
 + * Value of the luma sample at position (x, y) in the 2D array tab.
@@ -33791,9 +34386,9 @@ index 0000000000..ea08308be2
 +};
 +
 +typedef struct RefPicList {
-+    struct HEVCFrame *ref[HEVC_MAX_REFS];
++    struct HEVCRpiFrame *ref[HEVC_MAX_REFS];
 +    int list[HEVC_MAX_REFS];
-+    int isLongTerm[HEVC_MAX_REFS];
++    uint8_t isLongTerm[HEVC_MAX_REFS];
 +    int nb_refs;
 +} RefPicList;
 +
@@ -33820,7 +34415,6 @@ index 0000000000..ea08308be2
 +    uint8_t intra_pred_mode[4];
 +    uint8_t intra_pred_mode_c[4];
 +    uint8_t chroma_mode_c[4];
-+    Mv mvd;
 +    uint8_t merge_flag;
 +} RpiPredictionUnit;
 +
@@ -33848,19 +34442,14 @@ index 0000000000..ea08308be2
 +
 +struct HEVCRpiJob;
 +
-+typedef struct HEVCFrame {
++typedef struct HEVCRpiFrame {
 +    AVFrame *frame;
 +    ThreadFrame tf;
-+    MvField *tab_mvf;
-+    RefPicList *refPicList;
-+    RefPicListTab **rpl_tab;
-+    int ctb_count;
++    ColMvField *col_mvf;
 +    int poc;
-+    struct HEVCFrame *collocated_ref;
++    struct HEVCRpiFrame *collocated_ref;
 +
-+    AVBufferRef *tab_mvf_buf;
-+    AVBufferRef *rpl_tab_buf;
-+    AVBufferRef *rpl_buf;
++    AVBufferRef *col_mvf_buf;
 +
 +    /**
 +     * A sequence counter, so that old frames are output first
@@ -33876,7 +34465,7 @@ index 0000000000..ea08308be2
 +    // Entry no in DPB - can be used as a small unique
 +    // frame identifier (within the current thread)
 +    uint8_t dpb_no;
-+} HEVCFrame;
++} HEVCRpiFrame;
 +
 +typedef struct HEVCRpiLocalContext {
 +    HEVCRpiTransformUnit tu;
@@ -33948,6 +34537,18 @@ index 0000000000..ea08308be2
 +     * of the deblocking filter */
 +    unsigned int boundary_flags;
 +
++#define IPM_TAB_SIZE (HEVC_MAX_CTB_SIZE >> LOG2_MIN_PU_SIZE)
++    uint8_t ipm_left[IPM_TAB_SIZE];
++    uint8_t ipm_up[IPM_TAB_SIZE];
++
++//#define MVF_STASH_WIDTH       128
++#define MVF_STASH_WIDTH       64
++#define MVF_STASH_HEIGHT      64
++#define MVF_STASH_WIDTH_PU    (MVF_STASH_WIDTH >> LOG2_MIN_PU_SIZE)
++#define MVF_STASH_HEIGHT_PU   (MVF_STASH_HEIGHT >> LOG2_MIN_PU_SIZE)
++    HEVCRpiMvField mvf_ul[1];
++    HEVCRpiMvField mvf_stash[MVF_STASH_WIDTH_PU * MVF_STASH_HEIGHT_PU];
++
 +    /* +7 is for subpixel interpolation, *2 for high bit depths */
 +//    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
 +    /* The extended size between the new edge emu buffer is abused by SAO */
@@ -34199,6 +34800,7 @@ index 0000000000..ea08308be2
 +    /** 1 if the independent slice segment header was successfully parsed */
 +    uint8_t slice_initialized;
 +    char used_for_ref;  // rpi
++    char is_irap;
 +    char offload_recon;
 +    uint8_t eos;       ///< current packet contains an EOS/EOB NAL
 +    uint8_t last_eos;  ///< last packet contains an EOS/EOB NAL
@@ -34242,14 +34844,14 @@ index 0000000000..ea08308be2
 +    uint8_t *sao_pixel_buffer_h[3];
 +    uint8_t *sao_pixel_buffer_v[3];
 +
-+    AVBufferPool *tab_mvf_pool;
-+    AVBufferPool *rpl_tab_pool;
++    unsigned int col_mvf_stride;
++    AVBufferPool *col_mvf_pool;
 +
 +    RpiSAOParams *sao;
 +    DBParams *deblock;
 +    enum HEVCNALUnitType nal_unit_type;
 +    int temporal_id;  ///< temporal_id_plus1 - 1
-+    HEVCFrame *ref;
++    HEVCRpiFrame *ref;
 +    int poc;
 +    int pocTid0;
 +    int slice_idx; ///< number of the slice being currently decoded
@@ -34265,12 +34867,27 @@ index 0000000000..ea08308be2
 +    uint8_t *bsf_stash_up;
 +    uint8_t *bsf_stash_left;
 +
-+    int32_t *tab_slice_address;
++#if HEVC_RPI_MAX_CTBS >= 0xffff
++#define TAB_SLICE_ADDR_BROKEN (~(uint32_t)0)
++    uint32_t *tab_slice_address;
++#else
++#define TAB_SLICE_ADDR_BROKEN ((uint16_t)~0U)  // cast last: ~(uint16_t)0 promotes to int and yields -1
++    uint16_t *tab_slice_address;
++#endif
++
++    // Bitfield 1 bit per 8 pels (min pcm size)
++    uint8_t *is_pcm;
++    // Bitfield 1 bit per 8 pels (min cb size)
++    // Only needed for CIP as CIP processing is async to the main thread
++    uint8_t *is_intra;
 +
 +    // PU
-+    uint8_t *tab_ipm;
++    HEVCRpiMvField *mvf_up;
++    HEVCRpiMvField *mvf_left;
 +
-+    uint8_t *is_pcm;
++    const RefPicList **rpl_up;
++    const RefPicList **rpl_left;
++    RefPicList * refPicList;
 +
 +    // CTB-level flags affecting loop filter operation
 +    uint8_t *filter_slice_edges;
@@ -34297,6 +34914,11 @@ index 0000000000..ea08308be2
 +
 +    struct AVMD5 *md5_ctx;
 +
++    RefPicListTab * rpl_tab;
++    unsigned int rpl_tab_size;
++
++    uint8_t *is_intra_store;
++
 +    RpiSliceHeader sh;
 +
 +    HEVCRpiParamSets ps;
@@ -34304,7 +34926,7 @@ index 0000000000..ea08308be2
 +    HEVCRpiLocalContext    *HEVClc;
 +    HEVCRpiLocalContext    *HEVClcList[MAX_NB_THREADS];
 +
-+    HEVCFrame DPB[HEVC_DPB_ELS];
++    HEVCRpiFrame DPB[HEVC_DPB_ELS];
 +
 +    ///< candidate references for the current frame
 +    RefPicList rps[5];
@@ -34337,9 +34959,6 @@ index 0000000000..ea08308be2
 + */
 +void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s);
 +
-+const RefPicList *ff_hevc_rpi_get_ref_list(const HEVCRpiContext * const s, const HEVCFrame * const ref,
-+                                 int x0, int y0);
-+
 +/**
 + * Construct the reference picture sets for the current frame.
 + */
@@ -34366,7 +34985,7 @@ index 0000000000..ea08308be2
 +
 +void ff_hevc_rpi_bump_frame(HEVCRpiContext *s);
 +
-+void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCFrame *frame, int flags);
++void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags);
 +
 +unsigned int ff_hevc_rpi_tb_avail_flags(
 +    const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
@@ -34374,11 +34993,13 @@ index 0000000000..ea08308be2
 +
 +void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW,
 +                                int nPbH, int log2_cb_size, int part_idx,
-+                                int merge_idx, MvField * const mv);
-+void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext *lc, int x0, int y0, int nPbW,
-+                              int nPbH, int log2_cb_size, const unsigned int avail, int part_idx,
-+                              int merge_idx, MvField * const mv,
-+                              int mvp_lx_flag, int LX);
++                                int merge_idx, HEVCRpiMvField * const mv);
++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++    const unsigned int x0, const unsigned int y0,
++    const unsigned int nPbW, const unsigned int nPbH,
++    const unsigned int avail,
++    HEVCRpiMvField * const mv,
++    const unsigned int mvp_lx_flag, const unsigned int LX);
 +void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase);
 +void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
 +                                               const unsigned int x0, const unsigned int y0,
@@ -34398,14 +35019,14 @@ index 0000000000..ea08308be2
 +#endif
 +
 +void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
-+                                     const HEVCFrame * const ref, const int val, const int field);
++                                     const HEVCRpiFrame * const ref, const int val, const int field);
 +
 +void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field);
 +
 +// All of these expect that s->threads_type == FF_THREAD_FRAME
 +
 +static inline void ff_hevc_rpi_progress_wait_mv(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
-+                                     const HEVCFrame * const ref, const int y)
++                                     const HEVCRpiFrame * const ref, const int y)
 +{
 +    if (s->threads_type != 0)
 +        ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1);
@@ -34418,7 +35039,7 @@ index 0000000000..ea08308be2
 +}
 +
 +static inline void ff_hevc_rpi_progress_wait_recon(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
-+                                     const HEVCFrame * const ref, const int y)
++                                     const HEVCRpiFrame * const ref, const int y)
 +{
 +    ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0);
 +}
@@ -34440,7 +35061,7 @@ index 0000000000..ea08308be2
 +
 +// Set all done - signal nothing (used in missing refs)
 +// Works for both rpi & non-rpi
-+static inline void ff_hevc_rpi_progress_set_all_done(HEVCFrame * const ref)
++static inline void ff_hevc_rpi_progress_set_all_done(HEVCRpiFrame * const ref)
 +{
 +    if (ref->tf.progress != NULL)
 +    {
@@ -34494,13 +35115,113 @@ index 0000000000..ea08308be2
 +#define RPI_ZC_SAND128_ONLY 1
 +#endif
 +
++#ifndef ff_hevc_rpi_copy_vert
++static inline void ff_hevc_rpi_copy_vert(uint8_t *dst, const uint8_t *src,
++                                         int pixel_shift, int height,
++                                         ptrdiff_t stride_dst, ptrdiff_t stride_src)
++{   // Copy one element per row for `height` rows; dst/src advance by their own byte stride
++    int i;
++    switch (pixel_shift)  // selects the element width copied per row
++    {
++        case 2:  // 4-byte elements
++            for (i = 0; i < height; i++) {
++                *(uint32_t *)dst = *(uint32_t *)src;  // NOTE(review): assumes 4-byte aligned pointers - confirm with callers
++                dst += stride_dst;
++                src += stride_src;
++            }
++            break;
++        case 1:  // 2-byte elements
++            for (i = 0; i < height; i++) {
++                *(uint16_t *)dst = *(uint16_t *)src;  // NOTE(review): assumes 2-byte aligned pointers - confirm with callers
++                dst += stride_dst;
++                src += stride_src;
++            }
++            break;
++        default:  // 1-byte elements (pixel_shift 0 or any other value)
++            for (i = 0; i < height; i++) {
++                *dst = *src;
++                dst += stride_dst;
++                src += stride_src;
++            }
++            break;
++    }
++}
++#endif
++
++
++#if MVF_STASH_WIDTH == 64
++static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
++                               const unsigned int x, const unsigned int y)
++{   // Return the lc->mvf_stash entry for (x, y), using only the offsets of x and y within a CTB
++    const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);  // bits at and above log2_ctb_size
++    return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE));  // row * stash width + column, both in PU units; cast drops lc's const
++}
++
++static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
++                               const unsigned int x0, const unsigned int y0,
++                               const unsigned int x, const unsigned int y)
++{   // Pick the store holding (x, y) relative to the CTB that contains (x0, y0)
++    const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);  // bits at and above log2_ctb_size
++    const unsigned int x0_ctb = x0 & mask_cs_hi;  // x0 rounded down to a CTB boundary
++    const unsigned int y0_ctb = y0 & mask_cs_hi;  // y0 rounded down to a CTB boundary
++
++    return (HEVCRpiMvField *)((y < y0_ctb) ?
++        (x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE)) :  // above: single up-left entry, else up row indexed by x in PUs
++        (x < x0_ctb ? s->mvf_left + (y >> LOG2_MIN_PU_SIZE) :  // left: column indexed by y in PUs
++            lc->mvf_stash +  // otherwise: same indexing as mvf_stash_ptr()
++                ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU +
++                ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE)));
++}
++
++static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s,
++                               const unsigned int x0,
++                               const unsigned int x)
++{   // Element step between vertically adjacent entries in the store mvf_ptr() selects for column x
++    const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);  // bits at and above log2_ctb_size
++    const unsigned int x0_ctb = x0 & mask_cs_hi;  // x0 rounded down to a CTB boundary
++    return x < x0_ctb ? 1 : MVF_STASH_WIDTH_PU;  // 1 for the s->mvf_left column, a full stash row otherwise
++}
++
++#else
++static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
++                               const unsigned int x, const unsigned int y)
++{   // Variant for MVF_STASH_WIDTH != 64: the column wraps at the stash width, not at the CTB width
++    const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);  // bits at and above log2_ctb_size
++    return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1)));  // row from y offset within the CTB, column from x in PUs modulo stash width; cast drops lc's const
++}
++
++static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
++                               const unsigned int x0, const unsigned int y0,
++                               const unsigned int x, const unsigned int y)
++{   // Variant for MVF_STASH_WIDTH != 64: only rows above the CTB containing (x0, y0) bypass the stash
++    const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);  // bits at and above log2_ctb_size
++
++    const unsigned int x0_ctb = x0 & mask_cs_hi;  // x0 rounded down to a CTB boundary
++    const unsigned int y0_ctb = y0 & mask_cs_hi;  // y0 rounded down to a CTB boundary
++
++    // If not in the same CTB for Y assume up
++    if (y < y0_ctb) {
++        // If not in the same CTB for X too assume up-left
++        return (HEVCRpiMvField *)(x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE));
++    }
++    return mvf_stash_ptr(s, lc, x, y);  // everything else, including x left of the CTB, comes from the stash
++}
++
++static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s,
++                               const unsigned int x0,
++                               const unsigned int x)
++{   // Variant for MVF_STASH_WIDTH != 64: s, x0 and x are unused
++    return MVF_STASH_WIDTH_PU;  // always one stash row
++}
++#endif
++
 +#endif /* AVCODEC_RPI_HEVCDEC_H */
 diff --git a/libavcodec/rpi_hevcdsp.c b/libavcodec/rpi_hevcdsp.c
 new file mode 100644
-index 0000000000..b041e0fd3f
+index 0000000000..ac29789e7f
 --- /dev/null
 +++ b/libavcodec/rpi_hevcdsp.c
-@@ -0,0 +1,444 @@
+@@ -0,0 +1,450 @@
 +/*
 + * HEVC video decoder
 + *
@@ -34526,6 +35247,7 @@ index 0000000000..b041e0fd3f
 + */
 +
 +#include "rpi_hevcdsp.h"
++#include "rpi_hevc_mv.h"
 +
 +static const int8_t transform[32][32] = {
 +    { 64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,
@@ -34626,9 +35348,9 @@ index 0000000000..b041e0fd3f
 +#include "rpi_hevcdsp_template.c"
 +#undef BIT_DEPTH
 +
-+static uint32_t hevc_deblocking_boundary_strengths(int pus, int dup, const MvField *curr, const MvField *neigh,
++static uint32_t hevc_deblocking_boundary_strengths(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
 +                                               const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
-+                                               int in_inc)
++                                               int in_inc0, int in_inc1)
 +{
 +    int shift = 32;
 +    uint32_t bs = 0;
@@ -34636,8 +35358,13 @@ index 0000000000..b041e0fd3f
 +        int strength, out;
 +        int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
 +        int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
-+        int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]];
-+        int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]];
++        int nr_idx0 = neigh->ref_idx[0];
++        int nr_idx1 = neigh->ref_idx[1];
++        int neigh_refL0 = neigh_rpl0[nr_idx0];
++        int neigh_refL1 = neigh_rpl1[nr_idx1];
++
++        av_assert0(nr_idx0 >= 0 && nr_idx0 <=31);
++        av_assert0(nr_idx1 >= 0 && nr_idx1 <=31);
 +
 +#if 1 // This more directly matches the original implementation
 +        if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
@@ -34645,24 +35372,24 @@ index 0000000000..b041e0fd3f
 +            if (curr_refL0 == neigh_refL0 &&
 +                curr_refL0 == curr_refL1 &&
 +                neigh_refL0 == neigh_refL1) {
-+                if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
-+                     FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
-+                    (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
-+                     FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
++                if ((FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 ||
++                     FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) &&
++                    (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 ||
++                     FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4))
 +                    strength = 1;
 +                else
 +                    strength = 0;
 +            } else if (neigh_refL0 == curr_refL0 &&
 +                       neigh_refL1 == curr_refL1) {
-+                if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
-+                    FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
++                if (FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 ||
++                    FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4)
 +                    strength = 1;
 +                else
 +                    strength = 0;
 +            } else if (neigh_refL1 == curr_refL0 &&
 +                       neigh_refL0 == curr_refL1) {
-+                if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
-+                    FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
++                if (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 ||
++                    FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4)
 +                    strength = 1;
 +                else
 +                    strength = 0;
@@ -34670,24 +35397,24 @@ index 0000000000..b041e0fd3f
 +                strength = 1;
 +            }
 +        } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
-+            Mv curr_mv0, neigh_mv0;
++            MvXY curr_mv0, neigh_mv0;
 +
 +            if (curr->pred_flag & 1) {
-+                curr_mv0   = curr->mv[0];
++                curr_mv0   = curr->xy[0];
 +            } else {
-+                curr_mv0   = curr->mv[1];
++                curr_mv0   = curr->xy[1];
 +                curr_refL0 = curr_refL1;
 +            }
 +
 +            if (neigh->pred_flag & 1) {
-+                neigh_mv0   = neigh->mv[0];
++                neigh_mv0   = neigh->xy[0];
 +            } else {
-+                neigh_mv0   = neigh->mv[1];
++                neigh_mv0   = neigh->xy[1];
 +                neigh_refL0 = neigh_refL1;
 +            }
 +
 +            if (curr_refL0 == neigh_refL0) {
-+                if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4)
++                if (FFABS(MV_X(curr_mv0) - MV_X(neigh_mv0)) >= 4 || FFABS(MV_Y(curr_mv0) - MV_Y(neigh_mv0)) >= 4)
 +                    strength = 1;
 +                else
 +                    strength = 0;
@@ -34696,10 +35423,10 @@ index 0000000000..b041e0fd3f
 +        } else
 +            strength = 1;
 +#else // This has exactly the same effect, but is more suitable for vectorisation
-+        Mv curr_mv[2];
-+        Mv neigh_mv[2];
-+        memcpy(curr_mv, curr->mv, sizeof curr_mv);
-+        memcpy(neigh_mv, neigh->mv, sizeof neigh_mv);
++        MvXY curr_mv[2];
++        MvXY neigh_mv[2];
++        memcpy(curr_mv, curr->xy, sizeof curr_mv);
++        memcpy(neigh_mv, neigh->xy, sizeof neigh_mv);
 +
 +        if (!(curr->pred_flag & 2)) {
 +            curr_mv[1] = curr_mv[0];
@@ -34721,18 +35448,18 @@ index 0000000000..b041e0fd3f
 +        strength = 1;
 +
 +        strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
-+                (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) |
-+                (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4);
++                (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[0])) >= 4) |
++                (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[1])) >= 4);
 +
 +        strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
-+                (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) |
-+                (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4);
++                (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[0])) >= 4) |
++                (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[1])) >= 4);
 +
 +        strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
 +#endif
 +
-+        curr += in_inc / sizeof (MvField);
-+        neigh += in_inc / sizeof (MvField);
++        curr += in_inc0 / sizeof (HEVCRpiMvField);
++        neigh += in_inc1 / sizeof (HEVCRpiMvField);
 +
 +        for (out = dup; out > 0; out--)
 +        {
@@ -34947,10 +35674,10 @@ index 0000000000..b041e0fd3f
 +}
 diff --git a/libavcodec/rpi_hevcdsp.h b/libavcodec/rpi_hevcdsp.h
 new file mode 100644
-index 0000000000..0b532f874b
+index 0000000000..5a7cdeeb66
 --- /dev/null
 +++ b/libavcodec/rpi_hevcdsp.h
-@@ -0,0 +1,185 @@
+@@ -0,0 +1,177 @@
 +/*
 + * HEVC video decoder
 + *
@@ -34981,6 +35708,8 @@ index 0000000000..0b532f874b
 +#include "hevc.h"
 +#include "get_bits.h"
 +
++struct HEVCRpiMvField;
++
 +#define MAX_PB_SIZE 64
 +
 +#define RPI_HEVC_SAO_BUF_STRIDE 160
@@ -34995,16 +35724,6 @@ index 0000000000..0b532f874b
 +
 +} RpiSAOParams;
 +
-+typedef struct Mv {
-+    int16_t x;  ///< horizontal component of motion vector
-+    int16_t y;  ///< vertical component of motion vector
-+} Mv;
-+
-+typedef struct MvField {
-+    DECLARE_ALIGNED(4, Mv, mv)[2];
-+    int8_t ref_idx[2];
-+    int8_t pred_flag;
-+} MvField;
 +
 +// This controls how many sao dsp functions there are
 +// N=5 has width = 8, 16, 32, 48, 64
@@ -35119,9 +35838,9 @@ index 0000000000..0b532f874b
 +                                 uint8_t * src_l,
 +                                 unsigned int no_f);
 +
-+    uint32_t (*hevc_deblocking_boundary_strengths)(int pus, int dup, const MvField *curr, const MvField *neigh,
++    uint32_t (*hevc_deblocking_boundary_strengths)(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh,
 +                                               const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
-+                                               int in_inc);
++                                               int in_inc0, int in_inc1);
 +
 +    void (* cpy_blk)(uint8_t * dst, unsigned int dst_stride, const uint8_t * src, unsigned int src_stride, unsigned int width, unsigned int height);
 +} HEVCDSPContext;
@@ -37721,10 +38440,10 @@ index 0000000000..6e594277c0
 +#endif /* AVCODEC_RPI_HEVCPRED_H */
 diff --git a/libavcodec/rpi_hevcpred_template.c b/libavcodec/rpi_hevcpred_template.c
 new file mode 100644
-index 0000000000..23835a320e
+index 0000000000..2f710626cf
 --- /dev/null
 +++ b/libavcodec/rpi_hevcpred_template.c
-@@ -0,0 +1,1487 @@
+@@ -0,0 +1,1522 @@
 +/*
 + * HEVC video decoder
 + *
@@ -37755,7 +38474,6 @@ index 0000000000..23835a320e
 +#include "rpi_hevcdec.h"
 +#include "rpi_hevcpred.h"
 +
-+
 +#define DUMP_PRED 0
 +
 +#define POS(x, y) src[(x) + stride * (y)]
@@ -37889,32 +38607,78 @@ index 0000000000..23835a320e
 +
 +// Beware that this inverts the avail ordering
 +// For CIP it seems easier this way round
-+static unsigned int cip_avail(const MvField * mvf, const int mvf_stride, const unsigned int log2_pu_size, const unsigned int avail, unsigned int size,
-+                              unsigned int s0, unsigned int s1)
++static unsigned int cip_avail_l(const uint8_t * is_intra, const int i_stride, const unsigned int i_mask,
++                                const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size,
++                              unsigned int s0, unsigned int odd_s)
 +{
-+    const unsigned int n = 1 << (log2_pu_size - 2);
++    const unsigned int n = 1 << log2_intra_bits;
 +    unsigned int fa = 0;
-+    unsigned int i = 0;
++    unsigned int i;
 +
 +    size >>= 2;   // Now in 4-pel units
 +    s0 >>= 2;
-+    s1 >>= 2;
 +
-+    if ((avail & 4) != 0)
++    if ((avail & AVAIL_DL) != 0)
 +        fa |= ((1 << s0) - 1) << (size - s0);
-+    if ((avail & 2) != 0)
-+        fa |= ((1 << s1) - 1) << size;
-+    if ((avail & 1) != 0)
++    if ((avail & AVAIL_L) != 0)
++        fa |= ((1 << size) - 1) << size;
++    if ((avail & AVAIL_UL) != 0)
 +        fa |= 1 << (size << 1);
 +
-+    for (i = 0; (fa >> i) != 0; i += n, mvf += mvf_stride) {
-+        if ((fa & (((1 << n) - 1) << i)) != 0 && mvf->pred_flag != PF_INTRA)
-+            fa &= ~(((1 << n) - 1) << i);
++    if (odd_s) {
++        if ((fa & 1) != 0 && (*is_intra & i_mask) == 0)
++            fa &= ~1;
++        is_intra += i_stride;
++    }
++
++    for (i = odd_s; (fa >> i) != 0; i += n, is_intra += i_stride) {
++        const unsigned int m = ((1 << n) - 1) << i;
++        if ((fa & m) != 0 && (*is_intra & i_mask) == 0)
++            fa &= ~m;
 +    }
 +
 +    return fa;
 +}
 +
++static unsigned int cip_avail_u(const uint8_t * is_intra, unsigned int i_shift,
++                                const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size,
++                                unsigned int s1, unsigned int odd_s)
++{
++    if ((avail & (AVAIL_U | AVAIL_UR)) == 0)
++    {
++        return 0;
++    }
++    else
++    {
++        const unsigned int n = 1 << log2_intra_bits;
++        unsigned int fa = 0;
++        unsigned int i;
++        unsigned int im = ((is_intra[1] << 8) | (is_intra[0])) >> i_shift;
++
++        size >>= 2;   // Now in 4-pel units
++        s1 >>= 2;
++
++        if ((avail & AVAIL_U) != 0)
++            fa |= ((1 << size) - 1);
++        if ((avail & AVAIL_UR) != 0)
++            fa |= ((1 << s1) - 1) << size;
++
++        if (odd_s) {
++            fa &= im | ~1;
++            im >>= 1;
++        }
++
++        for (i = odd_s; (fa >> i) != 0; i += n, im >>= 1) {
++            const unsigned int m = ((1 << n) - 1) << i;
++            if ((im & 1) == 0)
++                fa &= ~m;
++        }
++        return fa;
++    }
++}
++
++
++
 +static inline unsigned int rmbd(unsigned int x)
 +{
 +#if 1
@@ -38053,14 +38817,6 @@ index 0000000000..23835a320e
 +#define EXTEND(ptr, val, len) extend_32(ptr, val, len)
 +#endif
 +
-+
-+#define PU(x) \
-+    ((x) >> s->ps.sps->log2_min_pu_size)
-+#define MVF(x, y) \
-+    (s->ref->tab_mvf[(x) + (y) * s->ps.sps->min_pu_width])
-+#define MVF_PU(x, y) \
-+    MVF(PU(x0 + ((x) * (1 << hshift))), PU(y0 + ((y) * (1 << vshift))))
-+
 +// Reqs:
 +//
 +// Planar:  DL[0], L, ul, U, UR[0]
@@ -38560,24 +39316,31 @@ index 0000000000..23835a320e
 +            src_ur += stripe_adj;
 +    }
 +
++    // Can deal with I-slices in 'normal' code even if CIP
++    // This also means that we don't need to generate (elsewhere) is_intra
++    // for IRAP frames
 +    if (s->ps.pps->constrained_intra_pred_flag == 1 &&
-+        s->sh.slice_type != HEVC_SLICE_I)  // Can deal with I-slices in 'normal' code
++        s->sh.slice_type != HEVC_SLICE_I)
 +    {
-+        const unsigned int l2_pu_s = FFMAX(s->ps.sps->log2_min_pu_size - hshift, 2);
-+        const unsigned int l2_pu_stride_s = l2_pu_s - (s->ps.sps->log2_min_pu_size - hshift);
-+
-+        unsigned int avail_l = cip_avail(&MVF_PU(-1, size * 2 - 1),
-+                                         -(int)(s->ps.sps->min_pu_width << l2_pu_stride_s),
-+                                         l2_pu_s,
-+                                         avail >> AVAIL_S_UL,
-+                                         size,
-+                                         FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size), size);
-+        unsigned int avail_u = cip_avail(&MVF_PU(0, -1),
-+                                         1 << l2_pu_stride_s,
-+                                         l2_pu_s,
-+                                         avail << 1,
-+                                         size,
-+                                         size, FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size));
++        // * If we ever actually care about CIP performance then we should
++        //   special case out size 4 stuff (can be done by 'normal') and
++        //   have 8-pel avail masks
++        unsigned int avail_l = cip_avail_l(s->is_intra + ((y + size * 2 - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + ((x - 1) >> (6 - hshift)),
++                                           -(int)(s->ps.sps->pcm_width),
++                                           1 << (((x - 1) >> (3 - hshift)) & 7),
++                                           1 - hshift,
++                                           avail,
++                                           size,
++                                           FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size),
++                                           vshift != 0 ? 0 : (y >> 2) & 1);
++
++        unsigned int avail_u = cip_avail_u(s->is_intra + ((y - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + (x >> (6 - hshift)),
++                                           (x >> (3 - hshift)) & 7,
++                                           1 - hshift,
++                                           avail,
++                                           size,
++                                           FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size),
++                                           hshift != 0 ? 0 : (x >> 2) & 1);
 +
 +        // Anything left?
 +        if ((avail_l | avail_u) == 0)
@@ -39190,16 +39953,7 @@ index 0000000000..23835a320e
 +#undef c_src_ptr_t
 +#undef c_dst_ptr_t
 +
-+#undef EXTEND_LEFT_CIP
-+#undef EXTEND_RIGHT_CIP
-+#undef EXTEND_UP_CIP
-+#undef EXTEND_DOWN_CIP
-+#undef IS_INTRA
-+#undef MVF_PU
-+#undef MVF
-+#undef PU
 +#undef EXTEND
-+#undef MIN_TB_ADDR_ZS
 +#undef POS
 +#undef PW
 +
@@ -41519,7 +42273,7 @@ index 0000000000..26fb3be999
 +#endif
 +
 diff --git a/libavfilter/Makefile b/libavfilter/Makefile
-index 3a9fb02556..32e56f6b15 100644
+index bcd5d437ff..ccb49ec8c0 100644
 --- a/libavfilter/Makefile
 +++ b/libavfilter/Makefile
 @@ -346,6 +346,7 @@ OBJS-$(CONFIG_TONEMAP_FILTER)                += vf_tonemap.o
@@ -41932,10 +42686,10 @@ index 0000000000..64578b7ac4
 +};
 +
 diff --git a/libavformat/utils.c b/libavformat/utils.c
-index f2f2cc4239..f152a3bcc2 100644
+index c25eab4d49..4db44315c7 100644
 --- a/libavformat/utils.c
 +++ b/libavformat/utils.c
-@@ -2996,6 +2996,40 @@ static int has_codec_parameters(AVStream *st, const char **errmsg_ptr)
+@@ -3005,6 +3005,40 @@ static int has_codec_parameters(AVStream *st, const char **errmsg_ptr)
      return 1;
  }
  
@@ -41976,7 +42730,7 @@ index f2f2cc4239..f152a3bcc2 100644
  /* returns 1 or 0 if or if not decoded data was returned, or a negative error */
  static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt,
                              AVDictionary **options)
-@@ -3030,7 +3064,11 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt,
+@@ -3039,7 +3073,11 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt,
          av_dict_set(options ? options : &thread_opt, "threads", "1", 0);
          if (s->codec_whitelist)
              av_dict_set(options ? options : &thread_opt, "codec_whitelist", s->codec_whitelist, 0);
@@ -41989,7 +42743,7 @@ index f2f2cc4239..f152a3bcc2 100644
          if (!options)
              av_dict_free(&thread_opt);
          if (ret < 0) {
-@@ -3061,6 +3099,14 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt,
+@@ -3070,6 +3108,14 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt,
          if (avctx->codec_type == AVMEDIA_TYPE_VIDEO ||
              avctx->codec_type == AVMEDIA_TYPE_AUDIO) {
              ret = avcodec_send_packet(avctx, &pkt);
@@ -42004,7 +42758,7 @@ index f2f2cc4239..f152a3bcc2 100644
              if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF)
                  break;
              if (ret >= 0)
-@@ -3654,9 +3700,20 @@ FF_ENABLE_DEPRECATION_WARNINGS
+@@ -3663,9 +3709,20 @@ FF_ENABLE_DEPRECATION_WARNINGS
          // Try to just open decoders, in case this is enough to get parameters.
          if (!has_codec_parameters(st, NULL) && st->request_probe <= 0) {
              if (codec && !avctx->codec)
@@ -43290,7 +44044,7 @@ index 0000000000..59c0d3959e
 +# -Wa,-ahls
 diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh
 new file mode 100755
-index 0000000000..66c455539d
+index 0000000000..40549a35e5
 --- /dev/null
 +++ b/pi-util/conf_pi2.sh
 @@ -0,0 +1,32 @@
@@ -43313,7 +44067,7 @@ index 0000000000..66c455539d
 + --disable-thumb\
 + --enable-mmal\
 + --enable-rpi\
-+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\
++ --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\
 + --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\
 + --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\
 + --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
@@ -43509,10 +44263,10 @@ index 0000000000..e9556f0837
 +
 diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py
 new file mode 100755
-index 0000000000..27cc453963
+index 0000000000..8bb326943f
 --- /dev/null
 +++ b/pi-util/ffperf.py
-@@ -0,0 +1,124 @@
+@@ -0,0 +1,125 @@
 +#!/usr/bin/env python3
 +
 +import time
@@ -43583,6 +44337,7 @@ index 0000000000..27cc453963
 +    argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename")
 +    argp.add_argument("--csv_in", help="CSV input filename")
 +    argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).")
++    argp.add_argument("--repeat", default=3, type=int, help="Run repeat count")
 +
 +    args = argp.parse_args()
 +
@@ -43617,7 +44372,7 @@ index 0000000000..27cc453963
 +        print ("====", f)
 +
 +        t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999})
-+        for i in range(3):
++        for i in range(args.repeat):
 +            t = tstats.time_file(f, prefix)
 +            print ("...", t.times_str())
 +            if t0 > t: