diff --git a/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk b/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk index 058b5a74c31..9449a1f9e28 100644 --- a/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk +++ b/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk @@ -2,8 +2,8 @@ # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv) PKG_NAME="inputstream.adaptive" -PKG_VERSION="2.2.27" -PKG_SHA256="15d1e2f05d3ddeb31a9509e9fc6c8a305a6055ba68329c717606bb895ed5aacf" +PKG_VERSION="7f0d294f7d7bbc37b7f1fe2cd2e47dd5d4c2fcfa" +PKG_SHA256="6c64725dabb29c37e022fa78469bb5a152a24cac38c85676003ecf1c7067f4bb" PKG_LICENSE="GPL" PKG_SITE="http://www.kodi.tv" PKG_URL="https://github.com/peak3d/inputstream.adaptive/archive/$PKG_VERSION.tar.gz" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk index 30bb3fd06b8..25375eb97a2 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk @@ -3,8 +3,8 @@ # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv) PKG_NAME="pvr.dvbviewer" -PKG_VERSION="b63e867740a61cf5a9c530a636069fa8ec1e20c7" -PKG_SHA256="51b51ef6ecb7ed0bfb774e6d17ede6621e84aea755038ed938ffc730e80d1d60" +PKG_VERSION="f09e3eba97a0d4d588f1d1837e361129ce5b64e9" +PKG_SHA256="246aad9a16ca160f1255c8994e55b10402c83bf4815aa1c97d06c74147beb295" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk index 8b77d569b75..ada45f99425 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk @@ -3,8 +3,8 @@ # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv) PKG_NAME="pvr.hts" 
-PKG_VERSION="7c7c6cefc785ccd5ee7015eb997e0c688258a4e0" -PKG_SHA256="c471894f9efe5b69bc10ab2bcff4d808e760cddb07f18e1a320c7f361d2b5b43" +PKG_VERSION="8b66ec3d80527f8803ba3b8db4abb34a18cfde54" +PKG_SHA256="54f0171cf3c03ad58f6e277a17d4935402f709e23fb6a659cdbd2376b4594943" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk index c05d617b94b..670f030fe77 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk @@ -3,8 +3,8 @@ # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv) PKG_NAME="pvr.mediaportal.tvserver" -PKG_VERSION="cc3eaf05e1459bc9981b00c6bed32adddca53630" -PKG_SHA256="28946df252cd1f29d34fec2fc7448d8b12a9e3249efea27ddc2284d301113b94" +PKG_VERSION="b17d5ad3ce77ad844ba3e33b50a887606bd24dd3" +PKG_SHA256="3f71ce08e0d9bf1bbda4c3012c774b05feeb913f0f093b8952c0475ff7564042" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk index 104c91583ac..a04c7d24413 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk @@ -3,8 +3,8 @@ # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv) PKG_NAME="pvr.mythtv" -PKG_VERSION="e61f47ce6c00a3efa98f2b83e5e27b0fa1d40584" -PKG_SHA256="e067e15534688eaa9a4bdd1033e7ef63dbcaa7b67f809c9b3937d5b6b7285afd" +PKG_VERSION="22aa23b80c9f9ae8dd8e39b1d29487eb519e6101" +PKG_SHA256="3f5107619d631319eb2b10590a0d02493e397d2052502676c2f928e5a7695515" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.sledovanitv.cz/package.mk 
b/packages/mediacenter/kodi-binary-addons/pvr.sledovanitv.cz/package.mk index 595ecb877db..459d58cbc9a 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.sledovanitv.cz/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.sledovanitv.cz/package.mk @@ -2,8 +2,8 @@ # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv) PKG_NAME="pvr.sledovanitv.cz" -PKG_VERSION="293c7fce7e811d305caf43e55019534c093df533" -PKG_SHA256="f405aed37bcddc85010388e47bfccca61efa7a5b32fcedee15e0c118e44f650c" +PKG_VERSION="fcc62d88f50a6b49a18bbb51323338855f3be873" +PKG_SHA256="d6f313892c865d043ce4a823b22739b2f2e07741c1cffcdb31d48a8817b3c956" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPLv2" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk index 8e53a00979a..c7d0af3e345 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk @@ -3,8 +3,8 @@ # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv) PKG_NAME="pvr.vuplus" -PKG_VERSION="016a4ff6d8c607a0119e825605f8f83a073fc662" -PKG_SHA256="c9a42c2f208cb4c0b833c93397e47463b1314099203dce4e64f3967ff79907dd" +PKG_VERSION="6b36662707a096753e834d66d6c2a9c32dbdc240" +PKG_SHA256="5eec48c068a39b1c7ef9b46c12578ffa096dffa3818eff6e614c9ac77bf6312f" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk index f4e4e4a2157..45e30868906 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk @@ -2,8 +2,8 @@ # Copyright (C) 2017-present Team LibreELEC (https://libreelec.tv) PKG_NAME="pvr.zattoo" -PKG_VERSION="12134c8659fffd564505ebe1eb01ecc49f5e3cdc" -PKG_SHA256="859e25c0f233be46eed7889bfabde6191e930a33cc78470c1aa32b264c6f6955" 
+PKG_VERSION="8de69a10cd3c68e21a8ebb0c6b46111a5f9c8d66" +PKG_SHA256="0aa8fb78d84c127f103c374e922bd361199b8869ba5333b569fa685cccb6a774" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/visualization.pictureit/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.pictureit/package.mk index 66b5c1690b2..73c44dbe9bd 100644 --- a/packages/mediacenter/kodi-binary-addons/visualization.pictureit/package.mk +++ b/packages/mediacenter/kodi-binary-addons/visualization.pictureit/package.mk @@ -3,8 +3,8 @@ # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv) PKG_NAME="visualization.pictureit" -PKG_VERSION="11063b29c238a6e81d7e779f18933140221ac439" -PKG_SHA256="94f0576a59a3bd08cfc6be94cd5b2ec8f57e7dd86bf9f5c41bc0c82a3f47f78d" +PKG_VERSION="f08d0aa6d5f80cfa95a24adca48be15e333ca8e0" +PKG_SHA256="f29112d232907b46a738e6971f03c75ec6a61b70be70de203db82b2252ae4f8d" PKG_REV="2" PKG_ARCH="x86_64" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi/config/appliance.xml b/packages/mediacenter/kodi/config/appliance.xml index 8889ff1410b..7831707eebd 100644 --- a/packages/mediacenter/kodi/config/appliance.xml +++ b/packages/mediacenter/kodi/config/appliance.xml @@ -4,9 +4,6 @@
- - true - false diff --git a/packages/mediacenter/kodi/package.mk b/packages/mediacenter/kodi/package.mk index 0428ef9f18f..e1c6aa6efa3 100644 --- a/packages/mediacenter/kodi/package.mk +++ b/packages/mediacenter/kodi/package.mk @@ -15,14 +15,14 @@ PKG_PATCH_DIRS="$KODI_VENDOR" case $KODI_VENDOR in raspberrypi) - PKG_VERSION="newclock5_18.0b1v2-Leia" - PKG_SHA256="7434263c55aa528f3e3d8f455cffe3148e3707a1c1068f80bd08829094e16576" + PKG_VERSION="newclock5_18.0b2-Leia" + PKG_SHA256="28ba41ea6a942f4399b98e300596ea4a85ac043c8358c9eae9f2d0e0bee9aa99" PKG_URL="https://github.com/popcornmix/xbmc/archive/$PKG_VERSION.tar.gz" PKG_SOURCE_NAME="kodi-$KODI_VENDOR-$PKG_VERSION.tar.gz" ;; *) - PKG_VERSION="18.0b1v2-Leia" - PKG_SHA256="3808aa97723b710a0774261116e3387f091bc3d8150b9ba49ef36cb30b3d7ba2" + PKG_VERSION="18.0b2-Leia" + PKG_SHA256="25fc0aabfb523d4db19e08b1990d4851592ee2adec0424f5fb729bd3672eae69" PKG_URL="https://github.com/xbmc/xbmc/archive/$PKG_VERSION.tar.gz" PKG_SOURCE_NAME="kodi-$PKG_VERSION.tar.gz" ;; diff --git a/packages/mediacenter/kodi/patches/kodi-995.01-pr14354_wrapper_toolchain_nm.patch b/packages/mediacenter/kodi/patches/kodi-995.01-pr14354_wrapper_toolchain_nm.patch deleted file mode 100644 index 92b634da191..00000000000 --- a/packages/mediacenter/kodi/patches/kodi-995.01-pr14354_wrapper_toolchain_nm.patch +++ /dev/null @@ -1,17 +0,0 @@ -wrapper.def: - - make nm binary configurable (-DCMAKE_NM=..) 
- - fail if an empty file is generated - -diff --git a/xbmc/cores/DllLoader/exports/CMakeLists.txt b/xbmc/cores/DllLoader/exports/CMakeLists.txt -index 580a779fdc..efcd872cad 100644 ---- a/xbmc/cores/DllLoader/exports/CMakeLists.txt -+++ b/xbmc/cores/DllLoader/exports/CMakeLists.txt -@@ -16,7 +16,7 @@ elseif(NOT CORE_SYSTEM_NAME STREQUAL windows AND NOT CORE_SYSTEM_NAME STREQUAL w - add_options(C ALL_BUILDS "-fPIC") - add_library(wrapper OBJECT wrapper.c) - -- add_custom_target(wrapper.def ALL nm ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/wrapper.dir/wrapper.c.o | grep __wrap | awk '{ printf(\"%s \", \$\$3) }' | sed \"s/___wrap_/__wrap_/g\" | sed \"s/__wrap_/-Wl,-wrap,/g\" > wrapper.def) -+ add_custom_target(wrapper.def ALL ${CMAKE_NM} ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/wrapper.dir/wrapper.c.o | grep __wrap | awk '{ printf(\"%s \", \$\$3) }' | sed \"s/___wrap_/__wrap_/g\" | sed \"s/__wrap_/-Wl,-wrap,/g\" > wrapper.def && test -s wrapper.def) - - if(CORE_SYSTEM_NAME STREQUAL android) - add_custom_command(TARGET wrapper.def COMMAND echo \"-L${DEPENDS_PATH}/lib/dummy-lib${APP_NAME_LC} -l${APP_NAME_LC}\" >> wrapper.def) diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch index 1b4d0da9066..1d65823f3da 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch @@ -19,7 +19,7 @@ index 0e57cb0b4c..b2e3374fea 100644 /ffplay /ffprobe diff --git a/configure b/configure -index dee507cb6a..9a93189107 100755 +index 827abfe694..28f630068e 100755 --- a/configure +++ b/configure @@ -318,6 +318,7 @@ External library support: @@ -55,7 +55,7 @@ index dee507cb6a..9a93189107 100755 huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp" huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp" iac_decoder_select="imc_decoder" -@@ -3392,6 
+3397,8 @@ tinterlace_filter_deps="gpl" +@@ -3393,6 +3398,8 @@ tinterlace_filter_deps="gpl" tinterlace_merge_test_deps="tinterlace_filter" tinterlace_pad_test_deps="tinterlace_filter" tonemap_filter_deps="const_nan" @@ -65,7 +65,7 @@ index dee507cb6a..9a93189107 100755 uspp_filter_deps="gpl avcodec" vaguedenoiser_filter_deps="gpl" diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c -index 4dbe72186d..0e48ecb9da 100644 +index c0214c42d8..faaea5772a 100644 --- a/fftools/ffmpeg.c +++ b/fftools/ffmpeg.c @@ -24,6 +24,12 @@ @@ -409,7 +409,7 @@ index 4dbe72186d..0e48ecb9da 100644 break; } -@@ -2887,6 +3166,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) +@@ -2891,6 +3170,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) ist->dec_ctx->opaque = ist; ist->dec_ctx->get_format = get_format; ist->dec_ctx->get_buffer2 = get_buffer; @@ -1681,10 +1681,10 @@ index 0000000000..0211e447a8 + diff --git a/libavcodec/arm/rpi_hevc_misc_neon.S b/libavcodec/arm/rpi_hevc_misc_neon.S new file mode 100644 -index 0000000000..3bbfb443bf +index 0000000000..200eac416e --- /dev/null +++ b/libavcodec/arm/rpi_hevc_misc_neon.S -@@ -0,0 +1,226 @@ +@@ -0,0 +1,238 @@ +#include "libavutil/arm/asm.S" +#include "neon.S" + @@ -1728,14 +1728,14 @@ index 0000000000..3bbfb443bf +.endif +.endm + -+.macro cpy_compound val, p1, p2 ++.set expected_next, 0 ++ ++.macro cpy_compound val, p1, p2, drop_thru=0 +.if \p1 + \p2 != \val +.error "Bad addition! 
\p1 + \p2 != \val" +.endif -+.if \val <= 64 -+@ As max we deal with 128 vals above 64 will never be recursed to -+100\val\(): -+ push {r11, lr} ++.if expected_next != 0 && expected_next != \val ++.error "Drop thru failure" +.endif +\val\(): + push {r0-r3} @@ -1743,7 +1743,12 @@ index 0000000000..3bbfb443bf + pop {r0-r3} + add r0, #\p1 + add r2, #\p1 ++.if \drop_thru == 0 + b \p2\()b ++.set expected_next, 0 ++.else ++.set expected_next, \p2 ++.endif +.endm + +@ ff_hevc_cpy_blks8x4_neon( @@ -1763,9 +1768,12 @@ index 0000000000..3bbfb443bf +function ff_hevc_rpi_cpy_blks8x4_neon, export=1 + ldr r12, [sp, #0] + push {r11, lr} -+ sub r12, #1 -+A adr lr, 98f -+ ubfx r12, r12, #3, #4 ++.if jent_pic ++A adr lr, 98f - 2 ++.else ++A adr lr, 98f - 4 ++.endif ++ lsr r12, #3 + ldr r11, [sp, #(8 + 4)] +.if jent_pic +A lsl r12, #1 @@ -1778,6 +1786,7 @@ index 0000000000..3bbfb443bf +.endif + +98: ++T .short 0 @ unused + jent 8f + jent 16f + jent 24f @@ -1835,8 +1844,6 @@ index 0000000000..3bbfb443bf + bgt 1b + pop {r11, pc} + -+cpy_compound 24, 16, 8 -+ +10032: + push {r11, lr} +32: @@ -1857,10 +1864,6 @@ index 0000000000..3bbfb443bf + bgt 1b + pop {r11, pc} + -+cpy_compound 40, 32, 8 -+cpy_compound 48, 32, 16 -+cpy_compound 56, 32, 24 -+ +10064: + push {r11, lr} +64: @@ -1879,14 +1882,6 @@ index 0000000000..3bbfb443bf + bgt 1b + pop {r11, pc} + -+cpy_compound 72, 64, 8 -+cpy_compound 80, 64, 16 -+cpy_compound 88, 64, 24 -+cpy_compound 96, 64, 32 -+cpy_compound 104, 64, 40 -+cpy_compound 112, 64, 48 -+cpy_compound 120, 64, 56 -+ +128: + push {r4, r5} + @ We could do this with fewer registers if we jump around but I @@ -1909,8 +1904,539 @@ index 0000000000..3bbfb443bf + bgt 1b + pop {r4, r5, r11, pc} + ++@ Use drop_thru where we can ++cpy_compound 104, 64, 40, 1 ++cpy_compound 40, 32, 8 ++ ++cpy_compound 112, 64, 48, 1 ++cpy_compound 48, 32, 16 ++ ++cpy_compound 120, 64, 56, 1 ++cpy_compound 56, 32, 24, 1 ++cpy_compound 24, 16, 8 ++ ++cpy_compound 72, 64, 8 ++cpy_compound 80, 64, 16 
++cpy_compound 88, 64, 24 ++cpy_compound 96, 64, 32 ++ ++ +endfunc + +diff --git a/libavcodec/arm/rpi_hevc_misc_neon.h b/libavcodec/arm/rpi_hevc_misc_neon.h +new file mode 100644 +index 0000000000..9d21f6a882 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevc_misc_neon.h +@@ -0,0 +1,438 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_ARM_RPI_HEVC_MISC_H ++#define AVCODEC_ARM_RPI_HEVC_MISC_H ++ ++#include "config.h" ++#if HAVE_NEON_INLINE && !CONFIG_THUMB ++ ++static av_noinline void ff_hevc_rpi_copy_vert_v2h_neon(uint8_t *dst, const uint8_t *src, ++ int pixel_shift, int height, ++ ptrdiff_t stride_src) ++{ ++ const uint8_t *src2 = src + stride_src; ++ stride_src <<= 1; ++ switch (pixel_shift) ++ { ++ case 2: ++ __asm__ volatile ( ++ "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t" ++ "subs %[height], #4 \n\t" ++ "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.32 {d2[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.32 {d2[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.32 {d3[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.32 {d3[1]}, [%[src2]], %[stride_src] \n\t" ++ "subs %[height], 
#4 \n\t" ++ "vst1.32 {q0}, [%[dst]]! \n\t" ++ "beq 3f \n\t" ++ "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.32 {q1}, [%[dst]]! \n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vst1.32 {q0}, [%[dst]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vst1.32 {q1}, [%[dst]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [src]"+r"(src), ++ [src2]"+r"(src2), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ case 1: ++ __asm__ volatile ( ++ "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t" ++ "subs %[height], #4 \n\t" ++ "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.16 {d2[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.16 {d3[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.16 {d2[1]}, [%[src]], %[stride_src] \n\t" ++ "vld1.16 {d3[1]}, [%[src2]], %[stride_src] \n\t" ++ "vzip.16 d0, d1 \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.16 {d0}, [%[dst]]! \n\t" ++ "beq 3f \n\t" ++ "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t" ++ "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "vzip.16 d2, d3 \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.16 {d2}, [%[dst]]! 
\n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vzip.16 d0, d1 \n\t" ++ "vst1.16 {d0}, [%[dst]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vzip.16 d2, d3 \n\t" ++ "vst1.16 {d2}, [%[dst]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [src]"+r"(src), ++ [src2]"+r"(src2), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ default: ++ __asm__ volatile ( ++ "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t" ++ "subs %[height], #8 \n\t" ++ "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.8 {d2[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d3[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d2[1]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d3[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d2[2]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d3[2]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d2[3]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d3[3]}, [%[src2]], %[stride_src] \n\t" ++ "vzip.8 d0, d1 \n\t" ++ "subs %[height], #8 \n\t" ++ "vst1.8 {d0}, [%[dst]]! \n\t" ++ "beq 3f \n\t" ++ "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t" ++ "vzip.8 d2, d3 \n\t" ++ "subs %[height], #8 \n\t" ++ "vst1.8 {d2}, [%[dst]]! 
\n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vzip.8 d0, d1 \n\t" ++ "vst1.8 {d0}, [%[dst]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vzip.8 d2, d3 \n\t" ++ "vst1.8 {d2}, [%[dst]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [src]"+r"(src), ++ [src2]"+r"(src2), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ } ++} ++ ++static av_noinline void ff_hevc_rpi_copy_vert_h2v_neon(uint8_t *dst, const uint8_t *src, ++ int pixel_shift, int height, ++ ptrdiff_t stride_dst) ++{ ++ uint8_t *dst2 = dst + stride_dst; ++ stride_dst <<= 1; ++ switch (pixel_shift) ++ { ++ case 2: ++ __asm__ volatile ( ++ "subs %[height], #4 \n\t" ++ "vld1.32 {q0}, [%[src]]! \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.32 {q1}, [%[src]]! \n\t" ++ "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.32 {d1[0]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.32 {d1[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "beq 3f \n\t" ++ "vld1.32 {q0}, [%[src]]! 
\n\t" ++ "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.32 {d3[0]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.32 {d3[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.32 {d1[0]}, [%[dst]] \n\t" ++ "vst1.32 {d1[1]}, [%[dst2]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.32 {d3[0]}, [%[dst]] \n\t" ++ "vst1.32 {d3[1]}, [%[dst2]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [dst]"+r"(dst), ++ [dst2]"+r"(dst2), ++ [src]"+r"(src), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ case 1: ++ __asm__ volatile ( ++ "subs %[height], #4 \n\t" ++ "vld1.16 {d0}, [%[src]]! \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.16 {d2}, [%[src]]! \n\t" ++ "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.16 {d0[2]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.16 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "beq 3f \n\t" ++ "vld1.16 {d0}, [%[src]]! 
\n\t" ++ "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.16 {d2[2]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.16 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.16 {d0[2]}, [%[dst]] \n\t" ++ "vst1.16 {d0[3]}, [%[dst2]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.16 {d2[2]}, [%[dst]] \n\t" ++ "vst1.16 {d2[3]}, [%[dst2]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [dst]"+r"(dst), ++ [dst2]"+r"(dst2), ++ [src]"+r"(src), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ default: ++ __asm__ volatile ( ++ "subs %[height], #8 \n\t" ++ "vld1.8 {d0}, [%[src]]! \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.8 {d2}, [%[src]]! \n\t" ++ "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[6]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #8 \n\t" ++ "vst1.8 {d0[7]}, [%[dst2]], %[stride_dst] \n\t" ++ "beq 3f \n\t" ++ "vld1.8 {d0}, [%[src]]! 
\n\t" ++ "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[6]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #8 \n\t" ++ "vst1.8 {d2[7]}, [%[dst2]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[6]}, [%[dst]] \n\t" ++ "vst1.8 {d0[7]}, [%[dst2]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[6]}, [%[dst]] \n\t" ++ "vst1.8 {d2[7]}, [%[dst2]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [dst]"+r"(dst), ++ [dst2]"+r"(dst2), ++ [src]"+r"(src), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ } ++} ++ ++static av_noinline void ff_hevc_rpi_copy_vert_v2v_neon(uint8_t *dst, const uint8_t *src, ++ int pixel_shift, int height, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src) ++{ ++ int x, y; ++ switch (pixel_shift) ++ { ++ case 2: ++ __asm__ volatile ( ++ "ldr %[x], [%[src]], %[stride_src] \n\t" ++ "ldr %[y], [%[src]], %[stride_src] \n\t" ++ "str %[x], [%[dst]], %[stride_dst] \n\t" ++ "sub %[height], #2 \n\t" ++ "1: \n\t" ++ "ldr %[x], [%[src]], %[stride_src] \n\t" ++ "str %[y], [%[dst]], %[stride_dst] \n\t" ++ "ldr %[y], [%[src]], 
%[stride_src] \n\t" ++ "subs %[height], #2 \n\t" ++ "str %[x], [%[dst]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "str %[y], [%[dst]] \n\t" ++ : // Outputs ++ [x]"=&r"(x), ++ [y]"=&r"(y), ++ [src]"+r"(src), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src), ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ case 1: ++ __asm__ volatile ( ++ "ldrh %[x], [%[src]], %[stride_src] \n\t" ++ "ldrh %[y], [%[src]], %[stride_src] \n\t" ++ "strh %[x], [%[dst]], %[stride_dst] \n\t" ++ "sub %[height], #2 \n\t" ++ "1: \n\t" ++ "ldrh %[x], [%[src]], %[stride_src] \n\t" ++ "strh %[y], [%[dst]], %[stride_dst] \n\t" ++ "ldrh %[y], [%[src]], %[stride_src] \n\t" ++ "subs %[height], #2 \n\t" ++ "strh %[x], [%[dst]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "strh %[y], [%[dst]] \n\t" ++ : // Outputs ++ [x]"=&r"(x), ++ [y]"=&r"(y), ++ [src]"+r"(src), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src), ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ default: ++ __asm__ volatile ( ++ "ldrb %[x], [%[src]], %[stride_src] \n\t" ++ "ldrb %[y], [%[src]], %[stride_src] \n\t" ++ "strb %[x], [%[dst]], %[stride_dst] \n\t" ++ "sub %[height], #2 \n\t" ++ "1: \n\t" ++ "ldrb %[x], [%[src]], %[stride_src] \n\t" ++ "strb %[y], [%[dst]], %[stride_dst] \n\t" ++ "ldrb %[y], [%[src]], %[stride_src] \n\t" ++ "subs %[height], #2 \n\t" ++ "strb %[x], [%[dst]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "strb %[y], [%[dst]] \n\t" ++ : // Outputs ++ [x]"=&r"(x), ++ [y]"=&r"(y), ++ [src]"+r"(src), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src), ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ } ++} ++ ++#define ff_hevc_rpi_copy_vert ff_hevc_rpi_copy_vert_neon ++static inline void ff_hevc_rpi_copy_vert_neon(uint8_t *dst, const uint8_t *src, ++ int pixel_shift, int height, ++ ptrdiff_t stride_dst, 
ptrdiff_t stride_src) ++{ ++ if (stride_dst == 1 << pixel_shift) ++ ff_hevc_rpi_copy_vert_v2h_neon(dst, src, pixel_shift, height, stride_src); ++ else if (stride_src == 1 << pixel_shift) ++ ff_hevc_rpi_copy_vert_h2v_neon(dst, src, pixel_shift, height, stride_dst); ++ else ++ ff_hevc_rpi_copy_vert_v2v_neon(dst, src, pixel_shift, height, stride_dst, stride_src); ++} ++ ++#endif /* HAVE_NEON_INLINE */ ++ ++#endif /* AVCODEC_ARM_RPI_HEVC_MISC_H */ +diff --git a/libavcodec/arm/rpi_hevc_mv_arm.h b/libavcodec/arm/rpi_hevc_mv_arm.h +new file mode 100644 +index 0000000000..c73de55a48 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevc_mv_arm.h +@@ -0,0 +1,64 @@ ++#ifndef AVCODEC_ARM_RPI_HEVC_MV_H ++#define AVCODEC_ARM_RPI_HEVC_MV_H ++ ++#if HAVE_ARMV6T2_INLINE ++static inline MvXY mvxy_add_arm(const MvXY a, const MvXY b) ++{ ++ MvXY r; ++ __asm__ ( ++ "sadd16 %[r], %[a], %[b] \n\t" ++ : [r]"=r"(r) ++ : [a]"r"(a), ++ [b]"r"(b) ++ : ++ ); ++ return r; ++} ++#define mvxy_add mvxy_add_arm ++#endif ++ ++#if HAVE_ARMV6T2_INLINE ++#if (defined(__ARM_ARCH_EXT_IDIV__) || defined (__ARM_FEATURE_IDIV)) ++static inline int32_t mv_scale_xy_arm(int32_t xy, int td, int tb) ++{ ++ int t; ++ __asm__ ( ++ "ssat %[td], #8, %[td] \n\t" ++ "ssat %[tb], #8, %[tb] \n\t" ++ "eor %[t], %[td], %[td], asr #31 \n\t" ++ "adds %[t], %[t], %[td], lsr #31 \n\t" ++ "asr %[t], #1 \n\t" ++ "add %[t], #0x4000 \n\t" ++ "it ne \n\t" ++ "sdivne %[t], %[t], %[td] \n\t" ++ "mov %[td], #32 \n\t" ++ "smlabb %[td], %[t], %[tb], %[td] \n\t" ++ "ssat %[td], #13, %[td], asr #6 \n\t" ++ "mov %[tb], #127 \n\t" ++ "smlatb %[t], %[xy], %[td], %[tb] \n\t" ++ "smlabb %[tb], %[xy], %[td], %[tb] \n\t" ++// This takes the sign of x & y for rounding at the "wrong" point ++// (i.e. 
after adding 127) but for the range of values (-1,-127) ++// where it does the wrong thing you get the right answer (0) anyway ++ "add %[t], %[t], %[t], lsr #31 \n\t" ++ "add %[xy], %[tb], %[tb], lsr #31 \n\t" ++ "ssat %[t], #16, %[t], asr #8 \n\t" ++ "ssat %[xy], #16, %[xy], asr #8 \n\t" ++ "pkhbt %[xy], %[xy], %[t], lsl #16 \n\t" ++ : ++ [t]"=&r"(t), ++ [xy]"+r"(xy), ++ [td]"+r"(td), ++ [tb]"+r"(tb) ++ : ++ : ++ "cc" ++ ); ++ return xy; ++} ++#define mv_scale_xy mv_scale_xy_arm ++#endif ++#endif ++ ++#endif // AVCODEC_ARM_RPI_HEVC_MV_H ++ diff --git a/libavcodec/arm/rpi_hevcdsp_arm.h b/libavcodec/arm/rpi_hevcdsp_arm.h new file mode 100644 index 0000000000..62b9326532 @@ -1945,10 +2471,10 @@ index 0000000000..62b9326532 +#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */ diff --git a/libavcodec/arm/rpi_hevcdsp_deblock_neon.S b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S new file mode 100644 -index 0000000000..98512d21dc +index 0000000000..18a76a4112 --- /dev/null +++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S -@@ -0,0 +1,1625 @@ +@@ -0,0 +1,1633 @@ +/* + * Copyright (c) 2014 Seppo Tomperi + * @@ -3042,22 +3568,24 @@ index 0000000000..98512d21dc +.endm + + -+#if 1 // NEON version ++@ The NEON version is faster under ideal circumstances (i.e. 
everything in L1) ++@ But in real world testing it is ~20% slower, presumably due to code size + ++#if 0 // NEON version + -+/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const MvField *curr, const MvField *neigh, ++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh, + * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ * int in_inc) ++ * int in_inc0, int in_inc1) + */ +function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1 + mov ip, sp -+ push {a2,v1-v8,lr} -+ ldm ip, {v1-v5} ++ push {a1-a3,v1-v8,lr} ++ ldm ip, {v1-v6} + cmp a1, #2 + bls 2f + vpush {d8-d13} + sub v5, v5, #10 -+ mov v6, #32 ++ sub v6, v6, #10 +1: + vld2.32 {d0[0], d2[0]}, [a3]! + vld2.32 {d4[0], d6[0]}, [a4]! @@ -3069,7 +3597,7 @@ index 0000000000..98512d21dc + add a2, v1, a2, lsl #2 + vld1.8 {d24[0]}, [a3], v5 + add ip, v3, ip, lsl #2 -+ vld1.8 {d25[0]}, [a4], v5 ++ vld1.8 {d25[0]}, [a4], v6 + add v8, v2, v8, lsl #2 + vld1.32 {d16[0]}, [a2] + add lr, v4, lr, lsl #2 @@ -3089,7 +3617,7 @@ index 0000000000..98512d21dc + add a2, v1, a2, lsl #2 + vld1.8 {d24[2]}, [a3], v5 + add ip, v3, ip, lsl #2 -+ vld1.8 {d25[2]}, [a4], v5 ++ vld1.8 {d25[2]}, [a4], v6 + add v8, v2, v8, lsl #2 + vld1.32 {d16[1]}, [a2] + add lr, v4, lr, lsl #2 @@ -3106,7 +3634,7 @@ index 0000000000..98512d21dc + add a2, v1, a2, lsl #2 + vld1.8 {d24[4]}, [a3], v5 + add ip, v3, ip, lsl #2 -+ vld1.8 {d25[4]}, [a4], v5 ++ vld1.8 {d25[4]}, [a4], v6 + add v8, v2, v8, lsl #2 + vld1.32 {d17[0]}, [a2] + add lr, v4, lr, lsl #2 @@ -3123,7 +3651,7 @@ index 0000000000..98512d21dc + add a2, v1, a2, lsl #2 + vld1.8 {d24[6]}, [a3], v5 + add ip, v3, ip, lsl #2 -+ vld1.8 {d25[6]}, [a4], v5 ++ vld1.8 {d25[6]}, [a4], v6 + add v8, v2, v8, lsl #2 + vld1.32 {d17[1]}, [a2] + add lr, v4, lr, lsl #2 @@ -3150,16 +3678,16 @@ index 0000000000..98512d21dc + vtst.16 d30, d25, d12 + vtst.16 d31, d25, 
d13 + veor d26, d8, d9 -+ ldr lr, [sp, 6*8] ++ ldr lr, [sp, 6*8 + 1*4] + vmovl.s16 q4, d28 + vmovl.s16 q5, d29 + teq lr, #1 + vmovl.s16 q14, d30 -+ it ne -+ lslne v1, lr, #1 ++ it ne ++ lslne v1, lr, #1 + vmovl.s16 q15, d31 -+ it ne -+ rsbne v2, v1, #32 ++ it ne ++ rsbne v2, v1, #32 + vbif q0, q1, q4 + vbif q2, q3, q14 + vbif q1, q0, q5 @@ -3212,7 +3740,6 @@ index 0000000000..98512d21dc + vmov v8, s1 + vmov.u16 ip, d0[1] + vmov.u16 lr, d0[3] -+ sub v6, #8 + lsl a2, #30 + lsl v8, #30 + lsl ip, #30 @@ -3224,9 +3751,12 @@ index 0000000000..98512d21dc + orr v7, a2, v7, lsr #8 + bhi 1b + ++ mov a1, #32 ++ ldr a3, [sp, #6*8] + vpop {d8-d13} -+ mov a1, v7, lsr v6 -+ pop {a2,v1-v8,pc} ++ sub a1, a1, a3, lsl #1 ++ mov a1, v7, lsr a1 ++ pop {a2-a4,v1-v8,pc} +10: + @ Merge results into result word, with duplicates + vmul.i16 d0, d1 @@ -3234,13 +3764,12 @@ index 0000000000..98512d21dc + vmov v8, s1 + vmov.u16 ip, d0[1] + vmov.u16 lr, d0[3] -+ sub v6, v6, v1, lsl #2 + lsl a2, v2 + subs a1, #4 + lsl v8, v2 + lsl ip, v2 + lsl lr, v2 -+ ldr v2, [sp, #6*8 + 10*4 + 1*4] ++ ldr v2, [sp, #6*8 + 12*4 + 1*4] +T lsr a2, v1 +T orr a2, ip, a2 +A orr a2, ip, a2, lsr v1 @@ -3252,19 +3781,24 @@ index 0000000000..98512d21dc +T lsr a2, ip +T orr a2, v8, a2 +A orr a2, v8, a2, lsr ip -+ ldr v1, [sp, #6*8 + 10*4] ++ ldr v1, [sp, #6*8 + 12*4] +T lsr v7, lr +T orr v7, a2, v7 +A orr v7, a2, v7, lsr lr + bhi 1b + ++ mov a1, #32 ++ ldrd a3, a4, [sp, #6*8] + vpop {d8-d13} -+ mov a1, v7, lsr v6 -+ pop {a2,v1-v8,pc} ++ mls a1, a3, a4, a1 ++ mls a1, a3, a4, a1 ++ mov a1, v7, lsr a1 ++ pop {a2-a4,v1-v8,pc} + + +2: + sub v5, v5, #10 ++ sub v6, v6, #10 + vmov.u8 d16, #0 + blo 3f + vld2.32 {d0[0], d1[0]}, [a3]! 
@@ -3276,7 +3810,7 @@ index 0000000000..98512d21dc + add a2, v1, a2, lsl #2 + vld1.8 {d16[0]}, [a3], v5 + add ip, v3, ip, lsl #2 -+ vld1.8 {d16[4]}, [a4], v5 ++ vld1.8 {d16[4]}, [a4], v6 + add v8, v2, v8, lsl #2 + vld1.32 {d4[0]}, [a2] + add lr, v4, lr, lsl #2 @@ -3297,7 +3831,7 @@ index 0000000000..98512d21dc + add a2, v1, a2, lsl #2 + vld1.8 {d16[2]}, [a3], v5 + add ip, v3, ip, lsl #2 -+ vld1.8 {d16[6]}, [a4], v5 ++ vld1.8 {d16[6]}, [a4], v6 + add v8, v2, v8, lsl #2 + vld1.32 {d4[1]}, [a2] + add lr, v4, lr, lsl #2 @@ -3321,12 +3855,12 @@ index 0000000000..98512d21dc + vtst.16 d22, d16, d18 + vadd.i16 d30, d16, d17 + vswp d2, d3 -+ ldr lr, [sp] ++ ldr lr, [sp, #1*4] + vmovl.s16 q10, d20 -+ teq lr, #1 ++ teq lr, #1 + vmovl.s16 q11, d22 -+ it ne -+ lslne v1, lr, #1 ++ it ne ++ lslne v1, lr, #1 + vbif d0, d1, d20 + vbif d4, d6, d20 + vbif d3, d2, d21 @@ -3352,8 +3886,8 @@ index 0000000000..98512d21dc + vshrn.i32 d7, q11, #8 + vmovn.i32 d3, q10 + vand q0, q3, q1 -+ it ne -+ rsbne v2, v1, #32 ++ it ne ++ rsbne v2, v1, #32 + vrev16.8 q3, q3 + vand q0, q3 + vsra.u64 d30, #32 @@ -3372,7 +3906,7 @@ index 0000000000..98512d21dc + vmov.u16 a2, d0[0] + it eq + orreq a1, a2, a1, lsl #2 -+ pop {a2,v1-v8,pc} ++ pop {a2-a4,v1-v8,pc} +10: + @ Construct result word, with duplicates + cmp a1, #2 @@ -3387,7 +3921,7 @@ index 0000000000..98512d21dc +T lsleq a1, v1 +T orreq a1, a2, a1 +A orreq a1, a2, a1, lsl v1 -+ pop {a2,v1-v8,pc} ++ pop {a2-a4,v1-v8,pc} +endfunc + + @@ -3395,9 +3929,9 @@ index 0000000000..98512d21dc +#else // non-NEON version + + -+/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const MvField *curr, const MvField *neigh, ++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh, + * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ * int in_inc) ++ * int in_inc0, in_inc1) + */ +function 
ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1 + add ip, sp, #4*4 @@ -3447,12 +3981,12 @@ index 0000000000..98512d21dc +T orr v7, v5, v7 + bhi 11b + -+ ldr v5, [sp, #16*4] -+ add ip, sp, #16*4 ++ ldrd v3, v4, [sp, #16*4] + ldr a2, [sp] ++ add ip, sp, #16*4 + subs a1, a1, #1 -+ add a3, a3, v5 -+ add a4, a4, v5 ++ add a3, a3, v3 ++ add a4, a4, v4 + bhi 1b + mov a1, v7, lsr v6 + pop {a2-a4,v1-v8,pc} @@ -3803,7 +4337,7 @@ index 0000000000..109fa98c29 +} diff --git a/libavcodec/arm/rpi_hevcdsp_init_neon.c b/libavcodec/arm/rpi_hevcdsp_init_neon.c new file mode 100644 -index 0000000000..8a94a644a4 +index 0000000000..9294ab8010 --- /dev/null +++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c @@ -0,0 +1,467 @@ @@ -4038,9 +4572,9 @@ index 0000000000..8a94a644a4 + int16_t *sao_offset_val, int sao_left_class, int width, int height); + + -+uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const MvField *curr, const MvField *neigh, ++uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh, + const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ int in_inc); ++ int in_inc0, int in_inc1); +void ff_hevc_rpi_cpy_blks8x4_neon(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height); + + @@ -4268,9 +4802,9 @@ index 0000000000..8a94a644a4 +#endif + } + -+ assert(offsetof(MvField, mv) == 0); -+ assert(offsetof(MvField, ref_idx) == 8); -+ assert(offsetof(MvField, pred_flag) == 10); ++ assert(offsetof(HEVCRpiMvField, mv) == 0); ++ assert(offsetof(HEVCRpiMvField, ref_idx) == 8); ++ assert(offsetof(HEVCRpiMvField, pred_flag) == 10); + c->hevc_deblocking_boundary_strengths = ff_hevc_rpi_deblocking_boundary_strengths_neon; + c->cpy_blk = ff_hevc_rpi_cpy_blks8x4_neon; +} @@ -8133,10 +8667,10 @@ index 0000000000..21e7700174 + diff --git 
a/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S new file mode 100644 -index 0000000000..ebf12e8684 +index 0000000000..3dd9246a16 --- /dev/null +++ b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S -@@ -0,0 +1,2973 @@ +@@ -0,0 +1,2975 @@ +/* + * Copyright (c) 2018 John Cox (for Raspberry Pi) + * @@ -9698,6 +10232,8 @@ index 0000000000..ebf12e8684 + @ Standard sign + .byte 2, 5, 9, 13, 17, 21, 26, 32 + ++ .balign 2 ++ + @ Sign inverted from standards table +inv_angle: + .short 4096, 1638, 910, 630, 482, 390, 315 @@ -11813,10 +12349,10 @@ index 0000000000..75a1789c25 + diff --git a/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S new file mode 100644 -index 0000000000..6ce3d3ca8d +index 0000000000..7ea82b38fe --- /dev/null +++ b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S -@@ -0,0 +1,872 @@ +@@ -0,0 +1,902 @@ +/* + * Copyright (c) 2018 John Cox (for Raspberry Pi) + * @@ -11887,8 +12423,6 @@ index 0000000000..6ce3d3ca8d +.equ AVAIL_S_UL_N_L_C, 32 - 3 +.equ AVAIL_S_L_N_DL_C, 32 - 4 + -+.equ AVAIL_S_U_DL_CPSR, 31 - 4 @ Shift for u..dl to go into flags via cpsr -+ +@ On entry +@ r2 req +@ r3 avail @@ -11908,77 +12442,78 @@ index 0000000000..6ce3d3ca8d +@ If UR avail then d_ur == a_ur so U-filter good too +@ +@ Data load pointers (only load if req & avail): -+@ r4 DL + stride -+@ r10 L -+@ r6 U -+@ r5 UR ++@ r8 DL + stride ++@ r6 L ++@ r7 U ++@ r4 UR +@ +@ Others: -+@ r2 req -+@ r7 req & avail -+@ r3 L + stride -+@ r8 DL + stride * 2 -+@ r9 stride * 2 ++@ r2 req (if preserve_req) ++@ r3 req & avail (if preserve_req) ++@ r2 req & avail (if !preserve_req) ++@ r10 L + stride ++@ r5 DL + stride * 2 ++@ r12 stride * 2 +@ cs Load U +@ mi Load UR +@ +@ Clobbered: -+@ r12 -+ -+.macro load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur -+ -+.equ src_l\@, \sp_offset + 0 -+.equ src_u\@, \sp_offset + 4 -+.equ src_ur\@, \sp_offset + 8 -+.equ stride\@, \sp_offset + 
12 -+.equ pw\@, (1 << \pw_s) @ pel width in bytes -+.equ b_size\@, (1 << (\pw_s + \log2_s)) @ size in bytes -+ -+@ r9 stride -+@ r7 = ab_ul, r6 = a_u, r5 = a_ur -+@ r4 = b_dl, r10 = b_l, r8 = b_u -+ -+ ldr r5, [sp, #src_ur\@] -+ lsl r12, r3, #AVAIL_S_U_DL_CPSR -+ ldr r10, [sp, #src_l\@] -+ ldr r9, [sp, #stride\@] -+ ldr r6, [sp, #src_u\@] -+ -+ @ This is quite a slow instruction but it replaces -+ @ a decent number of tests that yield a max of 2 flags/op -+ @ It is annoying we can't branch on Q! -+ @ If L navail (ne) then DL must be navail (pl) -+ msr APSR_nzcvq, r12 @ n=dl, z=l, c=ul, v=u, q=ur -+ -+ mov r4, r5 -+ sub r7, r10, r9 -+ it vs -+ movvs r4, r6 -+ add r8, r6, #b_size\@ - pw\@ -+ it cs -+ movcs r4, r7 -+ ite ne -+ movne r10, r4 -+ addeq r4, r7, r9, lsl #\log2_s -+ it cc -+ movcc r7, r10 -+ it mi -+ addmi r4, r10, r9, lsl #\log2_s -+ vld1.\d_type {\d_ul}, [r7] -+ itt vc -+ movvc r8, r7 -+ movvc r6, r7 -+ vld1.\d_type {\d_l }, [r4], r9 -+ tst r3, #AVAIL_UR -+ vld1.\d_type {\d_u }, [r6] -+ it eq -+ moveq r5, r8 -+ and r7, r2, r3 -+ add r8, r4, r9 -+ vld1.\d_type {\d_ur}, [r5] -+ lsls r12, r7, #AVAIL_S_UR_N_U_C -+ add r3, r10, r9 -+ lsl r9, #1 ++@ r9, lr ++ ++.macro load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur, preserve_req, I1, I2 ++ ++.equ src_l, \sp_offset + 0 ++.equ src_u, \sp_offset + 4 ++.equ src_ur, \sp_offset + 8 ++.equ stride, \sp_offset + 12 ++.equ pw, (1 << \pw_s) @ pel width in bytes ++.equ b_size, (1 << (\pw_s + \log2_s)) @ size in bytes ++ ++ ldrd r4, r5, [sp, #src_ur] @ and stride ++ ldrd r6, r7, [sp, #src_l] @ and src_u ++ lsls lr, r3, #AVAIL_S_U_N_UL_C ++ mov r8, r4 ++ sub r9, r6, r5 ++ it mi ++ movmi r8, r7 ++ it cs ++ movcs r8, r9 ++ lsls lr, r3, #AVAIL_S_L_N_DL_C ++ ite pl ++ movpl r6, r8 ++ addmi r8, r9, r5, lsl #\log2_s ++ it cs ++ addcs r8, r6, r5, lsl #\log2_s ++ .if !\preserve_req ++ and r2, r2, r3 ++ .endif ++ add r10, r6, r5 ++ lsl r12, r5, #1 ++ lsls lr, r3, #AVAIL_S_U_N_UL_C ++ it cc ++ movcc r9, r6 ++ 
vld1.\d_type {\d_l}, [r8], r5 ++ add lr, r7, #b_size - pw ++ add r5, r8, r5 ++ itt pl ++ movpl lr, r9 ++ movpl r7, r9 ++ tst r3, #AVAIL_UR ++ vld1.\d_type {\d_ul}, [r9] ++ it eq ++ moveq r4, lr ++ \I1 ++ .if \preserve_req ++ and r3, r2, r3 ++ .else ++ lsls lr, r2, #AVAIL_S_UR_N_U_C ++ .endif ++ vld1.\d_type {\d_u}, [r7] ++ \I2 ++ vld1.\d_type {\d_ur}, [r4] ++ .if \preserve_req ++ lsls lr, r3, #AVAIL_S_UR_N_U_C ++ .endif +.endm + + @@ -12001,33 +12536,33 @@ index 0000000000..6ce3d3ca8d +.set log2_s, 2 + +function ff_hevc_rpi_intra_filter_4_neon_8, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 8, d0[], d31[7], d1[], d2[] -+ -+ it cs -+ vldrcs s2, [r6] -+ ite pl -+ vmovpl s3, s4 -+ vldrmi s3, [r5] -+ -+ lsls r7, #AVAIL_S_L_N_DL_C -+ add r12, r0, #-pw -+ bpl 1f ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d2[0], d3[], d4[], 0 + -+ vld1.8 {d0[0]}, [r10], r9 -+ vld1.8 {d0[1]}, [r3], r9 -+ vld1.8 {d0[2]}, [r10] -+ vld1.8 {d0[3]}, [r3] ++ sub r3, r0, #pw ++ it mi ++ vldrmi s7, [r4] ++ it cs ++ vldrcs s6, [r7] ++ it pl ++ vmovpl.f32 s7, s8 ++ lsls lr, r2, #AVAIL_S_L_N_DL_C ++ bpl 1f ++ vld1.8 {d0[0]}, [r6], r12 ++ vld1.8 {d1[0]}, [r10], r12 ++ vld1.8 {d0[1]}, [r6] ++ vld1.8 {d1[1]}, [r10] +1: -+ bcc 1f -+ vld1.8 {d0[5]}, [r4], r9 -+ vld1.8 {d0[6]}, [r8] -+ vld1.8 {d0[7]}, [r4] ++ bcc 1f ++ vld1.8 {d1[2]}, [r8], r12 ++ vld1.8 {d0[3]}, [r5] ++ vld1.8 {d1[3]}, [r8] +1: -+ vstr d1, [r1] @ Up -+ vst1.8 {d31[7]}, [r12] -+ vstr d0, [r0] @ Left -+ pop {r4-r10, pc} ++ vst1.8 {d2[0]}, [r3] ++ vst1.8 {d3}, [r1] ++ vzip.8 d0, d1 ++ vst1.8 {d0}, [r0] ++ pop {r4-r10, pc} +endfunc + + @@ -12049,30 +12584,31 @@ index 0000000000..6ce3d3ca8d +.set log2_s, 2 + +function ff_hevc_rpi_intra_filter_4_neon_16, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], d2[], d3[] -+ -+ it cs -+ vldrcs d2, [r6] -+ it mi -+ vldrmi d3, [r5] -+ lsls r7, #AVAIL_S_L_N_DL_C -+ add r12, r0, #-pw -+ bpl 1f -+ vld1.16 
{d0[0]}, [r10], r9 -+ vld1.16 {d0[1]}, [r3], r9 -+ vld1.16 {d0[2]}, [r10] -+ vld1.16 {d0[3]}, [r3] ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d2[0], d3[], d4[], 0 ++ ++ sub r3, r0, #pw ++ it mi ++ vldrmi d4, [r4] ++ it cs ++ vldrcs d3, [r7] ++ lsls lr, r2, #AVAIL_S_L_N_DL_C ++ bpl 1f ++ vld1.16 {d0[0]}, [r6], r12 ++ vld1.16 {d1[0]}, [r10], r12 ++ vld1.16 {d0[1]}, [r6] ++ vld1.16 {d1[1]}, [r10] +1: -+ bcc 1f -+ vld1.16 {d1[1]}, [r4], r9 -+ vld1.16 {d1[2]}, [r8] -+ vld1.16 {d1[3]}, [r4] ++ bcc 1f ++ vld1.16 {d1[2]}, [r8], r12 ++ vld1.16 {d0[3]}, [r5] ++ vld1.16 {d1[3]}, [r8] +1: -+ vst1.16 {q1}, [r1] @ Up -+ vst1.16 {d31[3]}, [r12] -+ vst1.16 {q0}, [r0] @ Left -+ pop {r4-r10, pc} ++ vst1.16 {d2[0]}, [r3] ++ vst1.16 {d3, d4}, [r1] ++ vzip.16 d0, d1 ++ vst1.16 {q0}, [r0] ++ pop {r4-r10, pc} +endfunc + + @@ -12094,72 +12630,69 @@ index 0000000000..6ce3d3ca8d +.set log2_s, 3 + +function ff_hevc_rpi_intra_filter_8_neon_8, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d31[7], d4[], d5[] -+ -+ it cs -+ vldrcs d4, [r6] -+ it mi -+ vldrmi d5, [r5] ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d3[7], d4[], d5[], 1 + -+ lsls r7, #AVAIL_S_L_N_DL_C -+ bpl 1f -+ vld1.8 {d0[0]}, [r10], r9 -+ vld1.8 {d0[1]}, [r3], r9 -+ vld1.8 {d0[2]}, [r10], r9 -+ vld1.8 {d0[3]}, [r3], r9 -+ vld1.8 {d0[4]}, [r10], r9 -+ vld1.8 {d0[5]}, [r3], r9 -+ vld1.8 {d0[6]}, [r10] -+ vld1.8 {d0[7]}, [r3] ++ it mi ++ vldrmi d5, [r4] ++ sub r0, #pw ++ it cs ++ vldrcs d4, [r7] ++ lsls lr, r3, #AVAIL_S_L_N_DL_C ++ bpl 1f ++ vld1.8 {d0[0]}, [r6], r12 ++ vld1.8 {d1[0]}, [r10], r12 ++ vld1.8 {d0[1]}, [r6], r12 ++ vld1.8 {d1[1]}, [r10], r12 ++ vld1.8 {d0[2]}, [r6], r12 ++ vld1.8 {d1[2]}, [r10], r12 ++ vld1.8 {d0[3]}, [r6] ++ vld1.8 {d1[3]}, [r10] +1: -+ bcc 1f -+ vld1.8 {d1[1]}, [r4], r9 -+ vld1.8 {d1[2]}, [r8], r9 -+ vld1.8 {d1[3]}, [r4], r9 -+ vld1.8 {d1[4]}, [r8], r9 -+ vld1.8 {d1[5]}, [r4], r9 -+ vld1.8 
{d1[6]}, [r8] -+ vld1.8 {d1[7]}, [r4] ++ bcc 1f ++ vld1.8 {d1[4]}, [r8], r12 ++ vld1.8 {d0[5]}, [r5], r12 ++ vld1.8 {d1[5]}, [r8], r12 ++ vld1.8 {d0[6]}, [r5], r12 ++ vld1.8 {d1[6]}, [r8], r12 ++ vld1.8 {d0[7]}, [r5], r12 ++ vld1.8 {d1[7]}, [r8], r12 +1: -+ tst r2, #FILTER_LIGHT -+ add r12, r0, #-pw -+ beq 10f ++ vext.8 q3, q1, q2, #15 ++ vmov.u8 r4, d5[7] @ Save final pel ++ tst r2, #FILTER_LIGHT ++ vzip.8 d0, d1 ++ beq 1f + + @ Luma light filter -+ vext.8 q8, q15, q2, #15 -+ vext.8 q12, q15, q0, #15 -+ vaddl.u8 q9, d17, d5 -+ vaddl.u8 q8, d16, d4 -+ vaddl.u8 q13, d25, d1 -+ vaddl.u8 q12, d24, d0 -+ vmov.u8 r3, d5[7] @ Save final pel -+ vmov.u8 r2, d1[7] @ Save final pel -+ -+ vext.16 q2, q8, q9, #1 -+ vext.16 q3, q9, q9, #1 -+ vext.16 q0, q12, q13, #1 -+ vext.16 q1, q13, q13, #1 -+ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0] -+ vadd.u16 q2, q8 -+ vadd.u16 q3, q9 -+ vadd.u16 q0, q12 -+ vadd.u16 q1, q13 -+ -+ vrshrn.u16 d4, q2, #2 -+ vrshrn.u16 d5, q3, #2 -+ vrshrn.u16 d0, q0, #2 -+ vrshrn.u16 d1, q1, #2 -+ vrshr.u16 d30, #2 -+ vmov.u8 d5[7], r3 @ Restore final pel -+ vmov.u8 d1[7], r2 @ Restore final pel -+ vdup.u8 d31, d30[0] @ d31[3] = d30[0] -+ -+10: -+ vst1.8 {q2 }, [r1] @ Up -+ vst1.8 {d31[7]}, [r12] @ Up-left -+ vst1.8 {q0 }, [r0] @ Left -+ pop {r4-r10, pc} ++ vaddl.u8 q8, d7, d5 ++ vext.8 q1, q1, q0, #15 ++ vaddl.u8 q2, d6, d4 ++ vaddl.u8 q3, d3, d1 ++ vaddl.u8 q9, d2, d0 ++ vext.16 q10, q8, q8, #1 ++ vext.16 q11, q3, q3, #1 ++ vadd.u16 q10, q8 ++ vadd.u16 q11, q3 ++ vadd.u16 d2, d4, d18 @ d2[0] = l[0] + 2ul + u[0] ++ vmov.u8 r5, d1[7] @ Save final pel ++ vext.16 q0, q2, q8, #1 ++ vext.16 q3, q9, q3, #1 ++ vadd.u16 q8, q0, q2 ++ vadd.u16 q3, q9 ++ vrshrn.u16 d5, q10, #2 ++ vrshrn.u16 d1, q11, #2 ++ vrshr.u16 d2, #2 ++ vrshrn.u16 d4, q8, #2 ++ vrshrn.u16 d0, q3, #2 ++ vmov.8 d5[7], r4 @ Restore final pel ++ vmov.8 d1[7], r5 @ Restore final pel ++ vdup.8 d3, d2[0] ++1: ++ vst1.8 {d3[7]}, [r0]! 
++ vst1.8 {q2}, [r1] ++ vst1.8 {q0}, [r0] ++ pop {r4-r10, pc} +endfunc + + @@ -12184,85 +12717,89 @@ index 0000000000..6ce3d3ca8d +.set p_size, (1 << log2_s) @ size in pels + +function ff_hevc_rpi_intra_filter_8_neon_16, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d4[],d5[]", "d6[],d7[]" ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d5[3], "d16[],d17[]", "d18[],d19[]", 1, \ ++ "ldr r9, [sp, #ur_size]", \ ++ "sub r0, #pw" + -+ it cs -+ vldmcs r6, {d4, d5} -+ ldr r12, [sp, #ur_size] -+ bpl 1f -+ cmp r12, #4 -+ vldm r5, {d6, d7} -+ bgt 1f -+ vdup.16 d7, d6[3] -+1: -+ lsls r12, r7, #AVAIL_S_L_N_DL_C -+ vdup.16 q1, d0[0] -+ bpl 1f -+ vld1.16 {d0[0]}, [r10], r9 -+ vld1.16 {d0[1]}, [r3], r9 -+ vld1.16 {d0[2]}, [r10], r9 -+ vld1.16 {d0[3]}, [r3], r9 -+ vld1.16 {d1[0]}, [r10], r9 -+ vld1.16 {d1[1]}, [r3], r9 -+ vld1.16 {d1[2]}, [r10] -+ vld1.16 {d1[3]}, [r3] ++ vmov q1, q0 ++ ldrh lr, [r4, #3*2] ++ it mi ++ vldmmi r4, {d18, d19} ++ it cs ++ vldmcs r7, {d16, d17} ++ itt mi ++ cmpmi r9, #p_size ++ vdupmi.16 d19, lr ++ lsls lr, r3, #AVAIL_S_L_N_DL_C ++ bpl 1f ++ vld1.16 {d0[0]}, [r6], r12 ++ vld1.16 {d2[0]}, [r10], r12 ++ vld1.16 {d0[1]}, [r6], r12 ++ vld1.16 {d2[1]}, [r10], r12 ++ vld1.16 {d0[2]}, [r6], r12 ++ vld1.16 {d2[2]}, [r10], r12 ++ vld1.16 {d0[3]}, [r6] ++ vld1.16 {d2[3]}, [r10] +1: -+ bcc 1f -+ ldr r12, [sp, #dl_size] -+ vld1.16 {d2[1]}, [r4], r9 -+ cmp r12, #p_size -+ vld1.16 {d2[2]}, [r8], r9 -+ vld1.16 {d2[3]}, [r4], r9 -+ blt 2f -+ vld1.16 {d3[0]}, [r8], r9 -+ vld1.16 {d3[1]}, [r4], r9 -+ vld1.16 {d3[2]}, [r8] -+ vld1.16 {d3[3]}, [r4] -+ b 1f ++ ldr lr, [sp, #dl_size] ++ bcc 2f ++ vld1.16 {d3[0]}, [r8], r12 ++ vld1.16 {d1[1]}, [r5], r12 ++ cmp lr, #p_size ++ vld1.16 {d3[1]}, [r8], r12 ++ bcc 10f ++ vld1.16 {d1[2]}, [r5], r12 ++ vld1.16 {d3[2]}, [r8], r12 ++ vld1.16 {d1[3]}, [r5] ++ vld1.16 {d3[3]}, [r8] +2: -+ vdup.16 d3, d2[3] -+1: -+ tst r2, #FILTER_LIGHT -+ add r12, r0, #-pw 
-+ beq 10f ++ vext.16 q3, q8, q9, #7 ++ vext.16 q10, q2, q8, #7 ++ tst r2, #FILTER_LIGHT ++ vzip.16 q0, q1 ++ beq 3f + + @ Luma light filter -+ vext.16 q9, q2, q3, #7 -+ vext.16 q8, q15, q2, #7 -+ vext.16 q13, q0, q1, #7 -+ vext.16 q12, q15, q0, #7 -+ vadd.u16 q9, q3 -+ vadd.u16 q8, q2 -+ vadd.u16 q13, q1 -+ vadd.u16 q12, q0 -+ vmov.u16 r3, d7[3] @ Save final pel -+ vmov.u16 r2, d3[3] @ Save final pel -+ -+ vext.16 q2, q8, q9, #1 -+ vext.16 q3, q9, q9, #1 -+ vext.16 q0, q12, q13, #1 -+ vext.16 q1, q13, q13, #1 -+ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0] -+ vadd.u16 q2, q8 -+ vadd.u16 q3, q9 -+ vadd.u16 q0, q12 -+ vadd.u16 q1, q13 -+ -+ vrshr.u16 q2, #2 -+ vrshr.u16 q3, #2 -+ vrshr.u16 q0, #2 -+ vrshr.u16 q1, #2 -+ vrshr.u16 d30, #2 -+ vmov.u16 d7[3], r3 @ Restore final pel -+ vmov.u16 d3[3], r2 @ Restore final pel -+ vdup.u16 d31, d30[0] @ d31[3] = d30[0] ++ vadd.i16 q3, q9 ++ vext.16 q11, q0, q1, #7 ++ vext.16 q2, q2, q0, #7 ++ vadd.i16 q8, q10 ++ vadd.i16 q10, q11, q1 ++ vadd.i16 q0, q2 ++ vext.16 q11, q3, q3, #1 ++ vadd.i16 d4, d16, d0 @ d4[0] = l[0] + 2ul + u[0] ++ vmov.u16 r4, d19[3] @ Save final pel ++ vext.16 q9, q10, q10, #1 ++ vext.16 q12, q8, q3, #1 ++ vext.16 q13, q0, q10, #1 ++ vadd.i16 q3, q11 ++ vadd.i16 q10, q9 ++ vadd.i16 q8, q12 ++ vadd.i16 q0, q13 ++ vmov.u16 r5, d3[3] @ Save final pel ++ vrshr.u16 d4, d4, #2 ++ vrshr.u16 q9, q3, #2 ++ vrshr.u16 q1, q10, #2 ++ vrshr.u16 q8, #2 ++ vrshr.u16 q0, #2 ++ vmov.16 d19[3], r4 @ Restore final pel ++ vmov.16 d3[3], r5 @ Restore final pel ++ vdup.16 d5, d4[0] ++3: ++ vst1.16 {d5[3]}, [r0]! 
++ vst1.16 {q8-q9}, [r1] ++ vst1.16 {q0-q1}, [r0] ++ pop {r4-r10, pc} + +10: -+ vst1.16 {q2, q3}, [r1] @ Up -+ vst1.16 {d31[3]}, [r12] @ Up-left -+ vst1.16 {q0, q1}, [r0] @ Left -+ pop {r4-r10, pc} ++A ldrh r9, [r8, -r12] ++T sub r9, r8, r12 ++T ldrh r9, [r9] ++ orr r9, r9, r9, lsl #16 ++ vmov.32 d1[1], r9 ++ vmov.32 d3[1], r9 ++ b 2b +endfunc + +@ int ff_hevc_rpi_intra_filter_16_neon_16( @@ -12286,152 +12823,163 @@ index 0000000000..6ce3d3ca8d +.set p_size, (1 << log2_s) @ size in pels + +function ff_hevc_rpi_intra_filter_16_neon_16, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d16[],d17[]", "d20[],d21[]" -+ -+ vdup.16 q9, d16[0] -+ vdup.16 q11, d20[0] -+ -+ it cs -+ vldmcs r6, {d16-d19} -+ ldr r12, [sp, #ur_size] -+ bpl 1f -+ cmp r12, #12 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 16, "d4[],d5[]", d17[3], "d18[],d19[]", "d22[],d23[]", 1, \ ++ "ldr r9, [sp, #ur_size]", \ ++ "sub r0, #pw" ++ ++ vmov q10, q9 ++ ldr lr, [sp, #dl_size] ++ vmov q12, q11 ++ it cs ++ vldmcs r7, {q9-q10} + @ Given chroma frame layout, if UR exists then it is always legit to + @ load all of it even if most of it is outside the frame. 
-+ vldm r5, {d20-d23} -+ bgt 1f -+ bge 4f -+ cmp r5, #8 -+ bge 3f -+ vdup.16 d21, d20[3] -+3: vdup.16 d22, d21[3] -+4: vdup.16 d23, d22[3] -+ -+1: -+ lsls r7, #AVAIL_S_L_N_DL_C -+ ldr r12, [sp, #dl_size] -+ vdup.16 q1, d0[0] -+ vdup.16 q2, d0[0] -+ vdup.16 q3, d0[0] -+ bpl 1f -+ vld1.16 {d0[0]}, [r10], r9 -+ vld1.16 {d0[1]}, [r3], r9 -+ vld1.16 {d0[2]}, [r10], r9 -+ vld1.16 {d0[3]}, [r3], r9 -+ vld1.16 {d1[0]}, [r10], r9 -+ vld1.16 {d1[1]}, [r3], r9 -+ vld1.16 {d1[2]}, [r10], r9 -+ vld1.16 {d1[3]}, [r3], r9 -+ vld1.16 {d2[0]}, [r10], r9 -+ vld1.16 {d2[1]}, [r3], r9 -+ vld1.16 {d2[2]}, [r10], r9 -+ vld1.16 {d2[3]}, [r3], r9 -+ vld1.16 {d3[0]}, [r10], r9 -+ vld1.16 {d3[1]}, [r3], r9 -+ vld1.16 {d3[2]}, [r10] -+ vld1.16 {d3[3]}, [r3] ++ itt mi ++ vldmmi r4, {q11-q12} ++ cmpmi r9, #p_size ++ bmi 10f +1: -+ bcc 1f -+ vld1.16 {d4[1]}, [r4], r9 -+ cmp r12, #4 -+ vld1.16 {d4[2]}, [r8], r9 -+ vld1.16 {d4[3]}, [r4], r9 -+ ble 2f -+ vld1.16 {d5[0]}, [r8], r9 -+ vld1.16 {d5[1]}, [r4], r9 -+ cmp r12, #12 -+ vld1.16 {d5[2]}, [r8], r9 -+ vld1.16 {d5[3]}, [r4], r9 -+ blt 3f -+ vld1.16 {d6[0]}, [r8], r9 -+ vld1.16 {d6[1]}, [r4], r9 -+ vld1.16 {d6[2]}, [r8], r9 -+ vld1.16 {d6[3]}, [r4], r9 -+ ble 4f -+ vld1.16 {d7[0]}, [r8], r9 -+ vld1.16 {d7[1]}, [r4], r9 -+ vld1.16 {d7[2]}, [r8] -+ vld1.16 {d7[3]}, [r4] -+ b 1f -+2: vdup.16 d5, d4[3] -+3: vdup.16 d6, d5[3] -+4: vdup.16 d7, d6[3] -+1: -+ tst r2, #FILTER_LIGHT -+ add r12, r0, #-pw -+ beq 10f -+ -+ vpush {q5} -+ @ Luma light filter -+ @ Left -+ vext.16 q5, q2, q3, #7 -+ vext.16 q14, q1, q2, #7 -+ vext.16 q13, q0, q1, #7 -+ vext.16 q12, q15, q0, #7 -+ -+ vadd.u16 q5, q3 -+ vadd.u16 q14, q2 -+ vadd.u16 q13, q1 -+ vadd.u16 q12, q0 -+ vmov.u16 r2, d7[3] @ Save final pel -+ -+ vext.16 q0, q12, q13, #1 -+ vext.16 q1, q13, q14, #1 -+ vext.16 q2, q14, q5, #1 -+ vext.16 q3, q5, q5, #1 -+ -+ vmov d30, d24 @ d30[0] = l[0] + ul -+ vadd.u16 q0, q12 -+ vadd.u16 q1, q13 -+ vadd.u16 q2, q14 -+ vadd.u16 q3, q5 -+ -+ vrshr.u16 q0, #2 -+ vrshr.u16 q1, 
#2 -+ vrshr.u16 q2, #2 -+ vrshr.u16 q3, #2 -+ -+ @ Up -+ vext.16 q5, q10, q11, #7 -+ vext.16 q14, q9, q10, #7 -+ vext.16 q13, q8, q9, #7 -+ vext.16 q12, q15, q8, #7 -+ -+ vadd.u16 q5, q11 -+ vadd.u16 q14, q10 -+ vadd.u16 q13, q9 -+ vadd.u16 q12, q8 -+ vmov.u16 r3, d23[3] @ Save final pel -+ -+ vext.16 q8, q12, q13, #1 -+ vext.16 q9, q13, q14, #1 -+ vext.16 q10, q14, q5, #1 -+ vext.16 q11, q5, q5, #1 -+ -+ vadd.u16 d30, d24 @ d30[0] = l[0] + 2ul + u[0] -+ vadd.u16 q8, q12 -+ vadd.u16 q9, q13 -+ vadd.u16 q10, q14 -+ vadd.u16 q11, q5 -+ -+ vrshr.u16 q8, #2 -+ vrshr.u16 q9, #2 ++ lsls r3, #AVAIL_S_L_N_DL_C ++ bpl 20f ++ vld1.16 {d0[0]}, [r6], r12 ++ vld1.16 {d2[0]}, [r10], r12 ++ vld1.16 {d0[1]}, [r6], r12 ++ vld1.16 {d2[1]}, [r10], r12 ++ vld1.16 {d0[2]}, [r6], r12 ++ vld1.16 {d2[2]}, [r10], r12 ++ vld1.16 {d0[3]}, [r6], r12 ++ vld1.16 {d2[3]}, [r10], r12 ++ vld1.16 {d1[0]}, [r6], r12 ++ vld1.16 {d3[0]}, [r10], r12 ++ vld1.16 {d1[1]}, [r6], r12 ++ vld1.16 {d3[1]}, [r10], r12 ++ vld1.16 {d1[2]}, [r6], r12 ++ vld1.16 {d3[2]}, [r10], r12 ++ vld1.16 {d1[3]}, [r6] ++ vld1.16 {d3[3]}, [r10] ++2: bcc 30f ++ vld1.16 {d6[0]}, [r8], r12 ++ vld1.16 {d4[1]}, [r5], r12 ++ cmp lr, #p_size ++ vld1.16 {d6[1]}, [r8], r12 ++ bcc 40f ++ vld1.16 {d4[2]}, [r5], r12 ++ vld1.16 {d6[2]}, [r8], r12 ++ vld1.16 {d4[3]}, [r5], r12 ++ vld1.16 {d6[3]}, [r8], r12 ++ vld1.16 {d5[0]}, [r5], r12 ++ vld1.16 {d7[0]}, [r8], r12 ++ vld1.16 {d5[1]}, [r5], r12 ++ vld1.16 {d7[1]}, [r8], r12 ++ vld1.16 {d5[2]}, [r5], r12 ++ vld1.16 {d7[2]}, [r8], r12 ++ vld1.16 {d5[3]}, [r5] ++ vld1.16 {d7[3]}, [r8] ++3: ++ vzip.16 q0, q1 ++ tst r2, #FILTER_LIGHT ++ vzip.16 q2, q3 ++ beq 4f ++ ++ vext.16 q13, q8, q0, #7 ++ vadd.i16 q13, q0 ++ vext.16 q0, q0, q1, #7 ++ vadd.i16 q0, q1 ++ vext.16 q1, q1, q2, #7 ++ vadd.i16 q1, q2 ++ vext.16 q2, q2, q3, #7 ++ vadd.i16 q2, q3 ++ vext.16 q14, q8, q9, #7 ++ vadd.i16 q14, q9 ++ vext.16 q9, q9, q10, #7 ++ vadd.i16 q9, q10 ++ vext.16 q10, q10, q11, #7 ++ vadd.i16 q10, q11 ++ vext.16 
q11, q11, q12, #7 ++ vadd.i16 q11, q12 ++ vadd.i16 d17, d26, d28 @ d17[0] = l[0] + 2ul + u[0] ++ vmov.u16 r4, d7[3] @ Save final pel ++ vext.16 q3, q2, q2, #1 ++ vadd.i16 q3, q2 ++ vext.16 q2, q1, q2, #1 ++ vadd.i16 q2, q1 ++ vext.16 q1, q0, q1, #1 ++ vadd.i16 q1, q0 ++ vext.16 q0, q13, q0, #1 ++ vadd.i16 q0, q13 ++ vext.16 q13, q11, q11, #1 ++ vadd.i16 q13, q11 ++ vext.16 q11, q10, q11, #1 ++ vadd.i16 q11, q10 ++ vext.16 q10, q9, q10, #1 ++ vadd.i16 q10, q9 ++ vext.16 q9, q14, q9, #1 ++ vadd.i16 q9, q14 ++ vrshr.u16 d17, #2 ++ vmov.u16 r5, d25[3] @ Save final pel ++ vrshr.u16 q3, #2 ++ vrshr.u16 q12, q13, #2 ++ vrshr.u16 q0, #2 ++ vrshr.u16 q1, #2 ++ vrshr.u16 q2, #2 ++ vrshr.u16 q9, #2 + vrshr.u16 q10, #2 + vrshr.u16 q11, #2 ++ vdup.16 d17, d17[0] ++ vmov.16 d7[3], r4 @ Restore final pel ++ vmov.16 d25[3], r5 @ Restore final pel ++4: ++ vst1.16 {d17[3]}, [r0]! ++ vst1.16 {q9-q10}, [r1]! ++ vst1.16 {q0-q1}, [r0]! ++ vst1.16 {q11-q12}, [r1] ++ vst1.16 {q2-q3}, [r0] ++ pop {r4-r10, pc} + -+ @ Misc -+ vrshr.u16 d30, #2 -+ vmov.u16 d7[3], r2 @ Restore final pel -+ vmov.u16 d23[3], r3 @ Restore final pel -+ vdup.u16 d31, d30[0] @ d31[3] = d30[0] -+ vpop {q5} -+ -+10: -+ vstm r1, {d16-d23} @ Up -+ vst1.16 {d31[3]}, [r12] @ Up-left -+ vstm r0, { d0-d7 } @ Left -+ pop {r4-r10, pc} ++10: cmp r9, #8 ++ bhi 12f ++ beq 11f ++ vdup.16 d21, d20[3] ++11: vdup.16 d22, d21[3] ++12: vdup.16 d23, d22[3] ++ b 1b ++ ++20: vmov q0, q2 ++ vmov q1, q2 ++ b 2b ++ ++30: vmov q3, q2 ++ b 3b ++ ++40: cmp lr, #8 ++ bhi 42f ++ beq 41f ++ vdup.16 d5, d6[1] ++ vdup.16 d7, d6[1] ++ vmov.f32 s9, s10 ++ vmov.f32 s13, s10 ++ b 3b ++41: vld1.16 {d4[2]}, [r5], r12 ++ vld1.16 {d6[2]}, [r8], r12 ++ vld1.16 {d4[3]}, [r5] ++ vld1.16 {d6[3]}, [r8] ++ vdup.16 d5, d6[3] ++ vdup.16 d7, d6[3] ++ b 3b ++42: vld1.16 {d4[2]}, [r5], r12 ++ vld1.16 {d6[2]}, [r8], r12 ++ vld1.16 {d4[3]}, [r5], r12 ++ vld1.16 {d6[3]}, [r8], r12 ++ vld1.16 {d5[0]}, [r5], r12 ++ ldrh lr, [r8, r12] ++ vld1.16 {d7[0]}, [r8], r12 ++ 
vld1.16 {d5[1]}, [r5] ++ vld1.16 {d7[1]}, [r8] ++ orr lr, lr, lr, lsl #16 ++ vmov s11, lr ++ vmov s15, lr ++ b 3b +endfunc + +@ int ff_hevc_rpi_intra_filter_4_neon_32( @@ -12452,31 +13000,31 @@ index 0000000000..6ce3d3ca8d +.set log2_s, 2 + +function ff_hevc_rpi_intra_filter_4_neon_32, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d4[],d5[]", "d6[],d7[]" -+ -+ it cs -+ vldmcs r6, {d4, d5} -+ it mi -+ vldmmi r5, {d6, d7} -+ lsls r7, #AVAIL_S_L_N_DL_C -+ vdup.32 q1, d0[0] -+ add r12, r0, #-pw -+ bpl 1f -+ vld1.32 {d0[0]}, [r10], r9 -+ vld1.32 {d0[1]}, [r3], r9 -+ vld1.32 {d1[0]}, [r10] -+ vld1.32 {d1[1]}, [r3] ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d16[0], "d4[],d5[]", "d6[],d7[]", 0, \ ++ "vmov q1, q0" ++ ++ sub r3, r0, #pw ++ it mi ++ vldmmi r4, {d6, d7} ++ it cs ++ vldmcs r7, {d4, d5} ++ lsls lr, r2, #AVAIL_S_L_N_DL_C ++ bpl 1f ++ vld1.32 {d0[0]}, [r6], r12 ++ vld1.32 {d0[1]}, [r10], r12 ++ vld1.32 {d1[0]}, [r6] ++ vld1.32 {d1[1]}, [r10] +1: -+ bcc 1f -+ vld1.32 {d2[1]}, [r4], r9 -+ vld1.32 {d3[0]}, [r8] -+ vld1.32 {d3[1]}, [r4] ++ bcc 1f ++ vld1.32 {d2[1]}, [r8], r12 ++ vld1.32 {d3[0]}, [r5] ++ vld1.32 {d3[1]}, [r8] +1: -+ vst1.32 {q2, q3 }, [r1] @ Up -+ vst1.32 {d31[1]}, [r12] -+ vst1.32 {q0, q1 }, [r0] @ Left -+ pop {r4-r10, pc} ++ vst1.32 {d16[0]}, [r3] ++ vst1.32 {q2, q3}, [r1] ++ vst1.32 {q0, q1}, [r0] ++ pop {r4-r10, pc} +endfunc + + @@ -12502,54 +13050,57 @@ index 0000000000..6ce3d3ca8d + +function ff_hevc_rpi_intra_filter_8_neon_32, export=1 + push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]" -+ -+ vdup.32 q9, d16[0] -+ vdup.32 q11, d20[0] -+ -+ it cs -+ vldmcs r6, {q8, q9 } -+ ldr r12, [sp, #ur_size] ++ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]", 0, \ ++ "vmov r3, s0" ++ ++ vmov q9, q8 ++ ldr r9, [r4, #3*4] ++ vmov q11, q10 ++ ldr lr, [sp, #ur_size] ++ it cs 
++ vldmcs r7, {q8, q9} ++ ittt mi ++ vldmmi r4, {q10, q11} ++ cmpmi lr, #p_size ++ vdupmi.32 q11, r9 ++ lsls lr, r2, #AVAIL_S_L_N_DL_C ++ vdup.32 q1, r3 ++ vdup.32 q2, r3 ++ vdup.32 q3, r3 ++ it cs ++ ldrcs r9, [r8, r12] + bpl 1f -+ cmp r12, #p_size -+ vldm r5, {q10, q11} -+ bge 1f -+ vdup.32 q11, d21[1] -+1: -+ lsls r7, #AVAIL_S_L_N_DL_C -+ vdup.32 q1, d0[0] -+ vdup.32 q2, d0[0] -+ vdup.32 q3, d0[0] -+ bpl 1f -+ vld1.32 {d0[0]}, [r10], r9 -+ vld1.32 {d0[1]}, [r3], r9 -+ vld1.32 {d1[0]}, [r10], r9 -+ vld1.32 {d1[1]}, [r3], r9 -+ vld1.32 {d2[0]}, [r10], r9 -+ vld1.32 {d2[1]}, [r3], r9 -+ vld1.32 {d3[0]}, [r10] -+ vld1.32 {d3[1]}, [r3] ++ vld1.32 {d0[0]}, [r6], r12 ++ vld1.32 {d0[1]}, [r10], r12 ++ vld1.32 {d1[0]}, [r6], r12 ++ vld1.32 {d1[1]}, [r10], r12 ++ vld1.32 {d2[0]}, [r6], r12 ++ vld1.32 {d2[1]}, [r10], r12 ++ vld1.32 {d3[0]}, [r6] ++ vld1.32 {d3[1]}, [r10] +1: ++ ldr lr, [sp, #dl_size] ++ bcc 2f ++ vld1.32 {d4[1]}, [r8], r12 ++ vld1.32 {d5[0]}, [r5], r12 ++ cmp lr, #p_size ++ vld1.32 {d5[1]}, [r8], r12 + bcc 1f -+ ldr r12, [sp, #dl_size] -+ vld1.32 {d4[1]}, [r4], r9 -+ cmp r12, #p_size -+ vld1.32 {d5[0]}, [r8], r9 -+ vld1.32 {d5[1]}, [r4], r9 -+ blt 2f -+ vld1.32 {d6[0]}, [r8], r9 -+ vld1.32 {d6[1]}, [r4], r9 -+ vld1.32 {d7[0]}, [r8] -+ vld1.32 {d7[1]}, [r4] -+ b 1f -+2: -+ vdup.32 q3, d5[1] ++ vld1.32 {d6[0]}, [r5], r12 ++ vld1.32 {d6[1]}, [r8], r12 ++ vld1.32 {d7[0]}, [r5] ++ vld1.32 {d7[1]}, [r8] +1: -+ add r12, r0, #-pw -+ vstm r1, { q8-q11} @ Up -+ vst1.32 {d31[1]}, [r12] -+ vstm r0, { q0-q3 } @ Left -+ pop {r4-r10, pc} ++ it cc ++ vdupcc.32 q3, r9 ++2: ++ vst1.32 {q8-q9}, [r1]! ++ sub r3, r0, #pw ++ vst1.32 {q0-q1}, [r0]! 
++ vst1.32 {q10-q11}, [r1] ++ vst1.32 {q2-q3}, [r0] ++ vst1.32 {d31[1]}, [r3] ++ pop {r4-r10, pc} +endfunc + + @@ -12574,116 +13125,131 @@ index 0000000000..6ce3d3ca8d +.set p_size, (1 << log2_s) @ size in pels + +function ff_hevc_rpi_intra_filter_16_neon_32, export=1 -+ push {r4-r10, lr} -+ load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1] ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1], 1, \ ++ "ldr r9, [sp, #ur_size]", \ ++ "sub r0, #pw" + + @ Once we get this big we have run out of neon regs to store + @ everything at once so do in pieces + -+ @ Up (have) -+ it cs -+ vldmcs r6, { q0-q3 } -+ ldr r12, [sp, #ur_size] -+ it mi -+ vldmmi r5, { q8-q11} -+ it cs -+ vstmcs r1, { q0-q3 } -+ bpl 1f -+ cmp r12, #12 -+ add lr, r1, #(pw << log2_s) -+ bgt 2f -+ cmp r12, #8 -+ bge 3f -+ vdup.16 q9, d17[1] -+4: vdup.16 d10, d19[1] -+3: vdup.16 q11, d21[1] -+2: vstm lr, { q8-q11} ++ @ Up and/or up-right (have) ++ add lr, r1, #(pw << log2_s) ++ bcc 1f ++ vldm r7, {q0-q3} ++ vstm r1, {q0-q3} +1: ++ bpl 3f ++ vldm r4, {q8-q11} ++ cmp r9, #16 ++ blo 10f ++2: vstm lr, {q8-q11} ++3: ++ @ Up-left ++ vst1.32 {d30[1]}, [r0]! 
++ ++ @ Left and/or down-left (have) ++ lsls lr, r3, #AVAIL_S_L_N_DL_C ++ ldr r9, [sp, #dl_size] ++ bpl 4f ++ vld1.32 {d0[0]}, [r6], r12 ++ vld1.32 {d0[1]}, [r10], r12 ++ vld1.32 {d1[0]}, [r6], r12 ++ vld1.32 {d1[1]}, [r10], r12 ++ vld1.32 {d2[0]}, [r6], r12 ++ vld1.32 {d2[1]}, [r10], r12 ++ vld1.32 {d3[0]}, [r6], r12 ++ vld1.32 {d3[1]}, [r10], r12 ++ vld1.32 {d4[0]}, [r6], r12 ++ vld1.32 {d4[1]}, [r10], r12 ++ vld1.32 {d5[0]}, [r6], r12 ++ vld1.32 {d5[1]}, [r10], r12 ++ vld1.32 {d6[0]}, [r6], r12 ++ vld1.32 {d6[1]}, [r10], r12 ++ vld1.32 {d7[0]}, [r6] ++ vld1.32 {d7[1]}, [r10] ++ vstm r0, {q0-q3} ++4: add lr, r0, #(pw << log2_s) ++ bcc 6f ++ vdup.32 d16, d30[0] ++ vld1.32 {d16[1]}, [r8], r12 ++ vld1.32 {d17[0]}, [r5], r12 ++ cmp r9, #16 ++ vld1.32 {d17[1]}, [r8], r12 ++ blo 20f ++ vld1.32 {d18[0]}, [r5], r12 ++ vld1.32 {d18[1]}, [r8], r12 ++ vld1.32 {d19[0]}, [r5], r12 ++ vld1.32 {d19[1]}, [r8], r12 ++ vld1.32 {d20[0]}, [r5], r12 ++ vld1.32 {d20[1]}, [r8], r12 ++ vld1.32 {d21[0]}, [r5], r12 ++ vld1.32 {d21[1]}, [r8], r12 ++ vld1.32 {d22[0]}, [r5], r12 ++ vld1.32 {d22[1]}, [r8], r12 ++ vld1.32 {d23[0]}, [r5] ++ vld1.32 {d23[1]}, [r8] ++5: vstm lr, {q8-q11} ++6: ++ eors r3, r2 @ (req & avail) ^ req = (req & ~avail) ++ bne 7f ++ pop {r4-r10, pc} ++7: ++ @ Up and/or up-right (don't have) ++ vdup.32 q0, d31[0] ++ lsls lr, r3, #AVAIL_S_UR_N_U_C ++ vdup.32 q1, d31[0] ++ add lr, r1, #(pw << log2_s) ++ vdup.32 q8, d31[1] ++ vdup.32 q9, d31[1] ++ it cs ++ vstmcs r1!, {q0-q1} ++ it mi ++ vstmmi lr!, {q8-q9} ++ it cs ++ vstmcs r1, {q0-q1} ++ it mi ++ vstmmi lr, {q8-q9} + -+ @ Left (have) -+ add lr, r0, #-pw -+ lsls r12, r7, #AVAIL_S_L_N_DL_C -+ vst1.32 {d30[1]}, [lr] @ UL -+ bpl 1f -+ vld1.32 { d0[0]}, [r10], r9 -+ vld1.32 { d0[1]}, [r3], r9 -+ vld1.32 { d1[0]}, [r10], r9 -+ vld1.32 { d1[1]}, [r3], r9 -+ vld1.32 { d2[0]}, [r10], r9 -+ vld1.32 { d2[1]}, [r3], r9 -+ vld1.32 { d3[0]}, [r10], r9 -+ vld1.32 { d3[1]}, [r3], r9 -+ vld1.32 { d4[0]}, [r10], r9 -+ vld1.32 { d4[1]}, 
[r3], r9 -+ vld1.32 { d5[0]}, [r10], r9 -+ vld1.32 { d5[1]}, [r3], r9 -+ vld1.32 { d6[0]}, [r10], r9 -+ vld1.32 { d6[1]}, [r3], r9 -+ vld1.32 { d7[0]}, [r10] -+ vld1.32 { d7[1]}, [r3] -+ vstm r0, { q0-q3 } -+1: -+ bcc 1f -+ ldr r12, [sp, #dl_size] -+ vdup.32 d16, d30[0] @ d16[0] = d30[0] -+ add lr, r0, #(pw << log2_s) -+ vld1.32 {d16[1]}, [r4], r9 -+ cmp r12, #4 -+ vld1.32 {d17[0]}, [r8], r9 -+ vld1.32 {d17[1]}, [r4], r9 -+ ble 2f -+ vld1.32 {d18[0]}, [r8], r9 -+ vld1.32 {d18[1]}, [r4], r9 -+ cmp r12, #12 -+ vld1.32 {d19[0]}, [r8], r9 -+ vld1.32 {d19[1]}, [r4], r9 -+ blt 3f -+ vld1.32 {d20[0]}, [r8], r9 -+ vld1.32 {d20[1]}, [r4], r9 -+ vld1.32 {d21[0]}, [r8], r9 -+ vld1.32 {d21[1]}, [r4], r9 -+ ble 4f -+ vld1.32 {d22[0]}, [r8], r9 -+ vld1.32 {d22[1]}, [r4], r9 -+ vld1.32 {d23[0]}, [r8] -+ vld1.32 {d23[1]}, [r4] -+ b 5f -+2: vdup.32 q9, d17[1] -+3: vdup.32 q10, d19[1] -+4: vdup.32 q11, d21[1] -+5: vstm lr, { q8-q11} -+1: -+ eors r7, r2 -+ beq 99f -+ -+ lsls r12, r7, #AVAIL_S_UR_N_U_C -+ vdup.32 q0, d31[0] -+ vdup.32 q1, d31[0] -+ vdup.32 q2, d31[0] -+ vdup.32 q3, d31[0] -+ add lr, r1, #(pw << log2_s) -+ vdup.32 q8, d31[1] -+ vdup.32 q9, d31[1] -+ vdup.32 q10, d31[1] -+ vdup.32 q11, d31[1] -+ it cs -+ vstmcs r1, { q0-q3 } -+ it mi -+ vstmmi lr, { q8-q11} -+ -+ lsls r7, #AVAIL_S_L_N_DL_C -+ vdup.32 q0, d30[0] -+ vdup.32 q1, d30[0] -+ vdup.32 q2, d30[0] -+ vdup.32 q3, d30[0] -+ add lr, r0, #(pw << log2_s) -+ it mi -+ vstmmi r0, { q0-q3 } -+ it cs -+ vstmcs lr, { q0-q3 } ++ @ Left and/or down-left (don't have) ++ vdup.32 q0, d30[0] ++ lsls lr, r3, #AVAIL_S_L_N_DL_C ++ vdup.32 q1, d30[0] ++ add lr, r0, #(pw << log2_s) ++ it mi ++ vstmmi r0!, {q0-q1} ++ it cs ++ vstmcs lr!, {q0-q1} ++ it mi ++ vstmmi r0, {q0-q1} ++ it cs ++ vstmcs lr, {q0-q1} ++ pop {r4-r10, pc} + -+99: -+ pop {r4-r10, pc} ++10: cmp r9, #8 ++ bhi 12f ++ beq 11f ++ vdup.32 q9, d17[1] ++11: vdup.32 q10, d19[1] ++12: vdup.32 q11, d21[1] ++ b 2b ++ ++20: cmp r9, #8 ++ blo 21f ++ vld1.32 {d18[0]}, [r5], r12 ++ 
vld1.32 {d18[1]}, [r8], r12 ++ vld1.32 {d19[0]}, [r5], r12 ++ vld1.32 {d19[1]}, [r8], r12 ++ beq 22f ++ vld1.32 {d20[0]}, [r5], r12 ++ vld1.32 {d20[1]}, [r8], r12 ++ vld1.32 {d21[0]}, [r5] ++ vld1.32 {d21[1]}, [r8] ++ b 23f ++21: vdup.32 q9, d17[1] ++22: vdup.32 q10, d19[1] ++23: vdup.32 q11, d21[1] ++ b 5b +endfunc + + @@ -14836,10 +15402,10 @@ index d181b74570..c52c450956 100644 if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, diff --git a/libavcodec/rpi_hevc_cabac.c b/libavcodec/rpi_hevc_cabac.c new file mode 100644 -index 0000000000..79549c411a +index 0000000000..552c2e349e --- /dev/null +++ b/libavcodec/rpi_hevc_cabac.c -@@ -0,0 +1,2253 @@ +@@ -0,0 +1,2255 @@ +/* + * HEVC CABAC decoding + * @@ -16843,11 +17409,15 @@ index 0000000000..79549c411a + const int res = trans_scale_sat( + (levels[m] ^ k) - k, scale, dc_scale, shift); +#if RPI_COMPRESS_COEFFS -+ if (use_compress) -+ coeffs32[num_nonzero++] = (res<<16) + (blk_coeffs - coeffs); -+ else ++ if (use_compress) ++ { ++ coeffs32[num_nonzero++] = (res<<16) + (blk_coeffs - coeffs); ++ } ++ else +#endif -+ blk_coeffs[0] = res; ++ { ++ blk_coeffs[0] = res; ++ } + --m; + } + @@ -16957,7 +17527,7 @@ index 0000000000..79549c411a + +#if !USE_BY22 +// Stores results to lc -+void ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc) ++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc) +{ + int x = abs_mvd_greater0_flag_decode(lc); + int y = abs_mvd_greater0_flag_decode(lc); @@ -16968,28 +17538,26 @@ index 0000000000..79549c411a + y += abs_mvd_greater1_flag_decode(lc); + + switch (x) { -+ case 2: lc->pu.mvd.x = mvd_decode(lc); break; -+ case 1: lc->pu.mvd.x = mvd_sign_flag_decode(lc); break; -+ case 0: lc->pu.mvd.x = 0; break; ++ case 2: x = mvd_decode(lc); break; ++ case 1: x = mvd_sign_flag_decode(lc); break; ++ case 0: x = 0; break; + } + + switch (y) { -+ case 2: lc->pu.mvd.y = mvd_decode(lc); break; -+ case 1: lc->pu.mvd.y = mvd_sign_flag_decode(lc); break; -+ case 0: lc->pu.mvd.y = 0; 
break; ++ case 2: y = mvd_decode(lc); break; ++ case 1: y = mvd_sign_flag_decode(lc); break; ++ case 0: y = 0; break; + } ++ return MV_XY(x,y); +} +#else -+void ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc) ++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc) +{ + int x = abs_mvd_greater0_flag_decode(lc); + int y = abs_mvd_greater0_flag_decode(lc); + -+ lc->pu.mvd.x = 0; -+ lc->pu.mvd.y = 0; -+ + if ((x | y) == 0) -+ return; ++ return 0; + + if (x != 0) + x += abs_mvd_greater1_flag_decode(lc); @@ -17000,9 +17568,9 @@ index 0000000000..79549c411a + { + // Not worth starting BY22 + if (x != 0) -+ lc->pu.mvd.x = mvd_sign_flag_decode(lc); ++ x = mvd_sign_flag_decode(lc); + if (y != 0) -+ lc->pu.mvd.y = mvd_sign_flag_decode(lc); ++ y = mvd_sign_flag_decode(lc); + } + else + { @@ -17015,7 +17583,7 @@ index 0000000000..79549c411a + b = val = get_cabac_by22_peek(cc); + + if (x == 1) { -+ lc->pu.mvd.x = ((int32_t)b >> 31) | 1; ++ x = ((int32_t)b >> 31) | 1; + n = 1; + b <<= 1; + } @@ -17044,7 +17612,7 @@ index 0000000000..79549c411a + x = (b >> (32 - k)) + (1 << k); + b <<= k; + s = (int32_t)b >> 31; -+ lc->pu.mvd.x = (x ^ s) - s; ++ x = (x ^ s) - s; + b <<= 1; + + // Max abs value of an mv is 2^15 - 1 (i.e. 
a prefix len of 15 bits) @@ -17057,7 +17625,7 @@ index 0000000000..79549c411a + } + + if (y == 1) { -+ lc->pu.mvd.y = ((int32_t)b >> 31) | 1; ++ y = ((int32_t)b >> 31) | 1; + ++n; + // don't care about b anymore + } @@ -17082,7 +17650,7 @@ index 0000000000..79549c411a + + y = (b >> (32 - k)) + (1 << k); + s = (int32_t)(b << k) >> 31; -+ lc->pu.mvd.y = (y ^ s) - s; ++ y = (y ^ s) - s; + // don't care about b anymore + } + @@ -17090,12 +17658,12 @@ index 0000000000..79549c411a + bypass_finish(cc); + } + -+// printf("BY: X=%d,Y=%d\n", lc->pu.mvd.x, lc->pu.mvd.y); ++ return MV_XY(x, y); +} +#endif diff --git a/libavcodec/rpi_hevc_cabac_fns.h b/libavcodec/rpi_hevc_cabac_fns.h new file mode 100644 -index 0000000000..47c9c7029d +index 0000000000..a6587616ae --- /dev/null +++ b/libavcodec/rpi_hevc_cabac_fns.h @@ -0,0 +1,191 @@ @@ -17130,7 +17698,7 @@ index 0000000000..47c9c7029d + const int log2_trafo_size, const enum ScanType scan_idx, + const int c_idx); + -+void ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc); ++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc); +int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc); + +#define HEVC_BIN_SAO_MERGE_FLAG 0 @@ -17410,10 +17978,10 @@ index 0000000000..0aee673d8b +#endif /* AVCODEC_RPI_HEVC_DATA_H */ diff --git a/libavcodec/rpi_hevc_filter.c b/libavcodec/rpi_hevc_filter.c new file mode 100644 -index 0000000000..8e7695bcf9 +index 0000000000..dd5f65b5c4 --- /dev/null +++ b/libavcodec/rpi_hevc_filter.c -@@ -0,0 +1,1204 @@ +@@ -0,0 +1,1206 @@ +/* + * HEVC video decoder + * @@ -17447,7 +18015,6 @@ index 0000000000..8e7695bcf9 +#include "libavutil/common.h" +#include "libavutil/internal.h" + -+#include "cabac_functions.h" +#include "rpi_hevcdec.h" + +#include "bit_depth_template.c" @@ -17551,37 +18118,6 @@ index 0000000000..8e7695bcf9 + } +} + -+static void copy_vert(uint8_t *dst, const uint8_t *src, -+ int pixel_shift, int height, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src) -+{ -+ int i; 
-+ switch (pixel_shift) -+ { -+ case 2: -+ for (i = 0; i < height; i++) { -+ *(uint32_t *)dst = *(uint32_t *)src; -+ dst += stride_dst; -+ src += stride_src; -+ } -+ break; -+ case 1: -+ for (i = 0; i < height; i++) { -+ *(uint16_t *)dst = *(uint16_t *)src; -+ dst += stride_dst; -+ src += stride_src; -+ } -+ break; -+ default: -+ for (i = 0; i < height; i++) { -+ *dst = *src; -+ dst += stride_dst; -+ src += stride_src; -+ } -+ break; -+ } -+} -+ +static void copy_CTB_to_hv(const HEVCRpiContext * const s, const uint8_t * const src, + ptrdiff_t stride_src, int x, int y, int width, int height, + int c_idx, int x_ctb, int y_ctb) @@ -17597,9 +18133,9 @@ index 0000000000..8e7695bcf9 + src + stride_src * (height - 1), width << sh); + + /* copy vertical edges */ -+ copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src); ++ ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src); + -+ copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src); ++ ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src); +} + +// N.B. Src & dst are swapped as this is a restore! 
@@ -17629,14 +18165,7 @@ index 0000000000..8e7695bcf9 + const uint8_t * bs = dst1; + while (m != 0) { + if ((m & 1) != 0) { -+ unsigned int i; -+ uint8_t * d = bd; -+ const uint8_t * s = bs; -+ for (i = 0; i != bheight; ++i) { -+ memcpy(d, s, bwidth); -+ d += stride_src; -+ s += stride_dst; -+ } ++ s->hevcdsp.cpy_blk(bd, stride_src, bs, stride_dst, bwidth, bheight); + } + m >>= 1; + bs += bwidth; @@ -17866,22 +18395,22 @@ index 0000000000..8e7695bcf9 + } + if (src_l != NULL) { + if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { -+ copy_vert(dst - (1 << sh), ++ ff_hevc_rpi_copy_vert(dst - (1 << sh), + s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh), + sh, height, stride_dst, 1 << sh); + } else { -+ copy_vert(dst - (1 << sh), ++ ff_hevc_rpi_copy_vert(dst - (1 << sh), + src_l, + sh, height, stride_dst, stride_src); + } + } + if (src_r != NULL) { + if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { -+ copy_vert(dst + (width << sh), ++ ff_hevc_rpi_copy_vert(dst + (width << sh), + s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh), + sh, height, stride_dst, 1 << sh); + } else { -+ copy_vert(dst + (width << sh), ++ ff_hevc_rpi_copy_vert(dst + (width << sh), + src_r, + sh, height, stride_dst, stride_src); + } @@ -18001,19 +18530,59 @@ index 0000000000..8e7695bcf9 + +// Get block strength +// Given how we call we will always get within the 32bit boundries -+static inline uint32_t bs_get32(const uint8_t * bs, const unsigned int stride2, -+ const unsigned int xl, const unsigned int xr, const unsigned int y) ++static inline uint32_t bs_get32(const uint8_t * bs, unsigned int stride2, ++ unsigned int xl, unsigned int xr, const unsigned int y) +{ + if (xr <= xl) { + return 0; + } + else + { ++#if HAVE_ARMV6T2_INLINE ++#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0 ++#error This case not yet handled in bs_get32 ++#elif HEVC_RPI_BS_STRIDE1_BYTES < 4 ++#error Stride1 < return size 
++#endif ++ uint32_t tmp; ++ __asm__ ( ++ "lsr %[tmp], %[xl], %[xl_shift] \n\t" ++ "rsb %[xr], %[xl], %[xr] \n\t" ++ "mla %[stride2], %[stride2], %[tmp], %[bs] \n\t" ++ "add %[xr], %[xr], #7 \n\t" ++ "lsr %[bs], %[y], %[y_shift1] \n\t" ++ "bic %[xr], %[xr], #7 \n\t" ++ "ubfx %[xl], %[xl], #1, #5 \n\t" ++ "lsr %[xr], %[xr], #1 \n\t" ++ "cmp %[xr], #32 \n\t" ++ "mvn %[tmp], #0 \n\t" ++ "ldr %[bs], [%[stride2], %[bs], lsl %[y_shift2]] \n\t" ++ "lsl %[tmp], %[tmp], %[xr] \n\t" ++ "lsr %[xl], %[bs], %[xl] \n\t" ++ "it ne \n\t" ++ "bicne %[bs], %[xl], %[tmp] \n\t" ++ : // Outputs ++ [bs]"+r"(bs), ++ [stride2]"+r"(stride2), ++ [xl]"+r"(xl), ++ [xr]"+r"(xr), ++ [tmp]"=&r"(tmp) ++ : // Inputs ++ [y]"r"(y), ++ [xl_shift]"M"(HEVC_RPI_BS_STRIDE1_PEL_SHIFT), ++ [y_shift1]"M"(HEVC_RPI_BS_Y_SHR), ++ [y_shift2]"M"(HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) ++ : // Clobbers ++ "cc" ++ ); ++ return (uint32_t) bs; ++#else + const uint32_t a = *bs_ptr32(bs, stride2, xl, y); + const unsigned int n = ((xr - xl + 7) & ~7) >> 1; + + return n == 32 ? 
a : + (a >> ((xl >> 1) & 31)) & ~(~0U << n); ++#endif + } +} + @@ -18335,14 +18904,15 @@ index 0000000000..8e7695bcf9 + +static inline uint32_t bsf_mv(const HEVCRpiContext * const s, + const unsigned int rep, const unsigned int dup, -+ const unsigned int mvf_stride, ++ const unsigned int mvf_stride0, ++ const unsigned int mvf_stride1, + const RefPicList * const rpl_p, const RefPicList * const rpl_q, -+ const MvField * const mvf_p, const MvField * const mvf_q) ++ const HEVCRpiMvField * const mvf_p, const HEVCRpiMvField * const mvf_q) +{ + return s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup, + mvf_p, mvf_q, + rpl_p[0].list, rpl_p[1].list, rpl_q[0].list, rpl_q[1].list, -+ sizeof(MvField) * mvf_stride); ++ sizeof(HEVCRpiMvField) * mvf_stride0, sizeof(HEVCRpiMvField) * mvf_stride1); +} + + @@ -18352,13 +18922,11 @@ index 0000000000..8e7695bcf9 + const unsigned int log2_trafo_size, + const int is_coded_block) +{ -+ const MvField * const tab_mvf = s->ref->tab_mvf; -+ const unsigned int log2_min_pu_size = s->ps.sps->log2_min_pu_size; -+ const unsigned int mvf_stride = s->ps.sps->min_pu_width; // width in pus; mvf stride -+ const RefPicList * const rpl = s->ref->refPicList; ++ const HEVCRpiMvField * const mvf_curr = mvf_stash_ptr(s, lc, x0, y0); ++ const unsigned int log2_min_pu_size = LOG2_MIN_PU_SIZE; ++ const RefPicList * const rpl = s->refPicList; + // Rep count for bsf_mv when running with min_pu chuncks + const unsigned int log2_rep_min_pu = log2_trafo_size <= log2_min_pu_size ? 0 : log2_trafo_size - log2_min_pu_size; -+ const MvField * const mvf_curr = tab_mvf + (y0 >> log2_min_pu_size) * mvf_stride + (x0 >> log2_min_pu_size); + const unsigned int boundary_flags = s->sh.no_dblk_boundary_flags & lc->boundary_flags; + const unsigned int trafo_size = (1U << log2_trafo_size); + const uint32_t bsf_mask = log2_trafo_size > 5 ? 
~0U : (1U << (trafo_size >> 1)) - 1; @@ -18451,14 +19019,15 @@ index 0000000000..8e7695bcf9 + // If we aren't on the top boundary we must be in the middle + // and in that case we know where mvf can change + const unsigned int log2_rep = (y0 == lc->cu.y) ? log2_rep_min_pu : has_x_split ? 1 : 0; -+ const RefPicList *const rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ? -+ ff_hevc_rpi_get_ref_list(s, s->ref, x0, y0 - 1) : -+ rpl; ++ const RefPicList *const rpl_top = !off_boundary(y0, s->ps.sps->log2_ctb_size) ? ++ s->rpl_up[x0 >> s->ps.sps->log2_ctb_size] : ++ rpl; + + bsf_h |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), + trafo_size >> (log2_min_pu_size + log2_rep), ++ trafo_size >> (log2_min_pu_size + log2_rep), + rpl, rpl_top, -+ mvf_curr, mvf_curr - mvf_stride); ++ mvf_curr, mvf_ptr(s, lc, x0, y0, x0, y0 - 1)); + } + + // Finally put the results into bs @@ -18468,16 +19037,16 @@ index 0000000000..8e7695bcf9 + // Max of 1 pu internal split - ignore if not on 8pel boundary + if (has_y_split && !off_boundary(lc->cu.y_split, 3)) + { -+ const MvField * const mvf = tab_mvf + -+ (lc->cu.y_split >> log2_min_pu_size) * mvf_stride + (x0 >> log2_min_pu_size); ++ const HEVCRpiMvField * const mvf = mvf_stash_ptr(s, lc, x0, lc->cu.y_split); + // If we have the x split as well then it must be in the middle + const unsigned int log2_rep = has_x_split ? 1 : 0; + + hbs_set(s, x0, lc->cu.y_split, bsf_mask, + bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), + trafo_size >> (log2_min_pu_size + log2_rep), ++ trafo_size >> (log2_min_pu_size + log2_rep), + rpl, rpl, -+ mvf, mvf - mvf_stride)); ++ mvf, mvf - MVF_STASH_WIDTH_PU)); + } + } + @@ -18492,14 +19061,15 @@ index 0000000000..8e7695bcf9 + if ((~bsf_v & bsf_cbf) != 0 && (x0 == lc->cu.x || x0 == lc->cu.x_split)) + { + const unsigned int log2_rep = (x0 == lc->cu.x) ? log2_rep_min_pu : has_y_split ? 1 : 0; -+ const RefPicList *const rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ? 
-+ ff_hevc_rpi_get_ref_list(s, s->ref, x0 - 1, y0) : -+ rpl; ++ const RefPicList *const rpl_left = !off_boundary(x0, s->ps.sps->log2_ctb_size) ? ++ s->rpl_left[y0 >> s->ps.sps->log2_ctb_size] : ++ rpl; + + bsf_v |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), -+ (mvf_stride << log2_trafo_size) >> (log2_min_pu_size + log2_rep), ++ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep), ++ (mvf_left_stride(s, x0, x0 - 1) << log2_trafo_size) >> (log2_min_pu_size + log2_rep), + rpl, rpl_left, -+ mvf_curr, mvf_curr - 1); ++ mvf_curr, mvf_ptr(s, lc, x0, y0, x0 - 1, y0)); + } + + vbs_set(s, x0, y0, bsf_mask, bsf_v); @@ -18507,13 +19077,13 @@ index 0000000000..8e7695bcf9 + + if (has_x_split && !off_boundary(lc->cu.x_split, 3)) + { -+ const MvField * const mvf = tab_mvf + -+ (y0 >> log2_min_pu_size) * mvf_stride + (lc->cu.x_split >> log2_min_pu_size); ++ const HEVCRpiMvField *const mvf = mvf_stash_ptr(s, lc, lc->cu.x_split, y0); + const unsigned int log2_rep = has_y_split ? 
1 : 0; + + vbs_set(s, lc->cu.x_split, y0, bsf_mask, + bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), -+ (mvf_stride << log2_trafo_size) >> (log2_min_pu_size + log2_rep), ++ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep), ++ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep), + rpl, rpl, + mvf, mvf - 1)); + } @@ -18618,12 +19188,89 @@ index 0000000000..8e7695bcf9 + return y; +} + +diff --git a/libavcodec/rpi_hevc_mv.h b/libavcodec/rpi_hevc_mv.h +new file mode 100644 +index 0000000000..6b36f5e737 +--- /dev/null ++++ b/libavcodec/rpi_hevc_mv.h +@@ -0,0 +1,71 @@ ++#ifndef AVCODEC_RPI_HEVC_MV_H ++#define AVCODEC_RPI_HEVC_MV_H ++ ++#include "config.h" ++ ++typedef int32_t MvXY; ++ ++typedef struct HEVCRpiMvField { ++ MvXY xy[2]; ++ int8_t ref_idx[2]; ++ int8_t pred_flag; ++ int8_t dummy; // To 12 bytes ++} HEVCRpiMvField; ++ ++ ++#define MV_X(xy) (((xy) << 16) >> 16) ++#define MV_Y(xy) ((xy) >> 16) ++#define MV_XY(x, y) ((x & 0xffff) | ((y) << 16)) ++ ++#if ARCH_ARM ++#include "arm/rpi_hevc_mv_arm.h" ++#endif ++ ++#ifndef mvxy_add ++static inline MvXY mvxy_add(const MvXY a, const MvXY b) ++{ ++ return MV_XY(MV_X(a) + MV_X(b), MV_Y(a) + MV_Y(b)); ++} ++#endif ++ ++ ++#ifndef mv_scale_xy ++static inline MvXY mv_scale_xy(const MvXY const src, int td, int tb) ++{ ++ int tx, scale_factor; ++ ++ td = td == 0 ? 
1 : av_clip_int8(td); ++ tb = av_clip_int8(tb); ++ tx = (0x4000 + (abs(td) >> 1)) / td; ++ scale_factor = av_clip_intp2((tb * tx + 32) >> 6, 12); ++ return MV_XY( ++ av_clip_int16((scale_factor * MV_X(src) + 127 + ++ (scale_factor * MV_X(src) < 0)) >> 8), ++ av_clip_int16((scale_factor * MV_Y(src) + 127 + ++ (scale_factor * MV_Y(src) < 0)) >> 8)); ++} ++#endif ++ ++// 8.3.1 states that the bitstream may not contain poc diffs that do not ++// fit in 16 bits, so given that we don't care about the high bits we only ++// store the low 16 + LT & Inter flags ++ ++#define COL_POC_INTRA 0 ++#define COL_POC_INTER (1 << 16) ++#define COL_POC_LT (1 << 17) ++#define COL_POC_DIFF(x,y) ((int16_t)((x) - (y))) ++#define COL_POC_MAKE_INTER(lt,poc) (COL_POC_INTER | ((lt) ? COL_POC_LT : 0) | ((poc) & 0xffff)) ++#define COL_POC_IS_LT(x) (((x) & COL_POC_LT) != 0) ++ ++typedef struct ColMv_s { ++ int32_t poc; ++ int32_t xy; ++} ColMv; ++ ++typedef struct ColMvField_s { ++ ColMv L[2]; ++} ColMvField; ++ ++ ++ ++#endif // AVCODEC_RPI_HEVC_MV_H diff --git a/libavcodec/rpi_hevc_mvs.c b/libavcodec/rpi_hevc_mvs.c new file mode 100644 -index 0000000000..163e2558dc +index 0000000000..221755fb6e --- /dev/null +++ b/libavcodec/rpi_hevc_mvs.c -@@ -0,0 +1,681 @@ +@@ -0,0 +1,486 @@ +/* + * HEVC video decoder + * @@ -18650,152 +19297,44 @@ index 0000000000..163e2558dc +#include "hevc.h" +#include "rpi_hevcdec.h" + -+static const uint8_t l0_l1_cand_idx[12][2] = { -+ { 0, 1, }, -+ { 1, 0, }, -+ { 0, 2, }, -+ { 2, 0, }, -+ { 1, 2, }, -+ { 2, 1, }, -+ { 0, 3, }, -+ { 3, 0, }, -+ { 1, 3, }, -+ { 3, 1, }, -+ { 2, 3, }, -+ { 3, 2, }, -+}; -+ -+ -+//check if the two luma locations belong to the same motion estimation region -+static av_always_inline int is_diff_mer(const HEVCRpiContext * const s, int xN, int yN, int xP, int yP) ++static av_always_inline int ++is_eq_mer(const unsigned int plevel, ++ const unsigned int xN, const unsigned int yN, ++ const unsigned int xP, const unsigned int yP) +{ -+ uint8_t 
plevel = s->ps.pps->log2_parallel_merge_level; -+ -+ return xN >> plevel == xP >> plevel && -+ yN >> plevel == yP >> plevel; ++ return (((xN ^ xP) | (yN ^ yP)) >> plevel) == 0; +} + -+#define MATCH_MV(x) (AV_RN32A(&A.x) == AV_RN32A(&B.x)) -+#define MATCH(x) (A.x == B.x) -+ +// check if the mv's and refidx are the same between A and B -+static av_always_inline int compare_mv_ref_idx(const struct MvField A, const struct MvField B) ++static av_always_inline int compare_mv_ref_idx(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b) +{ -+ int a_pf = A.pred_flag; -+ int b_pf = B.pred_flag; -+ if (a_pf == b_pf) { -+ if (a_pf == PF_BI) { -+ return MATCH(ref_idx[0]) && MATCH_MV(mv[0]) && -+ MATCH(ref_idx[1]) && MATCH_MV(mv[1]); -+ } else if (a_pf == PF_L0) { -+ return MATCH(ref_idx[0]) && MATCH_MV(mv[0]); -+ } else if (a_pf == PF_L1) { -+ return MATCH(ref_idx[1]) && MATCH_MV(mv[1]); -+ } -+ } ++ return a->pred_flag == b->pred_flag && ++ ((a->pred_flag & PF_L0) == 0 || (a->ref_idx[0] == b->ref_idx[0] && a->xy[0] == b->xy[0])) && ++ ((a->pred_flag & PF_L1) == 0 || (a->ref_idx[1] == b->ref_idx[1] && a->xy[1] == b->xy[1])); + return 0; +} + -+static av_always_inline void mv_scale(Mv * const dst, const Mv * const src, int td, int tb) -+{ -+ int tx, scale_factor; -+ -+ td = av_clip_int8(td); -+ tb = av_clip_int8(tb); -+ tx = (0x4000 + abs(td / 2)) / td; -+ scale_factor = av_clip_intp2((tb * tx + 32) >> 6, 12); -+ dst->x = av_clip_int16((scale_factor * src->x + 127 + -+ (scale_factor * src->x < 0)) >> 8); -+ dst->y = av_clip_int16((scale_factor * src->y + 127 + -+ (scale_factor * src->y < 0)) >> 8); -+} -+ -+static int check_mvset(Mv * const mvLXCol, const Mv * const mvCol, -+ const int colPic, const int poc, -+ const RefPicList * const refPicList, const int X, const int refIdxLx, -+ const RefPicList * const refPicList_col, const int listCol, const int refidxCol) -+{ -+ int cur_lt = refPicList[X].isLongTerm[refIdxLx]; -+ int col_lt = 
refPicList_col[listCol].isLongTerm[refidxCol]; -+ int col_poc_diff, cur_poc_diff; -+ -+ if (cur_lt != col_lt) { -+ mvLXCol->x = 0; -+ mvLXCol->y = 0; -+ return 0; -+ } -+ -+ col_poc_diff = colPic - refPicList_col[listCol].list[refidxCol]; -+ cur_poc_diff = poc - refPicList[X].list[refIdxLx]; -+ -+ if (cur_lt || col_poc_diff == cur_poc_diff || !col_poc_diff) { -+ mvLXCol->x = mvCol->x; -+ mvLXCol->y = mvCol->y; -+ } else { -+ mv_scale(mvLXCol, mvCol, col_poc_diff, cur_poc_diff); -+ } -+ return 1; -+} -+ -+#define CHECK_MVSET(l) \ -+ check_mvset(mvLXCol, temp_col.mv + l, \ -+ colPic, s->poc, \ -+ refPicList, X, refIdxLx, \ -+ refPicList_col, L ## l, temp_col.ref_idx[l]) -+ -+// derive the motion vectors section 8.5.3.2.8 -+static int derive_temporal_colocated_mvs(const HEVCRpiContext * const s, const MvField temp_col, -+ const int refIdxLx, Mv * const mvLXCol, const int X, -+ const int colPic, const RefPicList * const refPicList_col) -+{ -+ const RefPicList * const refPicList = s->ref->refPicList; -+ -+ if (temp_col.pred_flag == PF_INTRA) -+ return 0; -+ -+ if (temp_col.pred_flag == PF_L0 || -+ (temp_col.pred_flag == PF_BI && (s->no_backward_pred_flag ? 
s->sh.collocated_list == L1 : X == 0))) -+ { -+ return CHECK_MVSET(0); -+ } -+ return CHECK_MVSET(1); -+} -+ -+#define TAB_MVF(x, y) \ -+ tab_mvf[(y) * min_pu_width + x] -+ -+#define TAB_MVF_PU(v) \ -+ TAB_MVF(((x ## v) >> s->ps.sps->log2_min_pu_size), \ -+ ((y ## v) >> s->ps.sps->log2_min_pu_size)) -+ -+#define DERIVE_TEMPORAL_COLOCATED_MVS \ -+ derive_temporal_colocated_mvs(s, temp_col, \ -+ refIdxLx, mvLXCol, X, colPic, \ -+ ff_hevc_rpi_get_ref_list(s, ref, x, y)) -+ +/* + * 8.5.3.1.7 temporal luma motion vector prediction + */ -+static int temporal_luma_motion_vector(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, ++static int temporal_luma_motion_vector(const HEVCRpiContext * const s, ++ const HEVCRpiLocalContext * const lc, const int x0, const int y0, + const int nPbW, const int nPbH, const int refIdxLx, -+ Mv * const mvLXCol, const int X) ++ MvXY * const mvLXCol, const int X) +{ -+ MvField *tab_mvf; -+ MvField temp_col; -+ int x, y, x_pu, y_pu; -+ const int min_pu_width = s->ps.sps->min_pu_width; -+ int availableFlagLXCol = 0; -+ int colPic; ++ int x, y; ++ const ColMv * cmv = NULL; + -+ HEVCFrame * const ref = s->ref->collocated_ref; ++ HEVCRpiFrame * const col_ref = s->ref->collocated_ref; ++ const RefPicList * const refPicList = s->refPicList + X; ++ const int cur_lt = refPicList->isLongTerm[refIdxLx]; + -+ if (ref == NULL || ref->tab_mvf == NULL) { -+ memset(mvLXCol, 0, sizeof(*mvLXCol)); ++ *mvLXCol = 0; ++ // Unlikely but we might have a col_ref IDR frame! 
++ if (col_ref->col_mvf == NULL) + return 0; -+ } + -+ tab_mvf = ref->tab_mvf; -+ colPic = ref->poc; ++ ff_hevc_rpi_progress_wait_mv(s, lc->jb0, col_ref, y0 + nPbH); + + //bottom right collocated motion vector + x = x0 + nPbW; @@ -18803,508 +19342,421 @@ index 0000000000..163e2558dc + + if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) && + y < s->ps.sps->height && -+ x < s->ps.sps->width) { -+ x &= ~15; -+ y &= ~15; -+ ff_hevc_rpi_progress_wait_mv(s, lc->jb0, ref, y); -+ x_pu = x >> s->ps.sps->log2_min_pu_size; -+ y_pu = y >> s->ps.sps->log2_min_pu_size; -+ temp_col = TAB_MVF(x_pu, y_pu); -+ availableFlagLXCol = DERIVE_TEMPORAL_COLOCATED_MVS; ++ x < s->ps.sps->width) ++ { ++ const ColMvField * const col = col_ref->col_mvf + (x >> 4) + ++ (y >> 4) * s->col_mvf_stride; ++ ++ if (col->L[0].poc != COL_POC_INTRA && ++ (col->L[1].poc == COL_POC_INTRA || ++ (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0))) ++ { ++ cmv = col->L + 0; ++ } ++ else if (col->L[1].poc != COL_POC_INTRA) ++ { ++ cmv = col->L + 1; ++ } + } + + // derive center collocated motion vector -+ if (!availableFlagLXCol) { ++ if (cmv == NULL || COL_POC_IS_LT(cmv->poc) != cur_lt) ++ { ++ cmv = NULL; + x = x0 + (nPbW >> 1); + y = y0 + (nPbH >> 1); -+ x &= ~15; -+ y &= ~15; -+ ff_hevc_rpi_progress_wait_mv(s, lc->jb0, ref, y); -+ x_pu = x >> s->ps.sps->log2_min_pu_size; -+ y_pu = y >> s->ps.sps->log2_min_pu_size; -+ temp_col = TAB_MVF(x_pu, y_pu); -+ availableFlagLXCol = DERIVE_TEMPORAL_COLOCATED_MVS; -+ } -+ return availableFlagLXCol; -+} -+ -+#define AVAILABLE(cand, v) \ -+ (cand && !(TAB_MVF_PU(v).pred_flag == PF_INTRA)) -+ -+#define COMPARE_MV_REFIDX(a, b) \ -+ compare_mv_ref_idx(TAB_MVF_PU(a), TAB_MVF_PU(b)) + -+/* -+ * 8.5.3.1.2 Derivation process for spatial merging candidates -+ */ -+static void derive_spatial_merge_candidates(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, -+ int nPbW, int nPbH, -+ int log2_cb_size, const 
unsigned int avail, -+ int singleMCLFlag, int part_idx, -+ int merge_idx, -+ struct MvField mergecandlist[]) -+{ -+ const RefPicList * const refPicList = s->ref->refPicList; -+ const MvField * const tab_mvf = s->ref->tab_mvf; -+ -+ const int min_pu_width = s->ps.sps->min_pu_width; -+ const int xA1 = x0 - 1; -+ const int yA1 = y0 + nPbH - 1; -+ -+ const int xB1 = x0 + nPbW - 1; -+ const int yB1 = y0 - 1; ++ { ++ const ColMvField * const col = col_ref->col_mvf + (x >> 4) + ++ (y >> 4) * s->col_mvf_stride; + -+ const int xB0 = x0 + nPbW; -+ const int yB0 = y0 - 1; ++ if (col->L[0].poc != COL_POC_INTRA && ++ (col->L[1].poc == COL_POC_INTRA || ++ (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0))) ++ { ++ cmv = col->L + 0; ++ } ++ else if (col->L[1].poc != COL_POC_INTRA) ++ { ++ cmv = col->L + 1; ++ } ++ } ++ } + -+ const int xA0 = x0 - 1; -+ const int yA0 = y0 + nPbH; ++ if (cmv == NULL || cur_lt != COL_POC_IS_LT(cmv->poc)) ++ return 0; + -+ const int xB2 = x0 - 1; -+ const int yB2 = y0 - 1; ++ { ++ const int col_poc = col_ref->poc; ++ const int ref_poc = refPicList->list[refIdxLx]; + -+ const int nb_refs = (s->sh.slice_type == HEVC_SLICE_P) ? -+ s->sh.nb_refs[0] : FFMIN(s->sh.nb_refs[0], s->sh.nb_refs[1]); ++ *mvLXCol = (cur_lt || ++ cmv->poc == col_poc || ++ COL_POC_DIFF(col_poc, cmv->poc) == s->poc - ref_poc) ? 
++ cmv->xy : ++ mv_scale_xy(cmv->xy, COL_POC_DIFF(col_poc, cmv->poc), s->poc - ref_poc); ++ } + -+ int zero_idx = 0; ++ return cmv != NULL; ++} + -+ int nb_merge_cand = 0; -+ int nb_orig_merge_cand = 0; ++static inline int mvf_eq(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b) ++{ ++ return b != NULL && compare_mv_ref_idx(a, b); ++} + -+ int is_available_a0; -+ int is_available_a1; -+ int is_available_b0; -+ int is_available_b1; -+ int is_available_b2; + + -+ if (!singleMCLFlag && part_idx == 1 && -+ (lc->cu.part_mode == PART_Nx2N || -+ lc->cu.part_mode == PART_nLx2N || -+ lc->cu.part_mode == PART_nRx2N) || -+ is_diff_mer(s, xA1, yA1, x0, y0)) { -+ is_available_a1 = 0; -+ } else { -+ is_available_a1 = AVAILABLE((avail & AVAIL_L) != 0, A1); -+ if (is_available_a1) { -+ mergecandlist[nb_merge_cand] = TAB_MVF_PU(A1); -+ if (merge_idx == 0) -+ return; -+ nb_merge_cand++; -+ } ++/* ++ * 8.5.3.1.2 Derivation process for spatial merging candidates ++ */ ++static inline const HEVCRpiMvField * ++derive_spatial_merge_candidates( ++ const HEVCRpiContext * const s, ++ const HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int nPbW, const unsigned int nPbH, ++ const unsigned int avail, ++ const unsigned int part_idx, ++ const unsigned int merge_idx, ++ HEVCRpiMvField * const mvf_t) ++{ ++ const unsigned int parts_a1 = (1 << PART_Nx2N) | (1 << PART_nLx2N) | (1 << PART_nRx2N); ++ const unsigned int parts_b1 = (1 << PART_2NxN) | (1<< PART_2NxnU) | (1 << PART_2NxnD); ++ ++ const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1); ++ const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1); ++ const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1); ++ const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1; ++ const unsigned int plevel = s->ps.pps->log2_parallel_merge_level; ++ const unsigned int part_mode = lc->cu.part_mode; ++ ++ const HEVCRpiMvField * perm[4]; ++ 
unsigned int nb_merge_cand = 0; ++ ++ // singleMCLFlag => part_idx == 0 so no need to test for it ++ if ((avail & AVAIL_L) == 0 || ++ (part_idx == 1 && ++ ((parts_a1 >> part_mode) & 1) != 0 || ++ is_eq_mer(plevel, x0 - 1, y0 + nPbH - 1, x0, y0)) || ++ mvf_a1->pred_flag == PF_INTRA) ++ { ++ mvf_a1 = NULL; ++ } ++ else ++ { ++ if (merge_idx == nb_merge_cand) ++ return mvf_a1; ++ perm[nb_merge_cand++] = mvf_a1; + } + -+ if (!singleMCLFlag && part_idx == 1 && -+ (lc->cu.part_mode == PART_2NxN || -+ lc->cu.part_mode == PART_2NxnU || -+ lc->cu.part_mode == PART_2NxnD) || -+ is_diff_mer(s, xB1, yB1, x0, y0)) { -+ is_available_b1 = 0; -+ } else { -+ is_available_b1 = AVAILABLE((avail & AVAIL_U) != 0, B1); -+ if (is_available_b1 && -+ !(is_available_a1 && COMPARE_MV_REFIDX(B1, A1))) { -+ mergecandlist[nb_merge_cand] = TAB_MVF_PU(B1); -+ if (merge_idx == nb_merge_cand) -+ return; -+ nb_merge_cand++; -+ } ++ if ((avail & AVAIL_U) == 0 || ++ (part_idx == 1 && ++ ((parts_b1 >> part_mode) & 1) != 0 || ++ is_eq_mer(plevel, x0 + nPbW - 1, y0 - 1, x0, y0)) || ++ mvf_b1->pred_flag == PF_INTRA) ++ { ++ mvf_b1 = NULL; ++ } ++ else if (!mvf_eq(mvf_b1, mvf_a1)) ++ { ++ if (merge_idx == nb_merge_cand) ++ return mvf_b1; ++ perm[nb_merge_cand++] = mvf_b1; + } + + // above right spatial merge candidate -+ is_available_b0 = AVAILABLE((avail & AVAIL_UR) != 0, B0) && -+ !is_diff_mer(s, xB0, yB0, x0, y0); -+ -+ if (is_available_b0 && -+ !(is_available_b1 && COMPARE_MV_REFIDX(B0, B1))) { -+ mergecandlist[nb_merge_cand] = TAB_MVF_PU(B0); ++ // Never need mvf_b0 again so don't bother zeroing if navail ++ if ((avail & AVAIL_UR) != 0 && ++ !is_eq_mer(plevel, x0 + nPbW, y0 - 1, x0, y0) && ++ mvf_b0->pred_flag != PF_INTRA && ++ !mvf_eq(mvf_b0, mvf_b1)) ++ { + if (merge_idx == nb_merge_cand) -+ return; -+ nb_merge_cand++; ++ return mvf_b0; ++ perm[nb_merge_cand++] = mvf_b0; + } + + // left bottom spatial merge candidate -+ is_available_a0 = AVAILABLE((avail & AVAIL_DL) != 0, A0) && -+ !is_diff_mer(s, 
xA0, yA0, x0, y0); -+ -+ if (is_available_a0 && -+ !(is_available_a1 && COMPARE_MV_REFIDX(A0, A1))) { -+ mergecandlist[nb_merge_cand] = TAB_MVF_PU(A0); ++ // Never need mvf_a0 again so don't bother zeroing if navail ++ if ((avail & AVAIL_DL) != 0 && ++ !is_eq_mer(plevel, x0 - 1, y0 + nPbH, x0, y0) && ++ mvf_a0->pred_flag != PF_INTRA && ++ !mvf_eq(mvf_a0, mvf_a1)) ++ { + if (merge_idx == nb_merge_cand) -+ return; -+ nb_merge_cand++; ++ return mvf_a0; ++ perm[nb_merge_cand++] = mvf_a0; + } + + // above left spatial merge candidate -+ is_available_b2 = AVAILABLE((avail & AVAIL_UL) != 0, B2) && -+ !is_diff_mer(s, xB2, yB2, x0, y0); -+ -+ if (is_available_b2 && -+ !(is_available_a1 && COMPARE_MV_REFIDX(B2, A1)) && -+ !(is_available_b1 && COMPARE_MV_REFIDX(B2, B1)) && -+ nb_merge_cand != 4) { -+ mergecandlist[nb_merge_cand] = TAB_MVF_PU(B2); -+ if (merge_idx == nb_merge_cand) -+ return; -+ nb_merge_cand++; ++ if (nb_merge_cand != 4 && ++ (avail & AVAIL_UL) != 0 && ++ !is_eq_mer(plevel, x0 - 1, y0 - 1, x0, y0)) ++ { ++ const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL ++ ++ if (mvf_b2->pred_flag != PF_INTRA && ++ !mvf_eq(mvf_b2, mvf_a1) && ++ !mvf_eq(mvf_b2, mvf_b1)) ++ { ++ if (merge_idx == nb_merge_cand) ++ return mvf_b2; ++ perm[nb_merge_cand++] = mvf_b2; ++ } + } + + // temporal motion vector candidate -+ if (s->sh.slice_temporal_mvp_enabled_flag && -+ nb_merge_cand < s->sh.max_num_merge_cand) { -+ Mv mv_l0_col = { 0 }, mv_l1_col = { 0 }; -+ int available_l0 = temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH, -+ 0, &mv_l0_col, 0); -+ int available_l1 = (s->sh.slice_type == HEVC_SLICE_B) ? 
-+ temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH, -+ 0, &mv_l1_col, 1) : 0; -+ -+ if (available_l0 || available_l1) { -+ mergecandlist[nb_merge_cand].pred_flag = available_l0 + (available_l1 << 1); -+ AV_ZERO16(mergecandlist[nb_merge_cand].ref_idx); -+ mergecandlist[nb_merge_cand].mv[0] = mv_l0_col; -+ mergecandlist[nb_merge_cand].mv[1] = mv_l1_col; ++ if (s->sh.slice_temporal_mvp_enabled_flag) ++ { ++ static const HEVCRpiMvField mvf_z = {{0}}; ++ ++ *mvf_t = mvf_z; ++ ++ if (temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH, ++ 0, mvf_t->xy + 0, 0)) ++ mvf_t->pred_flag = PF_L0; ++ ++ if (s->sh.slice_type == HEVC_SLICE_B && ++ temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH, ++ 0, mvf_t->xy + 1, 1)) ++ mvf_t->pred_flag |= PF_L1; + ++ if (mvf_t->pred_flag != 0) ++ { + if (merge_idx == nb_merge_cand) -+ return; -+ nb_merge_cand++; ++ return mvf_t; ++ perm[nb_merge_cand++] = mvf_t; + } + } + -+ nb_orig_merge_cand = nb_merge_cand; -+ + // combined bi-predictive merge candidates (applies for B slices) -+ if (s->sh.slice_type == HEVC_SLICE_B && nb_orig_merge_cand > 1 && -+ nb_orig_merge_cand < s->sh.max_num_merge_cand) { -+ int comb_idx = 0; -+ -+ for (comb_idx = 0; nb_merge_cand < s->sh.max_num_merge_cand && -+ comb_idx < nb_orig_merge_cand * (nb_orig_merge_cand - 1); comb_idx++) { -+ int l0_cand_idx = l0_l1_cand_idx[comb_idx][0]; -+ int l1_cand_idx = l0_l1_cand_idx[comb_idx][1]; -+ MvField l0_cand = mergecandlist[l0_cand_idx]; -+ MvField l1_cand = mergecandlist[l1_cand_idx]; -+ -+ if ((l0_cand.pred_flag & PF_L0) && (l1_cand.pred_flag & PF_L1) && -+ (refPicList[0].list[l0_cand.ref_idx[0]] != -+ refPicList[1].list[l1_cand.ref_idx[1]] || -+ AV_RN32A(&l0_cand.mv[0]) != AV_RN32A(&l1_cand.mv[1]))) { -+ mergecandlist[nb_merge_cand].ref_idx[0] = l0_cand.ref_idx[0]; -+ mergecandlist[nb_merge_cand].ref_idx[1] = l1_cand.ref_idx[1]; -+ mergecandlist[nb_merge_cand].pred_flag = PF_BI; -+ AV_COPY32(&mergecandlist[nb_merge_cand].mv[0], &l0_cand.mv[0]); -+ 
AV_COPY32(&mergecandlist[nb_merge_cand].mv[1], &l1_cand.mv[1]); -+ if (merge_idx == nb_merge_cand) -+ return; -+ nb_merge_cand++; ++ if (s->sh.slice_type == HEVC_SLICE_B && nb_merge_cand > 1) ++ { ++ unsigned int comb_idx = 0; ++ const unsigned int cand_count = nb_merge_cand * (nb_merge_cand - 1); ++ const RefPicList * const refPicList = s->refPicList; ++ ++ for (comb_idx = 0; comb_idx < cand_count; comb_idx++) ++ { ++ static const uint8_t l0_l1_cand_idx[12][2] = { ++ { 0, 1, }, ++ { 1, 0, }, ++ { 0, 2, }, ++ { 2, 0, }, ++ { 1, 2, }, ++ { 2, 1, }, ++ { 0, 3, }, ++ { 3, 0, }, ++ { 1, 3, }, ++ { 3, 1, }, ++ { 2, 3, }, ++ { 3, 2, }, ++ }; ++ ++ const unsigned int l0_cand_idx = l0_l1_cand_idx[comb_idx][0]; ++ const unsigned int l1_cand_idx = l0_l1_cand_idx[comb_idx][1]; ++ const HEVCRpiMvField * const mvf_c0 = perm[l0_cand_idx]; ++ const HEVCRpiMvField * const mvf_c1 = perm[l1_cand_idx]; ++ ++ if ((mvf_c0->pred_flag & PF_L0) != 0 && ++ (mvf_c1->pred_flag & PF_L1) != 0 && ++ (refPicList[0].list[mvf_c0->ref_idx[0]] != refPicList[1].list[mvf_c1->ref_idx[1]] || ++ mvf_c0->xy[0] != mvf_c1->xy[1])) ++ { ++ if (merge_idx == nb_merge_cand++) ++ { ++ // Need to be a bit careful as we will construct mvf_t and we ++ // may already be using that as one of our condidates ++ // so build & copy rather than build in place ++ const HEVCRpiMvField mvf_m = { ++ .xy = { ++ mvf_c0->xy[0], ++ mvf_c1->xy[1]}, ++ .ref_idx = { ++ mvf_c0->ref_idx[0], ++ mvf_c1->ref_idx[1]}, ++ .pred_flag = PF_BI ++ }; ++ *mvf_t = mvf_m; ++ return mvf_t; ++ } + } + } + } + -+ // append Zero motion vector candidates -+ while (nb_merge_cand < s->sh.max_num_merge_cand) { -+ mergecandlist[nb_merge_cand].pred_flag = PF_L0 + ((s->sh.slice_type == HEVC_SLICE_B) << 1); -+ AV_ZERO32(mergecandlist[nb_merge_cand].mv + 0); -+ AV_ZERO32(mergecandlist[nb_merge_cand].mv + 1); -+ mergecandlist[nb_merge_cand].ref_idx[0] = zero_idx < nb_refs ? zero_idx : 0; -+ mergecandlist[nb_merge_cand].ref_idx[1] = zero_idx < nb_refs ? 
zero_idx : 0; ++ // "append" Zero motion vector candidates ++ { ++ const unsigned int nb_refs = (s->sh.slice_type == HEVC_SLICE_B) ? ++ FFMIN(s->sh.nb_refs[0], s->sh.nb_refs[1]) : s->sh.nb_refs[0]; ++ const unsigned int zero_idx = merge_idx - nb_merge_cand; ++ ++ const HEVCRpiMvField mvf_m = { ++ .xy = {0, 0}, ++ .ref_idx = { ++ zero_idx < nb_refs ? zero_idx : 0, ++ (s->sh.slice_type == HEVC_SLICE_B && zero_idx < nb_refs) ? zero_idx : 0}, ++ .pred_flag = (s->sh.slice_type == HEVC_SLICE_B) ? PF_BI : PF_L0 ++ }; + -+ if (merge_idx == nb_merge_cand) -+ return; -+ nb_merge_cand++; -+ zero_idx++; ++ *mvf_t = mvf_m; ++ return mvf_t; + } +} + -+/* -+ * 8.5.3.1.1 Derivation process of luma Mvs for merge mode -+ */ ++ ++// 8.5.3.1.1 Derivation process of luma Mvs for merge mode +void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW, + int nPbH, int log2_cb_size, int part_idx, -+ int merge_idx, MvField * const mv) ++ int merge_idx, HEVCRpiMvField * const mv) +{ -+ int singleMCLFlag = 0; -+ int nCS = 1 << log2_cb_size; -+ LOCAL_ALIGNED(4, MvField, mergecand_list, [MRG_MAX_NUM_CANDS]); -+ int nPbW2 = nPbW; -+ int nPbH2 = nPbH; -+ -+ if (s->ps.pps->log2_parallel_merge_level > 2 && nCS == 8) { -+ singleMCLFlag = 1; -+ x0 = lc->cu.x; -+ y0 = lc->cu.y; -+ nPbW = nCS; -+ nPbH = nCS; -+ part_idx = 0; -+ } ++ const HEVCRpiMvField * mvf_m = (s->ps.pps->log2_parallel_merge_level > 2 && log2_cb_size == 3) ? 
++ derive_spatial_merge_candidates(s, lc, lc->cu.x, lc->cu.y, 8, 8, ++ ff_hevc_rpi_tb_avail_flags(s, lc, lc->cu.x, lc->cu.y, 8, 8), ++ 0, merge_idx, mv) : ++ derive_spatial_merge_candidates(s, lc, x0, y0, nPbW, nPbH, ++ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH), ++ part_idx, merge_idx, mv); + -+ derive_spatial_merge_candidates(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, -+ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH), -+ singleMCLFlag, part_idx, -+ merge_idx, mergecand_list); ++ if (mvf_m != mv) ++ *mv = *mvf_m; + -+ if (mergecand_list[merge_idx].pred_flag == PF_BI && -+ (nPbW2 + nPbH2) == 12) { -+ mergecand_list[merge_idx].pred_flag = PF_L0; -+ } -+ -+ *mv = mergecand_list[merge_idx]; ++ if (mv->pred_flag == PF_BI && (nPbW + nPbH) == 12) ++ mv->pred_flag = PF_L0; +} + -+static av_always_inline void dist_scale(const HEVCRpiContext * const s, Mv * const mv, -+ int min_pu_width, int x, int y, -+ int elist, int ref_idx_curr, int ref_idx) -+{ -+ const RefPicList * const refPicList = s->ref->refPicList; -+ const MvField * const tab_mvf = s->ref->tab_mvf; -+ int ref_pic_elist = refPicList[elist].list[TAB_MVF(x, y).ref_idx[elist]]; -+ int ref_pic_curr = refPicList[ref_idx_curr].list[ref_idx]; + -+ if (ref_pic_elist != ref_pic_curr) { -+ int poc_diff = s->poc - ref_pic_elist; -+ if (!poc_diff) -+ poc_diff = 1; -+ mv_scale(mv, mv, poc_diff, s->poc - ref_pic_curr); -+ } -+} -+ -+static int mv_mp_mode_mx(const HEVCRpiContext * const s, const int x, const int y, const int pred_flag_index, -+ Mv * const mv, const int ref_idx_curr, const int ref_idx) ++static av_always_inline const MvXY * ++mvf_same_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, const int poc0, const HEVCRpiMvField * const mvf) +{ -+ const MvField * const tab_mvf = s->ref->tab_mvf; -+ const int min_pu_width = s->ps.sps->min_pu_width; -+ -+ const RefPicList * const refPicList = s->ref->refPicList; -+ -+ if (((TAB_MVF(x, y).pred_flag) & (1 << pred_flag_index)) 
&& -+ refPicList[pred_flag_index].list[TAB_MVF(x, y).ref_idx[pred_flag_index]] == refPicList[ref_idx_curr].list[ref_idx]) { -+ *mv = TAB_MVF(x, y).mv[pred_flag_index]; -+ return 1; ++ if (mvf != NULL) ++ { ++ if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].list[mvf->ref_idx[pfi0]] == poc0) ++ return mvf->xy + pfi0; ++ if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].list[mvf->ref_idx[pfi1]] == poc0) ++ return mvf->xy + pfi1; + } -+ return 0; ++ return NULL; +} + -+static int mv_mp_mode_mx_lt(const HEVCRpiContext * const s, const int x, const int y, const int pred_flag_index, -+ Mv * const mv, const int ref_idx_curr, const int ref_idx) ++static av_always_inline const MvXY * ++mvf_other_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, ++ const int islt0, const int poc0, const int poc_cur, ++ MvXY * const mv_t, const HEVCRpiMvField * const mvf) +{ -+ MvField *tab_mvf = s->ref->tab_mvf; -+ int min_pu_width = s->ps.sps->min_pu_width; -+ -+ RefPicList *refPicList = s->ref->refPicList; -+ -+ if ((TAB_MVF(x, y).pred_flag) & (1 << pred_flag_index)) { -+ int currIsLongTerm = refPicList[ref_idx_curr].isLongTerm[ref_idx]; -+ -+ int colIsLongTerm = -+ refPicList[pred_flag_index].isLongTerm[(TAB_MVF(x, y).ref_idx[pred_flag_index])]; -+ -+ if (colIsLongTerm == currIsLongTerm) { -+ *mv = TAB_MVF(x, y).mv[pred_flag_index]; -+ if (!currIsLongTerm) -+ dist_scale(s, mv, min_pu_width, x, y, -+ pred_flag_index, ref_idx_curr, ref_idx); -+ return 1; ++ if (mvf != NULL) ++ { ++ if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].isLongTerm[mvf->ref_idx[pfi0]] == islt0) ++ { ++ const int poc1 = rpl[pfi0].list[mvf->ref_idx[pfi0]]; ++ if (islt0 || poc1 == poc0) { ++ return mvf->xy + pfi0; ++ } ++ *mv_t = mv_scale_xy(mvf->xy[pfi0], poc_cur - poc1, poc_cur - poc0); ++ return mv_t; ++ } ++ if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].isLongTerm[mvf->ref_idx[pfi1]] == islt0) ++ { ++ const int poc1 = rpl[pfi1].list[mvf->ref_idx[pfi1]]; ++ if 
(islt0 || poc1 == poc0) { ++ return mvf->xy + pfi1; ++ } ++ *mv_t = mv_scale_xy(mvf->xy[pfi1], poc_cur - poc1, poc_cur - poc0); ++ return mv_t; + } + } -+ return 0; ++ return NULL; +} + -+#define MP_MX(v, pred, mx) \ -+ mv_mp_mode_mx(s, \ -+ (x ## v) >> s->ps.sps->log2_min_pu_size, \ -+ (y ## v) >> s->ps.sps->log2_min_pu_size, \ -+ pred, &mx, ref_idx_curr, ref_idx) -+ -+#define MP_MX_LT(v, pred, mx) \ -+ mv_mp_mode_mx_lt(s, \ -+ (x ## v) >> s->ps.sps->log2_min_pu_size, \ -+ (y ## v) >> s->ps.sps->log2_min_pu_size, \ -+ pred, &mx, ref_idx_curr, ref_idx) -+ -+void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext *lc, -+ int x0, int y0, int nPbW, int nPbH, -+ int log2_cb_size, const unsigned int avail, int part_idx, -+ int merge_idx, MvField * const mv, -+ int mvp_lx_flag, int LX) ++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int nPbW, const unsigned int nPbH, ++ const unsigned int avail, ++ HEVCRpiMvField * const mv, ++ const unsigned int mvp_lx_flag, const unsigned int LX) +{ -+ const MvField *tab_mvf = s->ref->tab_mvf; -+ int isScaledFlag_L0 = 0; -+ int availableFlagLXA0 = 1; -+ int availableFlagLXB0 = 1; -+ int numMVPCandLX = 0; -+ int min_pu_width = s->ps.sps->min_pu_width; ++ const unsigned int pfi0 = LX; ++ const unsigned int pfi1 = LX == 0 ? 
1 : 0; ++ const RefPicList * const rpl = s->refPicList; ++ const int poc0 = rpl[LX].list[mv->ref_idx[LX]]; ++ const int poc_cur = s->poc; ++ const int islt0 = rpl[LX].isLongTerm[mv->ref_idx[LX]]; + -+ int xA0, yA0; -+ int is_available_a0; -+ int xA1, yA1; -+ int is_available_a1; -+ int xB0, yB0; -+ int is_available_b0; -+ int xB1, yB1; -+ int is_available_b1; -+ int xB2, yB2; -+ int is_available_b2; ++ const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1); ++ const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1); ++ const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL ++ const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1); ++ const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1; ++ const MvXY * mva = NULL; ++ const MvXY * mvb; ++ MvXY * const mv_rv = mv->xy + LX; ++ MvXY mvt_a, mvt_b; + -+ Mv mvpcand_list[2] = { { 0 } }; -+ Mv mxA; -+ Mv mxB; -+ int ref_idx_curr; -+ int ref_idx = 0; -+ int pred_flag_index_l0; -+ int pred_flag_index_l1; ++ *mv_rv = 0; + -+ ref_idx_curr = LX; -+ ref_idx = mv->ref_idx[LX]; -+ pred_flag_index_l0 = LX; -+ pred_flag_index_l1 = !LX; ++ if ((avail & AVAIL_DL) == 0 || mvf_a0->pred_flag == PF_INTRA) ++ mvf_a0 = NULL; ++ else if ((mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a0)) != NULL && mvp_lx_flag == 0) ++ goto use_mva; + -+ // left bottom spatial candidate -+ xA0 = x0 - 1; -+ yA0 = y0 + nPbH; ++ if ((avail & AVAIL_L) == 0 || mvf_a1->pred_flag == PF_INTRA) ++ mvf_a1 = NULL; + -+ is_available_a0 = AVAILABLE((avail & AVAIL_DL) != 0, A0); ++ if (mva == NULL && ++ (mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a1)) == NULL && ++ (mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a0)) == NULL) ++ mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a1); + -+ //left spatial merge candidate -+ xA1 = x0 - 1; -+ yA1 = y0 + nPbH - 1; ++ if (mvp_lx_flag == 0 && mva != NULL) ++ goto use_mva; + -+ is_available_a1 = 
AVAILABLE((avail & AVAIL_L), A1); -+ if (is_available_a0 || is_available_a1) -+ isScaledFlag_L0 = 1; ++ if ((avail & AVAIL_UR) == 0 || mvf_b0->pred_flag == PF_INTRA) ++ mvf_b0 = NULL; ++ if ((avail & AVAIL_U) == 0 || mvf_b1->pred_flag == PF_INTRA) ++ mvf_b1 = NULL; ++ if ((avail & AVAIL_UL) == 0 || mvf_b2->pred_flag == PF_INTRA) ++ mvf_b2 = NULL; + -+ if (is_available_a0) { -+ if (MP_MX(A0, pred_flag_index_l0, mxA)) { -+ goto b_candidates; -+ } -+ if (MP_MX(A0, pred_flag_index_l1, mxA)) { -+ goto b_candidates; -+ } -+ } ++ if ((mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b0)) == NULL && ++ (mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b1)) == NULL) ++ mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b2); + -+ if (is_available_a1) { -+ if (MP_MX(A1, pred_flag_index_l0, mxA)) { -+ goto b_candidates; -+ } -+ if (MP_MX(A1, pred_flag_index_l1, mxA)) { -+ goto b_candidates; -+ } -+ } ++ if (mvf_a0 == NULL && mvf_a1 == NULL) { ++ mva = mvb; ++ if (mvp_lx_flag == 0 && mva != NULL) ++ goto use_mva; + -+ if (is_available_a0) { -+ if (MP_MX_LT(A0, pred_flag_index_l0, mxA)) { -+ goto b_candidates; -+ } -+ if (MP_MX_LT(A0, pred_flag_index_l1, mxA)) { -+ goto b_candidates; -+ } ++ if ((mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b0)) == NULL && ++ (mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b1)) == NULL) ++ mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b2); + } + -+ if (is_available_a1) { -+ if (MP_MX_LT(A1, pred_flag_index_l0, mxA)) { -+ goto b_candidates; -+ } -+ if (MP_MX_LT(A1, pred_flag_index_l1, mxA)) { -+ goto b_candidates; -+ } ++ if (mva == NULL) { ++ mva = mvb; ++ mvb = NULL; + } -+ availableFlagLXA0 = 0; -+ -+b_candidates: -+ // B candidates -+ // above right spatial merge candidate -+ xB0 = x0 + nPbW; -+ yB0 = y0 - 1; -+ -+ is_available_b0 = AVAILABLE((avail & AVAIL_UR) != 0, B0); + -+ // above spatial merge candidate -+ xB1 = x0 + nPbW - 1; -+ yB1 = y0 - 1; -+ is_available_b1 = 
AVAILABLE((avail & AVAIL_U) != 0, B1); ++ if (mvb != NULL && *mva == *mvb) // If A == B then ignore B ++ mvb = NULL; + -+ // above left spatial merge candidate -+ xB2 = x0 - 1; -+ yB2 = y0 - 1; -+ is_available_b2 = AVAILABLE((avail & AVAIL_UL) != 0, B2); -+ -+ // above right spatial merge candidate -+ if (is_available_b0) { -+ if (MP_MX(B0, pred_flag_index_l0, mxB)) { -+ goto scalef; -+ } -+ if (MP_MX(B0, pred_flag_index_l1, mxB)) { -+ goto scalef; -+ } ++ if (mvp_lx_flag == 0 && mva != NULL) { ++ goto use_mva; + } -+ -+ // above spatial merge candidate -+ if (is_available_b1) { -+ if (MP_MX(B1, pred_flag_index_l0, mxB)) { -+ goto scalef; -+ } -+ if (MP_MX(B1, pred_flag_index_l1, mxB)) { -+ goto scalef; -+ } ++ else if (mvp_lx_flag != 0 && mvb != NULL) { ++ *mv_rv = *mvb; + } -+ -+ // above left spatial merge candidate -+ if (is_available_b2) { -+ if (MP_MX(B2, pred_flag_index_l0, mxB)) { -+ goto scalef; -+ } -+ if (MP_MX(B2, pred_flag_index_l1, mxB)) { -+ goto scalef; -+ } -+ } -+ availableFlagLXB0 = 0; -+ -+scalef: -+ if (!isScaledFlag_L0) { -+ if (availableFlagLXB0) { -+ availableFlagLXA0 = 1; -+ mxA = mxB; -+ } -+ availableFlagLXB0 = 0; -+ -+ // XB0 and L1 -+ if (is_available_b0) { -+ availableFlagLXB0 = MP_MX_LT(B0, pred_flag_index_l0, mxB); -+ if (!availableFlagLXB0) -+ availableFlagLXB0 = MP_MX_LT(B0, pred_flag_index_l1, mxB); -+ } -+ -+ if (is_available_b1 && !availableFlagLXB0) { -+ availableFlagLXB0 = MP_MX_LT(B1, pred_flag_index_l0, mxB); -+ if (!availableFlagLXB0) -+ availableFlagLXB0 = MP_MX_LT(B1, pred_flag_index_l1, mxB); -+ } -+ -+ if (is_available_b2 && !availableFlagLXB0) { -+ availableFlagLXB0 = MP_MX_LT(B2, pred_flag_index_l0, mxB); -+ if (!availableFlagLXB0) -+ availableFlagLXB0 = MP_MX_LT(B2, pred_flag_index_l1, mxB); -+ } -+ } -+ -+ if (availableFlagLXA0) -+ mvpcand_list[numMVPCandLX++] = mxA; -+ -+ if (availableFlagLXB0 && (!availableFlagLXA0 || mxA.x != mxB.x || mxA.y != mxB.y)) -+ mvpcand_list[numMVPCandLX++] = mxB; -+ -+ //temporal motion 
vector prediction candidate -+ if (numMVPCandLX < 2 && s->sh.slice_temporal_mvp_enabled_flag && -+ mvp_lx_flag == numMVPCandLX) { -+ Mv mv_col; -+ int available_col = temporal_luma_motion_vector(s, lc, x0, y0, nPbW, -+ nPbH, ref_idx, -+ &mv_col, LX); -+ if (available_col) -+ mvpcand_list[numMVPCandLX++] = mv_col; ++ else if (s->sh.slice_temporal_mvp_enabled_flag && ((mvp_lx_flag == 0 && mva == NULL) || (mvp_lx_flag != 0 && mva != NULL))) { ++ temporal_luma_motion_vector(s, lc, x0, y0, nPbW, ++ nPbH, mv->ref_idx[LX], ++ mv_rv, LX); + } ++ return; + -+ mv->mv[LX] = mvpcand_list[mvp_lx_flag]; ++use_mva: ++ *mv_rv = *mva; ++ return; +} ++ diff --git a/libavcodec/rpi_hevc_parse.c b/libavcodec/rpi_hevc_parse.c new file mode 100644 index 0000000000..04f9231acc @@ -19497,7 +19949,7 @@ index 0000000000..4b4d032a16 +#endif /* AVCODEC_RPI_HEVC_PARSE_H */ diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c new file mode 100644 -index 0000000000..98e2fd7009 +index 0000000000..0866a26702 --- /dev/null +++ b/libavcodec/rpi_hevc_ps.c @@ -0,0 +1,1940 @@ @@ -20613,7 +21065,7 @@ index 0000000000..98e2fd7009 + + // Inferred parameters + sps->log2_ctb_size = CtbLog2SizeY; -+ sps->log2_min_pu_size = sps->log2_min_cb_size - 1; ++// sps->log2_min_pu_size = sps->log2_min_cb_size - 1; + } + + sps->max_transform_hierarchy_depth_inter = get_ue_golomb_long(gb); @@ -20767,8 +21219,8 @@ index 0000000000..98e2fd7009 + sps->min_cb_height = sps->height >> sps->log2_min_cb_size; + sps->min_tb_width = sps->width >> sps->log2_min_tb_size; + sps->min_tb_height = sps->height >> sps->log2_min_tb_size; -+ sps->min_pu_width = sps->width >> sps->log2_min_pu_size; -+ sps->min_pu_height = sps->height >> sps->log2_min_pu_size; ++ sps->min_pu_width = sps->width >> LOG2_MIN_PU_SIZE; ++ sps->min_pu_height = sps->height >> LOG2_MIN_PU_SIZE; + sps->tb_mask = (1 << (sps->log2_ctb_size - sps->log2_min_tb_size)) - 1; + + sps->qp_bd_offset = 6 * (sps->bit_depth - 8); @@ -21043,7 +21495,7 @@ index 
0000000000..98e2fd7009 + /** + * 6.5 + */ -+ pic_area_in_ctbs = sps->ctb_width * sps->ctb_height; ++ pic_area_in_ctbs = sps->ctb_size; + + pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_rs_to_ts)); + pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_ts_to_rs)); @@ -21443,10 +21895,10 @@ index 0000000000..98e2fd7009 +} diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h new file mode 100644 -index 0000000000..77af463e31 +index 0000000000..11d9e26853 --- /dev/null +++ b/libavcodec/rpi_hevc_ps.h -@@ -0,0 +1,442 @@ +@@ -0,0 +1,444 @@ +/* + * HEVC parameter set parsing + * @@ -21728,7 +22180,9 @@ index 0000000000..77af463e31 + unsigned int log2_min_tb_size; // 2..5 + unsigned int log2_max_trafo_size; + unsigned int log2_ctb_size; // 4..6 -+ unsigned int log2_min_pu_size; // 2..5 (min_cb_size - 1) ++// unsigned int log2_min_pu_size; // 2..5 (min_cb_size - 1) ++#define LOG2_MIN_PU_SIZE 2 ++#define LOG2_MIN_CU_SIZE 3 + + int max_transform_hierarchy_depth_inter; + int max_transform_hierarchy_depth_intra; @@ -21891,10 +22345,10 @@ index 0000000000..77af463e31 +#endif /* AVCODEC_RPI_HEVC_PS_H */ diff --git a/libavcodec/rpi_hevc_refs.c b/libavcodec/rpi_hevc_refs.c new file mode 100644 -index 0000000000..d7745711ab +index 0000000000..8cc5796cf0 --- /dev/null +++ b/libavcodec/rpi_hevc_refs.c -@@ -0,0 +1,515 @@ +@@ -0,0 +1,485 @@ +/* + * HEVC video decoder + * @@ -21926,7 +22380,7 @@ index 0000000000..d7745711ab +#include "hevc.h" +#include "rpi_hevcdec.h" + -+void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCFrame *frame, int flags) ++void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags) +{ + /* frame->frame can be NULL if context init failed */ + if (!frame->frame || !frame->frame->buf[0]) @@ -21936,27 +22390,13 @@ index 0000000000..d7745711ab + if (!frame->flags) { + ff_thread_release_buffer(s->avctx, &frame->tf); + -+ av_buffer_unref(&frame->tab_mvf_buf); -+ 
frame->tab_mvf = NULL; -+ -+ av_buffer_unref(&frame->rpl_buf); -+ av_buffer_unref(&frame->rpl_tab_buf); -+ frame->rpl_tab = NULL; -+ frame->refPicList = NULL; ++ av_buffer_unref(&frame->col_mvf_buf); // OK if already NULL ++ frame->col_mvf = NULL; + + frame->collocated_ref = NULL; + } +} + -+const RefPicList *ff_hevc_rpi_get_ref_list(const HEVCRpiContext * const s, const HEVCFrame * const ref, int x0, int y0) -+{ -+ int x_cb = x0 >> s->ps.sps->log2_ctb_size; -+ int y_cb = y0 >> s->ps.sps->log2_ctb_size; -+ int pic_width_cb = s->ps.sps->ctb_width; -+ int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[y_cb * pic_width_cb + x_cb]; -+ return (const RefPicList *)ref->rpl_tab[ctb_addr_ts]; -+} -+ +void ff_hevc_rpi_clear_refs(HEVCRpiContext *s) +{ + int i; @@ -21973,11 +22413,11 @@ index 0000000000..d7745711ab + ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0); +} + -+static HEVCFrame *alloc_frame(HEVCRpiContext *s) ++static HEVCRpiFrame *alloc_frame(HEVCRpiContext * const s) +{ -+ int i, j, ret; ++ int i, ret; + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCFrame *frame = &s->DPB[i]; ++ HEVCRpiFrame * const frame = &s->DPB[i]; + if (frame->frame->buf[0]) + continue; + @@ -21986,22 +22426,15 @@ index 0000000000..d7745711ab + if (ret < 0) + return NULL; + -+ frame->rpl_buf = av_buffer_allocz(s->pkt.nb_nals * sizeof(RefPicListTab)); -+ if (!frame->rpl_buf) -+ goto fail; -+ -+ frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool); -+ if (!frame->tab_mvf_buf) -+ goto fail; -+ frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data; -+ -+ frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool); -+ if (!frame->rpl_tab_buf) -+ goto fail; -+ frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data; -+ frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height; -+ for (j = 0; j < frame->ctb_count; j++) -+ frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data; ++ frame->col_mvf = NULL; ++ frame->col_mvf_buf = NULL; ++ if (s->used_for_ref && !s->is_irap) ++ { ++ 
frame->col_mvf_buf = av_buffer_pool_get(s->col_mvf_pool); ++ if (!frame->col_mvf_buf) ++ goto fail; ++ frame->col_mvf = (ColMvField *)frame->col_mvf_buf->data; ++ } + + frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD; + frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD); @@ -22018,12 +22451,12 @@ index 0000000000..d7745711ab + +int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc) +{ -+ HEVCFrame *ref; ++ HEVCRpiFrame *ref; + int i; + + /* check that this POC doesn't already exist */ + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCFrame *frame = &s->DPB[i]; ++ HEVCRpiFrame *frame = &s->DPB[i]; + + if (frame->frame->buf[0] && frame->sequence == s->seq_decode && + frame->poc == poc) { @@ -22064,7 +22497,7 @@ index 0000000000..d7745711ab + + if (s->sh.no_output_of_prior_pics_flag == 1 && s->no_rasl_output_flag == 1) { + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCFrame *frame = &s->DPB[i]; ++ HEVCRpiFrame *frame = &s->DPB[i]; + if (!(frame->flags & HEVC_FRAME_FLAG_BUMPING) && frame->poc != s->poc && + frame->sequence == s->seq_output) { + ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT); @@ -22073,7 +22506,7 @@ index 0000000000..d7745711ab + } + + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCFrame *frame = &s->DPB[i]; ++ HEVCRpiFrame *frame = &s->DPB[i]; + if ((frame->flags & HEVC_FRAME_FLAG_OUTPUT) && + frame->sequence == s->seq_output) { + nb_output++; @@ -22090,7 +22523,7 @@ index 0000000000..d7745711ab + return 0; + + if (nb_output) { -+ HEVCFrame *frame = &s->DPB[min_idx]; ++ HEVCRpiFrame *frame = &s->DPB[min_idx]; + if (frame->frame->format == AV_PIX_FMT_VIDEOTOOLBOX && frame->frame->buf[0]->size == 1) + return 0; + @@ -22122,7 +22555,7 @@ index 0000000000..d7745711ab + int i; + + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); 
i++) { -+ HEVCFrame *frame = &s->DPB[i]; ++ HEVCRpiFrame *frame = &s->DPB[i]; + if ((frame->flags) && + frame->sequence == s->seq_output && + frame->poc != s->poc) { @@ -22132,7 +22565,7 @@ index 0000000000..d7745711ab + + if (s->ps.sps && dpb >= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering) { + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCFrame *frame = &s->DPB[i]; ++ HEVCRpiFrame *frame = &s->DPB[i]; + if ((frame->flags) && + frame->sequence == s->seq_output && + frame->poc != s->poc) { @@ -22143,7 +22576,7 @@ index 0000000000..d7745711ab + } + + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCFrame *frame = &s->DPB[i]; ++ HEVCRpiFrame *frame = &s->DPB[i]; + if (frame->flags & HEVC_FRAME_FLAG_OUTPUT && + frame->sequence == s->seq_output && + frame->poc <= min_poc) { @@ -22157,19 +22590,10 @@ index 0000000000..d7745711ab + +static int init_slice_rpl(HEVCRpiContext *s) +{ -+ HEVCFrame *frame = s->ref; -+ int ctb_count = frame->ctb_count; -+ int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; -+ int i; -+ -+ if (s->slice_idx >= frame->rpl_buf->size / sizeof(RefPicListTab)) ++ if (s->slice_idx >= s->rpl_tab_size) + return AVERROR_INVALIDDATA; + -+ for (i = ctb_addr_ts; i < ctb_count; i++) -+ frame->rpl_tab[i] = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx; -+ -+ frame->refPicList = (RefPicList *)frame->rpl_tab[ctb_addr_ts]; -+ ++ s->refPicList = s->rpl_tab[s->slice_idx].refPicList + 0; + return 0; +} + @@ -22193,7 +22617,7 @@ index 0000000000..d7745711ab + + for (list_idx = 0; list_idx < nb_list; list_idx++) { + RefPicList rpl_tmp = { { 0 } }; -+ RefPicList *rpl = &s->ref->refPicList[list_idx]; ++ RefPicList *rpl = &s->refPicList[list_idx]; + + /* The order of the elements is + * ST_CURR_BEF - ST_CURR_AFT - LT_CURR for the L0 and @@ -22243,13 +22667,13 @@ index 0000000000..d7745711ab + return 0; +} + -+static HEVCFrame *find_ref_idx(HEVCRpiContext *s, int poc) ++static HEVCRpiFrame 
*find_ref_idx(HEVCRpiContext *s, int poc) +{ + int i; + int LtMask = (1 << s->ps.sps->log2_max_poc_lsb) - 1; + + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCFrame *ref = &s->DPB[i]; ++ HEVCRpiFrame *ref = &s->DPB[i]; + if (ref->frame->buf[0] && (ref->sequence == s->seq_decode)) { + if ((ref->poc & LtMask) == poc) + return ref; @@ -22257,7 +22681,7 @@ index 0000000000..d7745711ab + } + + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCFrame *ref = &s->DPB[i]; ++ HEVCRpiFrame *ref = &s->DPB[i]; + if (ref->frame->buf[0] && ref->sequence == s->seq_decode) { + if (ref->poc == poc || (ref->poc & LtMask) == poc) + return ref; @@ -22270,15 +22694,15 @@ index 0000000000..d7745711ab + return NULL; +} + -+static void mark_ref(HEVCFrame *frame, int flag) ++static void mark_ref(HEVCRpiFrame *frame, int flag) +{ + frame->flags &= ~(HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF); + frame->flags |= flag; +} + -+static HEVCFrame *generate_missing_ref(HEVCRpiContext *s, int poc) ++static HEVCRpiFrame *generate_missing_ref(HEVCRpiContext *s, int poc) +{ -+ HEVCFrame *frame; ++ HEVCRpiFrame *frame; + int i, x, y; + + frame = alloc_frame(s); @@ -22311,7 +22735,7 @@ index 0000000000..d7745711ab +static int add_candidate_ref(HEVCRpiContext *s, RefPicList *list, + int poc, int ref_flag) +{ -+ HEVCFrame *ref = find_ref_idx(s, poc); ++ HEVCRpiFrame *ref = find_ref_idx(s, poc); + + if (ref == s->ref || list->nb_refs >= HEVC_MAX_REFS) + return AVERROR_INVALIDDATA; @@ -22344,7 +22768,7 @@ index 0000000000..d7745711ab + + /* clear the reference flags on all frames except the current one */ + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { -+ HEVCFrame *frame = &s->DPB[i]; ++ HEVCRpiFrame *frame = &s->DPB[i]; + + if (frame == s->ref) + continue; @@ -27482,234 +27906,210 @@ index 0000000000..3caef20137 + diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h new file mode 100644 -index 0000000000..18128f4311 +index 0000000000..1c364492d0 --- 
/dev/null +++ b/libavcodec/rpi_hevc_transform10.h -@@ -0,0 +1,106 @@ +@@ -0,0 +1,94 @@ +static const unsigned char rpi_hevc_transform10 [] = { -+0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xe8, // 0000 -+0x20, 0x00, 0x00, 0x00, 0x0c, 0xf8, 0x00, 0x88, // 0008 -+0x00, 0x00, 0xc0, 0xf8, 0x00, 0x00, 0x40, 0xe8, // 0010 -+0x00, 0x02, 0x00, 0x00, 0x0c, 0xf8, 0x00, 0xa8, // 0018 -+0x00, 0x00, 0xc0, 0xf8, 0x00, 0x00, 0x00, 0x60, // 0020 -+0x03, 0xe8, 0x20, 0x00, 0x00, 0x00, 0x07, 0xe8, // 0028 -+0x00, 0x02, 0x00, 0x00, 0x08, 0xe8, 0x00, 0x04, // 0030 -+0x00, 0x00, 0x04, 0xe8, 0x40, 0x00, 0x00, 0x00, // 0038 -+0x05, 0xe8, 0x00, 0x02, 0x00, 0x00, 0x39, 0xef, // 0040 -+0xc0, 0xfd, 0xff, 0xff, 0x2b, 0xef, 0x40, 0x00, // 0048 -+0x00, 0x00, 0x5b, 0x7a, 0x5b, 0x7c, 0x4a, 0xc3, // 0050 -+0x50, 0x17, 0x02, 0x6f, 0x02, 0x6a, 0x32, 0x18, // 0058 -+0x0a, 0x6a, 0x16, 0x40, 0x04, 0x18, 0x1a, 0x66, // 0060 -+0x80, 0x90, 0x33, 0x00, 0x0c, 0xf8, 0x00, 0x80, // 0068 -+0x00, 0x00, 0xc0, 0x08, 0x18, 0x00, 0x80, 0x90, // 0070 -+0x5e, 0x00, 0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, // 0078 -+0x20, 0x08, 0x10, 0x00, 0x4c, 0xfe, 0x30, 0xc0, // 0080 -+0x09, 0x04, 0x20, 0x08, 0x00, 0x00, 0x04, 0xfe, // 0088 -+0x00, 0x90, 0x80, 0x02, 0x00, 0x08, 0x02, 0x00, // 0090 -+0x80, 0x90, 0x4d, 0x00, 0x04, 0xff, 0x30, 0xc0, // 0098 -+0x80, 0x03, 0x20, 0x08, 0x14, 0x00, 0x4c, 0xfe, // 00a0 -+0x30, 0xc0, 0x06, 0x04, 0x20, 0x08, 0x00, 0x00, // 00a8 -+0x8c, 0xf8, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x30, // 00b0 -+0x04, 0x00, 0x80, 0x45, 0x71, 0x42, 0xf2, 0x8c, // 00b8 -+0xd1, 0xc0, 0x39, 0xef, 0x40, 0x02, 0x00, 0x00, // 00c0 -+0x00, 0x9e, 0x7f, 0x00, 0x29, 0x03, 0x00, 0xfe, // 00c8 -+0x00, 0x80, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, // 00d0 -+0xb6, 0x40, 0x8c, 0xf8, 0x20, 0x00, 0x00, 0x00, // 00d8 -+0x00, 0x30, 0x18, 0x00, 0x15, 0x40, 0x08, 0xf8, // 00e0 -+0x00, 0x80, 0x00, 0x00, 0xc0, 0x03, 0x14, 0x00, // 00e8 -+0x66, 0xed, 0xe0, 0xff, 0xff, 0xff, 0x88, 0xf8, // 00f0 -+0x20, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x18, 0x00, // 00f8 
-+0x0c, 0x60, 0x64, 0x08, 0x46, 0xc0, 0x44, 0x37, // 0100 -+0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, 0x84, 0x6e, // 0108 -+0x09, 0x18, 0x69, 0xa0, 0x04, 0x5f, 0x1c, 0x8b, // 0110 -+0xf6, 0xc8, 0x45, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0118 -+0x63, 0x1f, 0xb6, 0x40, 0x04, 0xe8, 0x40, 0x00, // 0120 -+0x00, 0x00, 0x05, 0xe8, 0x00, 0x02, 0x00, 0x00, // 0128 -+0x5a, 0x00, 0x46, 0xc0, 0x50, 0x07, 0xa4, 0xff, // 0130 -+0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, 0x3e, 0x00, // 0138 -+0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, 0xe0, 0x03, // 0140 -+0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, 0x00, 0x67, // 0148 -+0x5a, 0x00, 0x46, 0xc0, 0x50, 0x07, 0xa4, 0xff, // 0150 -+0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, 0x3e, 0x00, // 0158 -+0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, 0xe0, 0x03, // 0160 -+0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, 0x00, 0x67, // 0168 -+0x5a, 0x00, 0x00, 0xf6, 0x00, 0x80, 0x00, 0x04, // 0170 -+0x20, 0xed, 0x00, 0x08, 0x00, 0x00, 0x04, 0xe8, // 0178 -+0x20, 0x00, 0x00, 0x00, 0x8e, 0xf8, 0x20, 0x00, // 0180 -+0x00, 0x00, 0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, // 0188 -+0x00, 0x80, 0x81, 0x03, 0x26, 0xed, 0xe0, 0xff, // 0190 -+0xff, 0xff, 0x88, 0xf0, 0x20, 0x00, 0x86, 0x03, // 0198 -+0x08, 0x60, 0x64, 0x08, 0x46, 0xc0, 0x44, 0x37, // 01a0 -+0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, 0xa4, 0x6e, // 01a8 -+0x7f, 0x90, 0xb9, 0xff, 0x65, 0xa0, 0x04, 0x07, // 01b0 -+0x18, 0x8b, 0xf5, 0xc8, 0x41, 0xe8, 0x20, 0x00, // 01b8 -+0x00, 0x00, 0x66, 0x1f, 0x5a, 0x00, 0xe1, 0x40, // 01c0 -+0xf2, 0x40, 0x4f, 0xc3, 0x50, 0x7f, 0x02, 0x6f, // 01c8 -+0x03, 0xe8, 0x80, 0x00, 0x00, 0x00, 0x07, 0xe8, // 01d0 -+0x00, 0x02, 0x00, 0x00, 0xe8, 0x00, 0x08, 0x6d, // 01d8 -+0xe8, 0xbf, 0x80, 0x01, 0x04, 0x18, 0x08, 0xed, // 01e0 -+0x20, 0x10, 0x00, 0x00, 0x89, 0x40, 0x1a, 0x40, // 01e8 -+0x02, 0x6a, 0x2e, 0x18, 0xa1, 0x40, 0x98, 0x40, // 01f0 -+0xf2, 0x4a, 0x07, 0x1e, 0xff, 0x9f, 0xbb, 0xff, // 01f8 -+0x21, 0xed, 0x00, 0x08, 0x00, 0x00, 0x98, 0x40, // 0200 -+0x04, 0xe8, 0x40, 0x00, 0x00, 0x00, 0x95, 0x60, // 0208 -+0x80, 0x90, 0x20, 0x00, 
0x48, 0xe8, 0x00, 0x04, // 0210 -+0x00, 0x00, 0x41, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0218 -+0x80, 0x90, 0x18, 0x00, 0x04, 0xe8, 0x00, 0x02, // 0220 -+0x00, 0x00, 0x65, 0x60, 0x91, 0x40, 0xa8, 0x40, // 0228 -+0x80, 0x90, 0x10, 0x00, 0x48, 0xe8, 0x00, 0x04, // 0230 -+0x00, 0x00, 0x41, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0238 -+0x80, 0x90, 0x08, 0x00, 0x4a, 0xe8, 0x00, 0x08, // 0240 -+0x00, 0x00, 0xf2, 0x8c, 0xd5, 0xc0, 0x29, 0x03, // 0248 -+0xef, 0x03, 0x0c, 0xf8, 0x00, 0x80, 0x00, 0x00, // 0250 -+0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, 0x00, 0x84, // 0258 -+0x40, 0x00, 0xc0, 0xf8, 0x04, 0x00, 0x00, 0x60, // 0260 -+0xff, 0x9f, 0x65, 0xff, 0x00, 0xe8, 0x00, 0x04, // 0268 -+0x00, 0x00, 0xff, 0x9f, 0x70, 0xff, 0x04, 0xff, // 0270 -+0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0278 -+0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0280 -+0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0288 -+0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xe8, // 0290 -+0x40, 0x00, 0x00, 0x00, 0x8c, 0xf8, 0x2f, 0x00, // 0298 -+0x00, 0x00, 0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, // 02a0 -+0xf0, 0xcf, 0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, // 02a8 -+0x11, 0x13, 0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, // 02b0 -+0x20, 0xf7, 0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, // 02b8 -+0xf0, 0xce, 0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, // 02c0 -+0x15, 0x53, 0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, // 02c8 -+0x20, 0xf7, 0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, // 02d0 -+0xf0, 0xcd, 0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, // 02d8 -+0x19, 0x93, 0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, // 02e0 -+0x20, 0xf7, 0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, // 02e8 -+0xf0, 0xcc, 0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, // 02f0 -+0x1d, 0xd3, 0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, // 02f8 -+0x20, 0xf7, 0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, // 0300 -+0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, // 0308 -+0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0310 -+0x14, 0x00, 0x00, 0xed, 0x20, 0x00, 0x00, 0x00, // 0318 -+0x8c, 0xf8, 0x2f, 0x00, 0x00, 0x00, 0xe0, 0x63, // 
0320 -+0x00, 0x00, 0x6f, 0x03, 0x00, 0x00, 0x00, 0x00, // 0328 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0330 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0338 ++0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000 ++0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008 ++0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010 ++0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018 ++0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020 ++0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028 ++0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x02, // 0030 ++0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038 ++0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040 ++0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048 ++0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050 ++0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058 ++0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060 ++0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068 ++0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070 ++0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078 ++0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080 ++0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088 ++0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x06, 0x04, // 0090 ++0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098 ++0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0 ++0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8 ++0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0 ++0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8 ++0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0 ++0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8 ++0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0 ++0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8 ++0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0 ++0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8 ++0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0 ++0xb6, 0x40, 0x04, 
0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8 ++0x00, 0x02, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100 ++0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108 ++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110 ++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118 ++0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120 ++0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128 ++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130 ++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138 ++0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140 ++0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148 ++0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150 ++0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158 ++0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160 ++0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168 ++0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170 ++0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178 ++0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180 ++0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188 ++0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190 ++0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198 ++0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0 ++0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8 ++0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0 ++0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8 ++0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0 ++0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8 ++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0 ++0x04, 0xb0, 0x00, 0x02, 0x65, 0x60, 0x91, 0x40, // 01d8 ++0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0 ++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8 ++0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0 ++0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8 ++0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200 ++0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 
0x00, // 0208 ++0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210 ++0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218 ++0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220 ++0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228 ++0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230 ++0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238 ++0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240 ++0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248 ++0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250 ++0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258 ++0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260 ++0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268 ++0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270 ++0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278 ++0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280 ++0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288 ++0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290 ++0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298 ++0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0 ++0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8 ++0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0 ++0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8 ++0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0 ++0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8 +}; diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h new file mode 100644 -index 0000000000..3557348e30 +index 0000000000..1128a2c054 --- /dev/null +++ b/libavcodec/rpi_hevc_transform8.h -@@ -0,0 +1,106 @@ +@@ -0,0 +1,94 @@ +static const unsigned char rpi_hevc_transform8 [] = { -+0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xe8, // 0000 -+0x20, 0x00, 0x00, 0x00, 0x0c, 0xf8, 0x00, 0x88, // 0008 -+0x00, 0x00, 0xc0, 0xf8, 0x00, 0x00, 0x40, 0xe8, 
// 0010 -+0x00, 0x02, 0x00, 0x00, 0x0c, 0xf8, 0x00, 0xa8, // 0018 -+0x00, 0x00, 0xc0, 0xf8, 0x00, 0x00, 0x00, 0x60, // 0020 -+0x03, 0xe8, 0x20, 0x00, 0x00, 0x00, 0x07, 0xe8, // 0028 -+0x00, 0x02, 0x00, 0x00, 0x08, 0xe8, 0x00, 0x04, // 0030 -+0x00, 0x00, 0x04, 0xe8, 0x40, 0x00, 0x00, 0x00, // 0038 -+0x05, 0xe8, 0x00, 0x08, 0x00, 0x00, 0x39, 0xef, // 0040 -+0xc0, 0xfd, 0xff, 0xff, 0x2b, 0xef, 0x40, 0x00, // 0048 -+0x00, 0x00, 0x5b, 0x7a, 0x5b, 0x7c, 0x4a, 0xc3, // 0050 -+0x50, 0x17, 0x02, 0x6f, 0x02, 0x6a, 0x32, 0x18, // 0058 -+0x0a, 0x6a, 0x16, 0x40, 0x04, 0x18, 0x1a, 0x66, // 0060 -+0x80, 0x90, 0x33, 0x00, 0x0c, 0xf8, 0x00, 0x80, // 0068 -+0x00, 0x00, 0xc0, 0x08, 0x18, 0x00, 0x80, 0x90, // 0070 -+0x5e, 0x00, 0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, // 0078 -+0x20, 0x08, 0x10, 0x00, 0x4c, 0xfe, 0x30, 0xc0, // 0080 -+0x09, 0x04, 0x20, 0x08, 0x00, 0x00, 0x04, 0xfe, // 0088 -+0x00, 0x90, 0x80, 0x02, 0x00, 0x08, 0x02, 0x00, // 0090 -+0x80, 0x90, 0x4d, 0x00, 0x04, 0xff, 0x30, 0xc0, // 0098 -+0x80, 0x03, 0x20, 0x08, 0x14, 0x00, 0x4c, 0xfe, // 00a0 -+0x30, 0xc0, 0x04, 0x04, 0x20, 0x08, 0x00, 0x00, // 00a8 -+0x8c, 0xf8, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x30, // 00b0 -+0x04, 0x00, 0x80, 0x45, 0x71, 0x42, 0xf2, 0x8c, // 00b8 -+0xd1, 0xc0, 0x39, 0xef, 0x40, 0x02, 0x00, 0x00, // 00c0 -+0x00, 0x9e, 0x7f, 0x00, 0x29, 0x03, 0x00, 0xfe, // 00c8 -+0x00, 0x80, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, // 00d0 -+0xb6, 0x40, 0x8c, 0xf8, 0x20, 0x00, 0x00, 0x00, // 00d8 -+0x00, 0x30, 0x18, 0x00, 0x15, 0x40, 0x08, 0xf8, // 00e0 -+0x00, 0x80, 0x00, 0x00, 0xc0, 0x03, 0x14, 0x00, // 00e8 -+0x66, 0xed, 0xe0, 0xff, 0xff, 0xff, 0x88, 0xf8, // 00f0 -+0x20, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x18, 0x00, // 00f8 -+0x0c, 0x60, 0x64, 0x08, 0x46, 0xc0, 0x44, 0x37, // 0100 -+0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, 0x84, 0x6e, // 0108 -+0x09, 0x18, 0x69, 0xa0, 0x04, 0x5f, 0x1c, 0x8b, // 0110 -+0xf6, 0xc8, 0x45, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0118 -+0x63, 0x1f, 0xb6, 0x40, 0x04, 0xe8, 0x40, 0x00, // 0120 -+0x00, 0x00, 0x05, 
0xe8, 0x00, 0x08, 0x00, 0x00, // 0128 -+0x5a, 0x00, 0x46, 0xc0, 0x50, 0x07, 0xa4, 0xff, // 0130 -+0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, 0x3e, 0x00, // 0138 -+0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, 0xe0, 0x03, // 0140 -+0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, 0x00, 0x67, // 0148 -+0x5a, 0x00, 0x46, 0xc0, 0x50, 0x07, 0xa4, 0xff, // 0150 -+0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, 0x3e, 0x00, // 0158 -+0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, 0xe0, 0x03, // 0160 -+0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, 0x00, 0x67, // 0168 -+0x5a, 0x00, 0x00, 0xf6, 0x00, 0x80, 0x00, 0x04, // 0170 -+0x20, 0xed, 0x00, 0x08, 0x00, 0x00, 0x04, 0xe8, // 0178 -+0x20, 0x00, 0x00, 0x00, 0x8e, 0xf8, 0x20, 0x00, // 0180 -+0x00, 0x00, 0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, // 0188 -+0x00, 0x80, 0x81, 0x03, 0x26, 0xed, 0xe0, 0xff, // 0190 -+0xff, 0xff, 0x88, 0xf0, 0x20, 0x00, 0x86, 0x03, // 0198 -+0x08, 0x60, 0x64, 0x08, 0x46, 0xc0, 0x44, 0x37, // 01a0 -+0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, 0xa4, 0x6e, // 01a8 -+0x7f, 0x90, 0xb9, 0xff, 0x65, 0xa0, 0x04, 0x07, // 01b0 -+0x18, 0x8b, 0xf5, 0xc8, 0x41, 0xe8, 0x20, 0x00, // 01b8 -+0x00, 0x00, 0x66, 0x1f, 0x5a, 0x00, 0xe1, 0x40, // 01c0 -+0xf2, 0x40, 0x4f, 0xc3, 0x50, 0x7f, 0x02, 0x6f, // 01c8 -+0x03, 0xe8, 0x80, 0x00, 0x00, 0x00, 0x07, 0xe8, // 01d0 -+0x00, 0x02, 0x00, 0x00, 0xe8, 0x00, 0x08, 0x6d, // 01d8 -+0xe8, 0xbf, 0x80, 0x01, 0x04, 0x18, 0x08, 0xed, // 01e0 -+0x20, 0x10, 0x00, 0x00, 0x89, 0x40, 0x1a, 0x40, // 01e8 -+0x02, 0x6a, 0x2e, 0x18, 0xa1, 0x40, 0x98, 0x40, // 01f0 -+0xf2, 0x4a, 0x07, 0x1e, 0xff, 0x9f, 0xbb, 0xff, // 01f8 -+0x21, 0xed, 0x00, 0x08, 0x00, 0x00, 0x98, 0x40, // 0200 -+0x04, 0xe8, 0x40, 0x00, 0x00, 0x00, 0x95, 0x60, // 0208 -+0x80, 0x90, 0x20, 0x00, 0x48, 0xe8, 0x00, 0x04, // 0210 -+0x00, 0x00, 0x41, 0xe8, 0x20, 0x00, 0x00, 0x00, // 0218 -+0x80, 0x90, 0x18, 0x00, 0x04, 0xe8, 0x00, 0x08, // 0220 -+0x00, 0x00, 0x45, 0x60, 0x91, 0x40, 0xa8, 0x40, // 0228 -+0x80, 0x90, 0x10, 0x00, 0x48, 0xe8, 0x00, 0x04, // 0230 -+0x00, 0x00, 0x41, 0xe8, 0x20, 0x00, 0x00, 
0x00, // 0238 -+0x80, 0x90, 0x08, 0x00, 0x4a, 0xe8, 0x00, 0x08, // 0240 -+0x00, 0x00, 0xf2, 0x8c, 0xd5, 0xc0, 0x29, 0x03, // 0248 -+0xef, 0x03, 0x0c, 0xf8, 0x00, 0x80, 0x00, 0x00, // 0250 -+0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, 0x00, 0x84, // 0258 -+0x40, 0x00, 0xc0, 0xf8, 0x04, 0x00, 0x00, 0x60, // 0260 -+0xff, 0x9f, 0x65, 0xff, 0x00, 0xe8, 0x00, 0x04, // 0268 -+0x00, 0x00, 0xff, 0x9f, 0x70, 0xff, 0x04, 0xff, // 0270 -+0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0278 -+0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0280 -+0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0288 -+0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xe8, // 0290 -+0x40, 0x00, 0x00, 0x00, 0x8c, 0xf8, 0x2f, 0x00, // 0298 -+0x00, 0x00, 0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, // 02a0 -+0xf0, 0xcf, 0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, // 02a8 -+0x11, 0x13, 0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, // 02b0 -+0x20, 0xf7, 0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, // 02b8 -+0xf0, 0xce, 0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, // 02c0 -+0x15, 0x53, 0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, // 02c8 -+0x20, 0xf7, 0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, // 02d0 -+0xf0, 0xcd, 0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, // 02d8 -+0x19, 0x93, 0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, // 02e0 -+0x20, 0xf7, 0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, // 02e8 -+0xf0, 0xcc, 0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, // 02f0 -+0x1d, 0xd3, 0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, // 02f8 -+0x20, 0xf7, 0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, // 0300 -+0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, // 0308 -+0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0310 -+0x14, 0x00, 0x00, 0xed, 0x20, 0x00, 0x00, 0x00, // 0318 -+0x8c, 0xf8, 0x2f, 0x00, 0x00, 0x00, 0xe0, 0x63, // 0320 -+0x00, 0x00, 0x6f, 0x03, 0x00, 0x00, 0x00, 0x00, // 0328 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0330 -+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0338 ++0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000 ++0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008 ++0xc0, 0xf8, 
0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010 ++0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018 ++0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020 ++0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028 ++0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x08, // 0030 ++0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038 ++0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040 ++0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048 ++0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050 ++0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058 ++0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060 ++0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068 ++0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070 ++0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078 ++0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080 ++0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088 ++0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x04, 0x04, // 0090 ++0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098 ++0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0 ++0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8 ++0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0 ++0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8 ++0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0 ++0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8 ++0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0 ++0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8 ++0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0 ++0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8 ++0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0 ++0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8 ++0x00, 0x08, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100 ++0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108 ++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110 ++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118 ++0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 
0x10, 0x00, // 0120 ++0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128 ++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130 ++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138 ++0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140 ++0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148 ++0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150 ++0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158 ++0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160 ++0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168 ++0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170 ++0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178 ++0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180 ++0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188 ++0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190 ++0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198 ++0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0 ++0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8 ++0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0 ++0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8 ++0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0 ++0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8 ++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0 ++0x04, 0xb0, 0x00, 0x08, 0x45, 0x60, 0x91, 0x40, // 01d8 ++0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0 ++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8 ++0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0 ++0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8 ++0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200 ++0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208 ++0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210 ++0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218 ++0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220 ++0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228 ++0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230 ++0xe0, 
0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238 ++0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240 ++0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248 ++0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250 ++0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258 ++0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260 ++0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268 ++0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270 ++0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278 ++0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280 ++0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288 ++0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290 ++0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298 ++0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0 ++0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8 ++0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0 ++0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8 ++0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0 ++0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8 +}; diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c new file mode 100644 -index 0000000000..eef98e5643 +index 0000000000..25ae294ff4 --- /dev/null +++ b/libavcodec/rpi_hevcdec.c -@@ -0,0 +1,5820 @@ +@@ -0,0 +1,6013 @@ +/* + * HEVC video Decoder + * @@ -27957,7 +28357,7 @@ index 0000000000..eef98e5643 + +// We expect this to be called with ln = (log2_cb_size - 3) so range = -1..3 +// (4 not required) -+static void set_cabac_stash(uint8_t * b_u, uint8_t * b_l, const int ln, unsigned int a) ++static void set_stash2(uint8_t * b_u, uint8_t * b_l, const int ln, unsigned int a) +{ + switch (ln) + { @@ -27984,6 +28384,18 @@ index 0000000000..eef98e5643 + *(uint32_t *)b_l = a; + *(uint32_t *)(b_l + 4) = a; + break; ++ case 4: ++ a |= a << 8; ++ a |= a << 16; ++ 
*(uint32_t *)b_u = a; ++ *(uint32_t *)(b_u + 4) = a; ++ *(uint32_t *)(b_u + 8) = a; ++ *(uint32_t *)(b_u + 12) = a; ++ *(uint32_t *)b_l = a; ++ *(uint32_t *)(b_l + 4) = a; ++ *(uint32_t *)(b_l + 8) = a; ++ *(uint32_t *)(b_l + 12) = a; ++ break; + } +} + @@ -28023,7 +28435,7 @@ index 0000000000..eef98e5643 + + switch (ln) + { -+ case 0: // 1 ++ default: // 1 + f[0] |= 1 << sh; + break; + case 1: // 3 * 2 @@ -28038,7 +28450,7 @@ index 0000000000..eef98e5643 + f[stride * 2] |= n; + f[stride * 3] |= n; + break; -+ default: // 0xff * 8 ++ case 3: // 0xff * 8 + for (n = 0; n != 8; ++n, f += stride) + *f = 0xff; + break; @@ -28556,7 +28968,7 @@ index 0000000000..eef98e5643 +} + +void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const HEVCFrame * const ref, const int val, const int field) ++ const HEVCRpiFrame * const ref, const int val, const int field) +{ + if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) { + HEVCRpiContext *const fs = ref->tf.owner[field]->priv_data; @@ -28692,34 +29104,41 @@ index 0000000000..eef98e5643 + av_freep(&s->cabac_stash_up); + s->cabac_stash_left = NULL; // freed with _up + -+ av_freep(&s->tab_ipm); ++ av_freep(&s->mvf_up); ++ av_freep(&s->mvf_left); ++ + av_freep(&s->is_pcm); ++ av_freep(&s->is_intra_store); ++ s->is_intra = NULL; ++ av_freep(&s->rpl_tab); ++ s->rpl_tab_size = 0; + + av_freep(&s->qp_y_tab); + av_freep(&s->tab_slice_address); + av_freep(&s->filter_slice_edges); + + av_freep(&s->bs_horizontal); -+ av_freep(&s->bs_vertical); ++ s->bs_vertical = NULL; // freed with H + av_freep(&s->bsf_stash_left); + av_freep(&s->bsf_stash_up); + ++ av_freep(&s->rpl_up); ++ av_freep(&s->rpl_left); ++ + alloc_entry_points(&s->sh, 0); + -+ av_buffer_pool_uninit(&s->tab_mvf_pool); -+ av_buffer_pool_uninit(&s->rpl_tab_pool); ++ av_buffer_pool_uninit(&s->col_mvf_pool); +} + +/* allocate arrays that depend on frame dimensions */ -+static int pic_arrays_init(HEVCRpiContext *s, 
const HEVCRpiSPS *sps) ++static int pic_arrays_init(HEVCRpiContext * const s, const HEVCRpiSPS * const sps) +{ -+ int log2_min_cb_size = sps->log2_min_cb_size; -+ int width = sps->width; -+ int height = sps->height; -+ int pic_size_in_ctb = ((width >> log2_min_cb_size) + 1) * ++ const unsigned int log2_min_cb_size = sps->log2_min_cb_size; ++ const unsigned int width = sps->width; ++ const unsigned int height = sps->height; ++ const unsigned int pic_size_in_cb = ((width >> log2_min_cb_size) + 1) * + ((height >> log2_min_cb_size) + 1); -+ int ctb_count = sps->ctb_width * sps->ctb_height; -+ int min_pu_size = sps->min_pu_width * sps->min_pu_height; ++ const unsigned int ctb_count = sps->ctb_size; + + { + unsigned int w = ((width + HEVC_RPI_BS_STRIDE1_PEL_MASK) & ~HEVC_RPI_BS_STRIDE1_PEL_MASK); @@ -28739,36 +29158,45 @@ index 0000000000..eef98e5643 + if (s->cabac_stash_up == NULL) + goto fail; + -+ s->tab_ipm = av_mallocz(min_pu_size); ++ // Round width up to max ctb size ++ s->mvf_up = av_malloc((((width + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up)); ++ // * Only needed if we have H tiles ++ s->mvf_left = av_malloc((((height + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up)); ++ + // We can overread by 1 line & one byte in deblock so alloc & zero + // We don't need to zero the extra @ start of frame as it will never be + // written + s->is_pcm = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1); -+ if (!s->tab_ipm || !s->is_pcm) ++ s->is_intra_store = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1); ++ if (s->is_pcm == NULL || s->is_intra_store == NULL) + goto fail; + + s->filter_slice_edges = av_mallocz(ctb_count); -+ s->tab_slice_address = av_malloc_array(pic_size_in_ctb, ++ s->tab_slice_address = av_malloc_array(ctb_count, + sizeof(*s->tab_slice_address)); -+ s->qp_y_tab = av_malloc_array(pic_size_in_ctb, ++ s->qp_y_tab = av_malloc_array(pic_size_in_cb, + sizeof(*s->qp_y_tab)); + if (!s->qp_y_tab || !s->filter_slice_edges || 
!s->tab_slice_address) + goto fail; + -+ s->bs_horizontal = av_mallocz(s->bs_size); -+ s->bs_vertical = av_mallocz(s->bs_size); -+ if (s->bs_horizontal == NULL || s->bs_vertical == NULL) ++ s->bs_horizontal = av_mallocz(s->bs_size * 2); ++ s->bs_vertical = s->bs_horizontal + s->bs_size; ++ if (s->bs_horizontal == NULL) ++ goto fail; ++ ++ s->rpl_up = av_mallocz(sps->ctb_width * sizeof(*s->rpl_up)); ++ s->rpl_left = av_mallocz(sps->ctb_height * sizeof(*s->rpl_left)); ++ if (s->rpl_left == NULL || s->rpl_up == NULL) + goto fail; + + if ((s->bsf_stash_left = av_mallocz(((height + 63) & ~63) >> 4)) == NULL || + (s->bsf_stash_up = av_mallocz(((width + 63) & ~63) >> 4)) == NULL) + goto fail; + -+ s->tab_mvf_pool = av_buffer_pool_init(min_pu_size * sizeof(MvField), -+ av_buffer_allocz); -+ s->rpl_tab_pool = av_buffer_pool_init(ctb_count * sizeof(RefPicListTab), ++ s->col_mvf_stride = (width + 15) >> 4; ++ s->col_mvf_pool = av_buffer_pool_init(((height + 15) >> 4) * s->col_mvf_stride * sizeof(ColMvField), + av_buffer_allocz); -+ if (!s->tab_mvf_pool || !s->rpl_tab_pool) ++ if (s->col_mvf_pool == NULL) + goto fail; + + return 0; @@ -29186,10 +29614,9 @@ index 0000000000..eef98e5643 + if (s->ps.pps->dependent_slice_segments_enabled_flag) + sh->dependent_slice_segment_flag = get_bits1(gb); + -+ slice_address_length = av_ceil_log2(s->ps.sps->ctb_width * -+ s->ps.sps->ctb_height); ++ slice_address_length = av_ceil_log2(s->ps.sps->ctb_size); + sh->slice_segment_addr = get_bitsz(gb, slice_address_length); -+ if (sh->slice_segment_addr >= s->ps.sps->ctb_width * s->ps.sps->ctb_height) { ++ if (sh->slice_segment_addr >= s->ps.sps->ctb_size) { + av_log(s->avctx, AV_LOG_ERROR, + "Invalid slice segment address: %u.\n", + sh->slice_segment_addr); @@ -30043,11 +30470,11 @@ index 0000000000..eef98e5643 +} + + -+static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCFrame * const ref, -+ const Mv * const mv, const int y0, const int height) 
++static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCRpiFrame * const ref, ++ const MvXY xy, const int y0, const int height) +{ + if (s->threads_type != 0) { -+ const int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9); ++ const int y = FFMAX(0, (MV_Y(xy) >> 2) + y0 + height + 9); + + // Progress has to be attached to current job as the actual wait + // is in worker_core which can't use lc @@ -30060,8 +30487,8 @@ index 0000000000..eef98e5643 + +static void hevc_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, + const int x0, const int y0, const int nPbW, -+ const int nPbH, const int log2_cb_size, const int part_idx, -+ const int merge_idx, MvField * const mv) ++ const int nPbH, ++ HEVCRpiMvField * const mv) +{ + enum InterPredIdc inter_pred_idc = PRED_L0; + int mvp_flag; @@ -30072,34 +30499,33 @@ index 0000000000..eef98e5643 + inter_pred_idc = ff_hevc_rpi_inter_pred_idc_decode(lc, nPbW, nPbH); + + if (inter_pred_idc != PRED_L1) { ++ MvXY mvd; ++ + if (s->sh.nb_refs[L0]) + mv->ref_idx[0]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L0]); + + mv->pred_flag = PF_L0; -+ ff_hevc_rpi_hls_mvd_coding(lc); ++ mvd = ff_hevc_rpi_hls_mvd_coding(lc); + mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); -+ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, avail, -+ part_idx, merge_idx, mv, mvp_flag, 0); -+ mv->mv[0].x += lc->pu.mvd.x; -+ mv->mv[0].y += lc->pu.mvd.y; ++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail, ++ mv, mvp_flag, 0); ++ mv->xy[0] = mvxy_add(mv->xy[0], mvd); + } + + if (inter_pred_idc != PRED_L0) { ++ MvXY mvd = 0; ++ + if (s->sh.nb_refs[L1]) -+ mv->ref_idx[1]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L1]); ++ mv->ref_idx[1] = ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L1]); + -+ if (s->sh.mvd_l1_zero_flag == 1 && inter_pred_idc == PRED_BI) { -+ AV_ZERO32(&lc->pu.mvd); -+ } else { -+ ff_hevc_rpi_hls_mvd_coding(lc); -+ } ++ if 
(s->sh.mvd_l1_zero_flag != 1 || inter_pred_idc != PRED_BI) ++ mvd = ff_hevc_rpi_hls_mvd_coding(lc); + + mv->pred_flag += PF_L1; + mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); -+ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, avail, -+ part_idx, merge_idx, mv, mvp_flag, 1); -+ mv->mv[1].x += lc->pu.mvd.x; -+ mv->mv[1].y += lc->pu.mvd.y; ++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail, ++ mv, mvp_flag, 1); ++ mv->xy[1] = mvxy_add(mv->xy[1], mvd); + } +} + @@ -30217,14 +30643,14 @@ index 0000000000..eef98e5643 +rpi_pred_y(const HEVCRpiContext *const s, HEVCRpiJob * const jb, + const int x0, const int y0, + const int nPbW, const int nPbH, -+ const Mv *const mv, ++ const MvXY mv_xy, + const int weight_mul, + const int weight_offset, + AVFrame *const src_frame) +{ + const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); -+ const unsigned int mx = mv->x & 3; -+ const unsigned int my = mv->y & 3; ++ const unsigned int mx = MV_X(mv_xy) & 3; ++ const unsigned int my = MV_Y(mv_xy) & 3; + const unsigned int my_mx = (my << 8) | mx; + const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; + const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame); @@ -30235,8 +30661,8 @@ index 0000000000..eef98e5643 + + if (my_mx == 0) + { -+ const int x1 = x0 + (mv->x >> 2); -+ const int y1 = y0 + (mv->y >> 2); ++ const int x1 = x0 + (MV_X(mv_xy) >> 2); ++ const int y1 = y0 + (MV_Y(mv_xy) >> 2); + const int bh = nPbH; + + for (int start_x = 0; start_x < nPbW; start_x += 16) @@ -30276,8 +30702,8 @@ index 0000000000..eef98e5643 + } + else + { -+ const int x1_m3 = x0 + (mv->x >> 2) - 3; -+ const int y1_m3 = y0 + (mv->y >> 2) - 3; ++ const int x1_m3 = x0 + (MV_X(mv_xy) >> 2) - 3; ++ const int y1_m3 = y0 + (MV_Y(mv_xy) >> 2) - 3; + const unsigned int bh = nPbH; + int start_x = 0; + @@ -30380,19 +30806,19 @@ index 0000000000..eef98e5643 +rpi_pred_y_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb, + const int x0, const int 
y0, + const int nPbW, const int nPbH, -+ const struct MvField *const mv_field, ++ const struct HEVCRpiMvField *const mv_field, + const AVFrame *const src_frame, + const AVFrame *const src_frame2) +{ + const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); -+ const Mv * const mv = mv_field->mv + 0; -+ const Mv * const mv2 = mv_field->mv + 1; ++ const MvXY const mv = mv_field->xy[0]; ++ const MvXY const mv2 = mv_field->xy[1]; + -+ const unsigned int mx = mv->x & 3; -+ const unsigned int my = mv->y & 3; ++ const unsigned int mx = MV_X(mv) & 3; ++ const unsigned int my = MV_Y(mv) & 3; + const unsigned int my_mx = (my<<8) | mx; -+ const unsigned int mx2 = mv2->x & 3; -+ const unsigned int my2 = mv2->y & 3; ++ const unsigned int mx2 = MV_X(mv2) & 3; ++ const unsigned int my2 = MV_Y(mv2) & 3; + const unsigned int my2_mx2 = (my2<<8) | mx2; + const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; + const unsigned int ref_idx0 = mv_field->ref_idx[0]; @@ -30410,10 +30836,10 @@ index 0000000000..eef98e5643 + + if (my2_mx2_my_mx == 0) + { -+ const int x1 = x0 + (mv->x >> 2); -+ const int y1 = y0 + (mv->y >> 2); -+ const int x2 = x0 + (mv2->x >> 2); -+ const int y2 = y0 + (mv2->y >> 2); ++ const int x1 = x0 + (MV_X(mv) >> 2); ++ const int y1 = y0 + (MV_Y(mv) >> 2); ++ const int x2 = x0 + (MV_X(mv2) >> 2); ++ const int y2 = y0 + (MV_Y(mv2) >> 2); + const int bh = nPbH; + + // Can do chunks a full 16 wide if we don't want the H filter @@ -30454,10 +30880,10 @@ index 0000000000..eef98e5643 + else + { + // Filter requires a run-up of 3 -+ const int x1 = x0 + (mv->x >> 2) - 3; -+ const int y1 = y0 + (mv->y >> 2) - 3; -+ const int x2 = x0 + (mv2->x >> 2) - 3; -+ const int y2 = y0 + (mv2->y >> 2) - 3; ++ const int x1 = x0 + (MV_X(mv) >> 2) - 3; ++ const int y1 = y0 + (MV_Y(mv) >> 2) - 3; ++ const int x2 = x0 + (MV_X(mv2) >> 2) - 3; ++ const int y2 = y0 + (MV_Y(mv2) >> 2) - 3; + const int bh = nPbH; + + for (int start_x=0; start_x < nPbW; start_x += 8) @@ -30512,7 
+30938,7 @@ index 0000000000..eef98e5643 +rpi_pred_c(const HEVCRpiContext * const s, HEVCRpiJob * const jb, + const unsigned int lx, const int x0_c, const int y0_c, + const int nPbW_c, const int nPbH_c, -+ const Mv * const mv, ++ const MvXY const mv, + const int16_t * const c_weights, + const int16_t * const c_offsets, + AVFrame * const src_frame) @@ -30521,11 +30947,11 @@ index 0000000000..eef98e5643 + const int hshift = 1; // = s->ps.sps->hshift[1]; + const int vshift = 1; // = s->ps.sps->vshift[1]; + -+ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; -+ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; ++ const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (MV_Y(mv) >> (2 + hshift)) - 1; + const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame); -+ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)]; -+ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)]; ++ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_X(mv), 2 + hshift) << (1 - hshift)]; ++ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_Y(mv), 2 + vshift) << (1 - vshift)]; + const uint32_t wo_u = PACK2(offset_depth_adj(s, c_offsets[0]) * 2 + 1, c_weights[0]); + const uint32_t wo_v = PACK2(offset_depth_adj(s, c_offsets[1]) * 2 + 1, c_weights[1]); + qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; @@ -30563,7 +30989,7 @@ index 0000000000..eef98e5643 +rpi_pred_c_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb, + const int x0_c, const int y0_c, + const int nPbW_c, const int nPbH_c, -+ const struct MvField * const mv_field, ++ const struct HEVCRpiMvField * const mv_field, + const int16_t * const c_weights, + const int16_t * const c_offsets, + const int16_t * const c_weights2, @@ -30574,23 +31000,23 @@ index 0000000000..eef98e5643 + const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); + const int hshift = 1; // 
s->ps.sps->hshift[1]; + const int vshift = 1; // s->ps.sps->vshift[1]; -+ const Mv * const mv = mv_field->mv + 0; -+ const Mv * const mv2 = mv_field->mv + 1; ++ const MvXY const mv = mv_field->xy[0]; ++ const MvXY const mv2 = mv_field->xy[1]; + -+ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift); -+ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift); ++ const unsigned int mx = av_mod_uintp2(MV_X(mv), 2 + hshift); ++ const unsigned int my = av_mod_uintp2(MV_Y(mv), 2 + vshift); + const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; + const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector -+ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; -+ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; ++ const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (MV_Y(mv) >> (2 + hshift)) - 1; + -+ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift); -+ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift); ++ const unsigned int mx2 = av_mod_uintp2(MV_X(mv2), 2 + hshift); ++ const unsigned int my2 = av_mod_uintp2(MV_Y(mv2), 2 + vshift); + const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; + const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector + -+ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1; -+ const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1; ++ const int x2_c = x0_c + (MV_X(mv2) >> (2 + hshift)) - 1; ++ const int y2_c = y0_c + (MV_Y(mv2) >> (2 + hshift)) - 1; + + const uint32_t wo_u2 = PACK2(offset_depth_adj(s, c_offsets[0] + c_offsets2[0]) + 1, c_weights2[0]); + const uint32_t wo_v2 = PACK2(offset_depth_adj(s, c_offsets[1] + c_offsets2[1]) + 1, c_weights2[1]); @@ -30637,22 +31063,65 @@ index 0000000000..eef98e5643 +} + + ++static inline void ++col_stash(const HEVCRpiContext * const s, ++ const unsigned int x0, const unsigned int y0, const unsigned int w0, const unsigned int h0, ++ const 
HEVCRpiMvField * const mvf) ++{ ++ ColMvField * const col_mvf = s->ref->col_mvf; ++ const unsigned int x = (x0 + 15) >> 4; ++ const unsigned int y = (y0 + 15) >> 4; ++ const unsigned int w = ((x0 + 15 + w0) >> 4) - x; ++ const unsigned int h = ((y0 + 15 + h0) >> 4) - y; ++ ++ if (col_mvf != NULL && w != 0 && h != 0) ++ { ++ // Only record MV from the top left of the 16x16 block ++ ++ const RefPicList * const rpl = s->refPicList; ++ const ColMvField cmv = { ++ .L = { ++ { ++ .poc = (mvf->pred_flag & PF_L0) == 0 ? ++ COL_POC_INTRA : ++ COL_POC_MAKE_INTER(rpl[0].isLongTerm[mvf->ref_idx[0]], rpl[0].list[mvf->ref_idx[0]]), ++ .xy = mvf->xy[0] ++ }, ++ { ++ .poc = (mvf->pred_flag & PF_L1) == 0 ? ++ COL_POC_INTRA : ++ COL_POC_MAKE_INTER(rpl[1].isLongTerm[mvf->ref_idx[1]], rpl[1].list[mvf->ref_idx[1]]), ++ .xy = mvf->xy[1] ++ } ++ } ++ }; ++ ++ ColMvField * p = col_mvf + y * s->col_mvf_stride + x; ++ const unsigned int stride = s->col_mvf_stride - w; ++ unsigned int j = h; ++ ++ do ++ { ++ unsigned int k = w; ++ do ++ { ++ *p++ = cmv; ++ } while (--k != 0); ++ p += stride; ++ } while (--j != 0); ++ } ++} ++ +static void hls_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, -+ const int x0, const int y0, -+ const int nPbW, const int nPbH, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int nPbW, const unsigned int nPbH, + const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx) +{ + HEVCRpiJob * const jb = lc->jb0; + -+ struct MvField current_mv = {{{ 0 }}}; -+ -+ int min_pu_width = s->ps.sps->min_pu_width; -+ -+ MvField * const tab_mvf = s->ref->tab_mvf; -+ const RefPicList *const refPicList = s->ref->refPicList; -+ const HEVCFrame *ref0 = NULL, *ref1 = NULL; -+ int x_pu, y_pu; -+ int i, j; ++ struct HEVCRpiMvField current_mv = {{0}}; ++ const RefPicList *const refPicList = s->refPicList; ++ const HEVCRpiFrame *ref0 = NULL, *ref1 = NULL; + + if (lc->cu.pred_mode != MODE_SKIP) + lc->pu.merge_flag = 
ff_hevc_rpi_merge_flag_decode(lc); @@ -30664,28 +31133,34 @@ index 0000000000..eef98e5643 + ff_hevc_rpi_luma_mv_merge_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, + partIdx, merge_idx, &current_mv); + } else { -+ hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, -+ partIdx, 0, &current_mv); ++ hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, &current_mv); + } + -+ x_pu = x0 >> s->ps.sps->log2_min_pu_size; -+ y_pu = y0 >> s->ps.sps->log2_min_pu_size; ++ { ++ HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0); ++ unsigned int i, j; + -+ for (j = 0; j < nPbH >> s->ps.sps->log2_min_pu_size; j++) -+ for (i = 0; i < nPbW >> s->ps.sps->log2_min_pu_size; i++) -+ tab_mvf[(y_pu + j) * min_pu_width + x_pu + i] = current_mv; ++ for (j = 0; j < nPbH >> LOG2_MIN_PU_SIZE; j++) ++ { ++ for (i = 0; i < nPbW >> LOG2_MIN_PU_SIZE; i++) ++ p[i] = current_mv; ++ p += MVF_STASH_WIDTH_PU; ++ } ++ } ++ ++ col_stash(s, x0, y0, nPbW, nPbH, &current_mv); + + if (current_mv.pred_flag & PF_L0) { + ref0 = refPicList[0].ref[current_mv.ref_idx[0]]; + if (!ref0) + return; -+ hevc_await_progress(s, lc, ref0, &current_mv.mv[0], y0, nPbH); ++ hevc_await_progress(s, lc, ref0, current_mv.xy[0], y0, nPbH); + } + if (current_mv.pred_flag & PF_L1) { + ref1 = refPicList[1].ref[current_mv.ref_idx[1]]; + if (!ref1) + return; -+ hevc_await_progress(s, lc, ref1, &current_mv.mv[1], y0, nPbH); ++ hevc_await_progress(s, lc, ref1, current_mv.xy[1], y0, nPbH); + } + + if (current_mv.pred_flag == PF_L0) { @@ -30694,12 +31169,12 @@ index 0000000000..eef98e5643 + const int nPbW_c = nPbW >> ctx_hshift(s, 1); + const int nPbH_c = nPbH >> ctx_vshift(s, 1); + -+ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.mv + 0, ++ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[0], + s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]], + ref0->frame); + + if (ctx_cfmt(s) != 0) { -+ rpi_pred_c(s, jb, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0, ++ rpi_pred_c(s, jb, 0, x0_c, y0_c, nPbW_c, nPbH_c,
current_mv.xy[0], + s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]], + ref0->frame); + return; @@ -30710,12 +31185,12 @@ index 0000000000..eef98e5643 + const int nPbW_c = nPbW >> ctx_hshift(s, 1); + const int nPbH_c = nPbH >> ctx_vshift(s, 1); + -+ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.mv + 1, ++ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[1], + s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]], + ref1->frame); + + if (ctx_cfmt(s) != 0) { -+ rpi_pred_c(s, jb, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1, ++ rpi_pred_c(s, jb, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[1], + s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]], + ref1->frame); + return; @@ -30747,30 +31222,65 @@ index 0000000000..eef98e5643 + const unsigned int log2_cb_size, + const unsigned int ipm) +{ -+ const unsigned int min_pu_width = s->ps.sps->min_pu_width; -+ const unsigned int x_pu = x0 >> s->ps.sps->log2_min_pu_size; -+ const unsigned int y_pu = y0 >> s->ps.sps->log2_min_pu_size; ++ const unsigned int x_pu = x0 >> LOG2_MIN_PU_SIZE; ++ const unsigned int y_pu = y0 >> LOG2_MIN_PU_SIZE; + -+ set_bytes(s->tab_ipm + y_pu * min_pu_width + x_pu, min_pu_width, log2_cb_size - s->ps.sps->log2_min_pu_size, ipm); ++ { ++ const unsigned int ctb_mask = ~(~0U << (s->ps.sps->log2_ctb_size - LOG2_MIN_PU_SIZE)); ++ set_stash2(lc->ipm_left + (y_pu & ctb_mask), lc->ipm_up + (x_pu & ctb_mask), log2_cb_size - LOG2_MIN_PU_SIZE, ipm); ++ } + -+ if (lc->cu.pred_mode == MODE_INTRA) ++ // If IRAP then everything is Intra & we avoid ever looking at these ++ // stashes so don't bother setting them ++ if (!s->is_irap && lc->cu.pred_mode == MODE_INTRA) + { -+ unsigned int j, k; -+ MvField * tab_mvf = s->ref->tab_mvf + y_pu * min_pu_width + x_pu; -+ const unsigned int size_in_pus = (1 << log2_cb_size) >> s->ps.sps->log2_min_pu_size; ++ if (s->is_intra != NULL) ++ { ++ 
set_bits(s->is_intra + (y0 >> LOG2_MIN_CU_SIZE) * s->ps.sps->pcm_width, x0 >> LOG2_MIN_CU_SIZE, s->ps.sps->pcm_width, log2_cb_size - LOG2_MIN_CU_SIZE); ++ } + -+ if (size_in_pus <= 1) -+ tab_mvf[0].pred_flag = PF_INTRA; -+ else + { -+ for (j = 0; j < size_in_pus; j++, tab_mvf += min_pu_width) -+ for (k = 0; k < size_in_pus; k++) -+ tab_mvf[k].pred_flag = PF_INTRA; ++ HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0); ++ const unsigned int size_in_pus = (1 << log2_cb_size) >> LOG2_MIN_PU_SIZE; // min_pu <= log2_cb so >= 1 ++ unsigned int n = size_in_pus; ++ ++ do ++ { ++ memset(p, 0, size_in_pus * sizeof(*p)); ++ p += MVF_STASH_WIDTH_PU; ++ } while (--n != 0); ++ } ++ ++ ++ if (s->ref->col_mvf != NULL && ((x0 | y0) & 0xf) == 0) ++ { ++ // Only record top left stuff ++ // Blocks should always be aligned on size boundaries ++ // so cannot have overflow from a small block ++ ++ ColMvField * p = s->ref->col_mvf + (y0 >> 4) * s->col_mvf_stride + (x0 >> 4); ++ const unsigned int size_in_col = log2_cb_size < 4 ?
1 : (1 << (log2_cb_size - 4)); ++ const unsigned int stride = s->col_mvf_stride - size_in_col; ++ unsigned int j = size_in_col; ++ ++ do ++ { ++ unsigned int k = size_in_col; ++ do ++ { ++ p->L[0].poc = COL_POC_INTRA; ++ p->L[0].xy = 0; ++ p->L[1].poc = COL_POC_INTRA; ++ p->L[1].xy = 0; ++ ++p; ++ } while (--k != 0); ++ p += stride; ++ } while (--j != 0); + } + } +} + -+static void intra_prediction_unit_default_value(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++static inline void intra_prediction_unit_default_value(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, + const unsigned int x0, const unsigned int y0, + const unsigned int log2_cb_size) +{ @@ -30786,17 +31296,14 @@ index 0000000000..eef98e5643 + int prev_intra_luma_pred_flag, + const unsigned int idx) +{ -+ int x_pu = x0 >> s->ps.sps->log2_min_pu_size; -+ int y_pu = y0 >> s->ps.sps->log2_min_pu_size; -+ int min_pu_width = s->ps.sps->min_pu_width; -+ int x0b = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size); -+ int y0b = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size); ++ const unsigned int ctb_mask = ~(~0U << s->ps.sps->log2_ctb_size); ++ int xb_pu = (x0 & ctb_mask) >> LOG2_MIN_PU_SIZE; ++ int yb_pu = (y0 & ctb_mask) >> LOG2_MIN_PU_SIZE; + -+ // intra_pred_mode prediction does not cross vertical CTB boundaries -+ const unsigned int cand_up = y0b != 0 ? -+ s->tab_ipm[(y_pu - 1) * min_pu_width + x_pu] : INTRA_DC; -+ const unsigned int cand_left = ((lc->ctb_avail & AVAIL_L) != 0 || x0b) ? -+ s->tab_ipm[y_pu * min_pu_width + x_pu - 1] : INTRA_DC; ++ // Up does not cross boundaries so as we always scan 1 slice-tile-line in an ++ // lc we can just keep 1 CTB lR stashes ++ const unsigned int cand_up = yb_pu == 0 ? INTRA_DC : lc->ipm_up[xb_pu]; ++ const unsigned int cand_left = ((lc->ctb_avail & AVAIL_L) == 0 && xb_pu == 0) ?
INTRA_DC : lc->ipm_left[yb_pu]; + + int intra_pred_mode; + int a, b, c; @@ -31075,7 +31582,7 @@ index 0000000000..eef98e5643 + + set_bytes(s->qp_y_tab + y_cb * min_cb_width + x_cb, min_cb_width, log2_cb_size - log2_min_cb_size, lc->qp_y & 0xff); + -+ set_cabac_stash(s->cabac_stash_up + (x0 >> 3), s->cabac_stash_left + (y0 >> 3), log2_cb_size - 3, (lc->ct_depth << 1) | skip_flag); ++ set_stash2(s->cabac_stash_up + (x0 >> 3), s->cabac_stash_left + (y0 >> 3), log2_cb_size - 3, (lc->ct_depth << 1) | skip_flag); + + return 0; +} @@ -31959,11 +32466,27 @@ index 0000000000..eef98e5643 + zap_cabac_stash(s->cabac_stash_up + (x_ctb >> 3), s->ps.sps->log2_ctb_size - 3); + if ((lc->ctb_avail & AVAIL_L) == 0) + zap_cabac_stash(s->cabac_stash_left + (y_ctb >> 3), s->ps.sps->log2_ctb_size - 3); ++#if MVF_STASH_WIDTH > 64 ++ // Restore left mvf stash at start of tile if not at start of line ++ if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0 && x_ctb != 0 && !s->is_irap) ++ { ++ unsigned int i; ++ HEVCRpiMvField * dst = mvf_stash_ptr(s, lc, x_ctb - 1, 0); ++ const HEVCRpiMvField * src = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE); ++ for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i) ++ { ++ *dst = *src++; ++ dst += MVF_STASH_WIDTH_PU; ++ } ++ } ++#endif + + // Set initial tu states + lc->tu.cu_qp_delta = 0; + lc->tu.is_cu_qp_delta_wanted = 0; + lc->tu.cu_chroma_qp_offset_wanted = 0; ++ ++ // Decode + more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); + + if (ff_hevc_rpi_cabac_overflow(lc)) @@ -31973,7 +32496,7 @@ index 0000000000..eef98e5643 + } + + if (more_data < 0) { -+ s->tab_slice_address[ctb_addr_rs] = -1; // Mark slice as broken ++ s->tab_slice_address[ctb_addr_rs] = TAB_SLICE_ADDR_BROKEN; // Mark slice as broken + return more_data; + } + @@ -31988,6 +32511,41 @@ index 0000000000..eef98e5643 + } + } + ++ // --- Post CTB processing ++ ++ // Stash rpl top/left for deblock that needs to remember such things cross-slice ++ s->rpl_up[x_ctb >> 
s->ps.sps->log2_ctb_size] = s->refPicList; ++ s->rpl_left[y_ctb >> s->ps.sps->log2_ctb_size] = s->refPicList; ++ ++ if (!s->is_irap) ++ { ++ // Copy MVF up to up-left & stash to up ++ { ++ const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb, ctb_size - 1); ++ HEVCRpiMvField * dst = s->mvf_up + (x_ctb >> LOG2_MIN_PU_SIZE); ++ ++ // printf("Stash: %d,%d, ctb_size=%d, %p->%p\n", x_ctb, y_ctb, ctb_size, src, dst); ++ ++ lc->mvf_ul[0] = dst[(ctb_size - 1) >> LOG2_MIN_PU_SIZE]; ++ memcpy(dst, src, (sizeof(*src)*ctb_size) >> LOG2_MIN_PU_SIZE); ++ } ++ // Stash sideways if end of tile line but not end of line (no point) ++ // ** Could/should do this @ end of fn ++#if MVF_STASH_WIDTH > 64 ++ if ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOTL) ++#endif ++ { ++ unsigned int i; ++ const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb + ctb_size - 1, 0); ++ HEVCRpiMvField * dst = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE); ++ for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i) ++ { ++ *dst++ = *src; ++ src += MVF_STASH_WIDTH_PU; ++ } ++ } ++ } ++ + if ((ctb_flags & CTB_TS_FLAGS_CSAVE) != 0) + ff_hevc_rpi_save_states(s, lc); + @@ -32237,7 +32795,29 @@ index 0000000000..eef98e5643 + else + { + movlc(s->HEVClcList[0], lc, s->ps.pps->dependent_slice_segments_enabled_flag); ++#if MVF_STASH_WIDTH > 64 ++ // Horrid calculations to work out what we want but luckily this should almost never execute ++ // **** Move to movlc ++ if (!s->is_irap) ++ { ++ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[lc->ts]; ++ if ((ctb_flags & CTB_TS_FLAGS_EOTL) == 0) // If EOTL then we have already stashed mvf ++ { ++ const unsigned int x_ctb = ((s->ps.pps->ctb_addr_ts_to_rs[lc->ts] % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size) - 1; ++ unsigned int i; ++ const HEVCRpiMvField *s_mvf = lc->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1)); ++ HEVCRpiMvField *d_mvf = s->HEVClcList[0]->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & 
(MVF_STASH_WIDTH_PU - 1)); + ++ for (i = 0; i != MVF_STASH_HEIGHT_PU; ++i) ++ { ++ *d_mvf = *s_mvf; ++ d_mvf += MVF_STASH_WIDTH_PU; ++ s_mvf += MVF_STASH_WIDTH_PU; ++ } ++ ++ } ++ } ++#endif + // When all done poke the thread 0 sem_in one final time +#if TRACE_WPP + printf("%s[%d]: Poke final %p\n", __func__, lc->lc_n, &s->HEVClcList[0]->bt_sem_in); @@ -32541,7 +33121,7 @@ index 0000000000..eef98e5643 +static void set_no_backward_pred(HEVCRpiContext * const s) +{ + int i, j; -+ const RefPicList *const refPicList = s->ref->refPicList; ++ const RefPicList *const refPicList = s->refPicList; + + s->no_backward_pred_flag = 0; + if (s->sh.slice_type != HEVC_SLICE_B || !s->sh.slice_temporal_mvp_enabled_flag) @@ -32714,24 +33294,43 @@ index 0000000000..eef98e5643 + +static int hevc_frame_start(HEVCRpiContext * const s) +{ -+ int pic_size_in_ctb = ((s->ps.sps->width >> s->ps.sps->log2_min_cb_size) + 1) * -+ ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1); + int ret; + -+ memset(s->bs_horizontal, 0, s->bs_size); -+ memset(s->bs_vertical, 0, s->bs_size); ++ memset(s->bs_horizontal, 0, s->bs_size * 2); // Does V too + memset(s->is_pcm, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height); -+ memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address)); ++ memset(s->tab_slice_address, -1, s->ps.sps->ctb_size * sizeof(*s->tab_slice_address)); ++ ++ // Only need to remember intra for CIP ++ if (!s->ps.pps->constrained_intra_pred_flag || s->is_irap) ++ s->is_intra = NULL; ++ else ++ { ++ s->is_intra = s->is_intra_store; ++ memset(s->is_intra, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height); ++ } + + s->is_decoded = 0; + s->first_nal_type = s->nal_unit_type; + + s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos); + ++ if (s->pkt.nb_nals > s->rpl_tab_size) ++ { ++ // In most cases it will be faster to free & realloc as that doesn't ++ // require (an unwanted) copy ++ av_freep(&s->rpl_tab); ++ 
s->rpl_tab_size = 0; ++ if ((s->rpl_tab = av_malloc(s->pkt.nb_nals * sizeof(*s->rpl_tab))) == NULL) ++ goto fail; ++ s->rpl_tab_size = s->pkt.nb_nals; ++ } ++ memset(s->rpl_tab, 0, s->pkt.nb_nals * sizeof(*s->rpl_tab)); ++ + ret = ff_hevc_rpi_set_new_ref(s, &s->frame, s->poc); + if (ret < 0) + goto fail; + ++ // Resize rpl_tab to max that we might want + ret = ff_hevc_rpi_frame_rps(s); + if (ret < 0) { + av_log(s->avctx, AV_LOG_ERROR, "Error constructing the frame RPS.\n"); @@ -32827,6 +33426,7 @@ index 0000000000..eef98e5643 + s->nal_unit_type == HEVC_NAL_RADL_N || + s->nal_unit_type == HEVC_NAL_RASL_N); + s->offload_recon = s->threads_type != 0 && s->used_for_ref; ++ s->is_irap = IS_IRAP(s); + +#if DEBUG_DECODE_N + { @@ -32895,7 +33495,7 @@ index 0000000000..eef98e5643 + } + + ctb_addr_ts = hls_slice_data(s, nal); -+ if (ctb_addr_ts >= (s->ps.sps->ctb_width * s->ps.sps->ctb_height)) { ++ if (ctb_addr_ts >= s->ps.sps->ctb_size) { + s->is_decoded = 1; + } + @@ -33142,7 +33742,7 @@ index 0000000000..eef98e5643 + return avpkt->size; +} + -+static int hevc_ref_frame(HEVCRpiContext *s, HEVCFrame *dst, HEVCFrame *src) ++static int hevc_ref_frame(HEVCRpiContext *s, HEVCRpiFrame *dst, HEVCRpiFrame *src) +{ + int ret; + @@ -33150,22 +33750,15 @@ index 0000000000..eef98e5643 + if (ret < 0) + return ret; + -+ dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf); -+ if (!dst->tab_mvf_buf) -+ goto fail; -+ dst->tab_mvf = src->tab_mvf; -+ -+ dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf); -+ if (!dst->rpl_tab_buf) -+ goto fail; -+ dst->rpl_tab = src->rpl_tab; -+ -+ dst->rpl_buf = av_buffer_ref(src->rpl_buf); -+ if (!dst->rpl_buf) -+ goto fail; ++ if (src->col_mvf_buf != NULL) ++ { ++ dst->col_mvf_buf = av_buffer_ref(src->col_mvf_buf); ++ if (!dst->col_mvf_buf) ++ goto fail; ++ } ++ dst->col_mvf = src->col_mvf; + + dst->poc = src->poc; -+ dst->ctb_count = src->ctb_count; + dst->flags = src->flags; + dst->sequence = src->sequence; + return 0; @@ -33532,10 +34125,10 @@ index 
0000000000..eef98e5643 + diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h new file mode 100644 -index 0000000000..ea08308be2 +index 0000000000..d324aa273c --- /dev/null +++ b/libavcodec/rpi_hevcdec.h -@@ -0,0 +1,959 @@ +@@ -0,0 +1,1087 @@ +/* + * HEVC video decoder + * @@ -33574,6 +34167,7 @@ index 0000000000..ea08308be2 +#include "rpi_hevcpred.h" +#include "h2645_parse.h" +#include "hevc.h" ++#include "rpi_hevc_mv.h" +#include "rpi_hevc_ps.h" +#include "rpi_hevc_sei.h" +#include "rpi_hevcdsp.h" @@ -33581,6 +34175,10 @@ index 0000000000..ea08308be2 +#include "thread.h" +#include "videodsp.h" + ++#if ARCH_ARM ++#include "arm/rpi_hevc_misc_neon.h" ++#endif ++ +#define MAX_NB_THREADS 16 +#define SHIFT_CTB_WPP 2 + @@ -33663,10 +34261,7 @@ index 0000000000..ea08308be2 + + +// Min CTB size is 16 -+#if ((HEVC_RPI_MAX_WIDTH + 15) / 16) * ((HEVC_RPI_MAX_HEIGHT + 15) / 16) >= (1 << 16) -+#error Check CTB translation array el sizes (currently uint16_t) -+#endif -+ ++#define HEVC_RPI_MAX_CTBS ((HEVC_RPI_MAX_WIDTH + 15) / 16) * ((HEVC_RPI_MAX_HEIGHT + 15) / 16) + +/** + * Value of the luma sample at position (x, y) in the 2D array tab. 
@@ -33791,9 +34386,9 @@ index 0000000000..ea08308be2 +}; + +typedef struct RefPicList { -+ struct HEVCFrame *ref[HEVC_MAX_REFS]; ++ struct HEVCRpiFrame *ref[HEVC_MAX_REFS]; + int list[HEVC_MAX_REFS]; -+ int isLongTerm[HEVC_MAX_REFS]; ++ uint8_t isLongTerm[HEVC_MAX_REFS]; + int nb_refs; +} RefPicList; + @@ -33820,7 +34415,6 @@ index 0000000000..ea08308be2 + uint8_t intra_pred_mode[4]; + uint8_t intra_pred_mode_c[4]; + uint8_t chroma_mode_c[4]; -+ Mv mvd; + uint8_t merge_flag; +} RpiPredictionUnit; + @@ -33848,19 +34442,14 @@ index 0000000000..ea08308be2 + +struct HEVCRpiJob; + -+typedef struct HEVCFrame { ++typedef struct HEVCRpiFrame { + AVFrame *frame; + ThreadFrame tf; -+ MvField *tab_mvf; -+ RefPicList *refPicList; -+ RefPicListTab **rpl_tab; -+ int ctb_count; ++ ColMvField *col_mvf; + int poc; -+ struct HEVCFrame *collocated_ref; ++ struct HEVCRpiFrame *collocated_ref; + -+ AVBufferRef *tab_mvf_buf; -+ AVBufferRef *rpl_tab_buf; -+ AVBufferRef *rpl_buf; ++ AVBufferRef *col_mvf_buf; + + /** + * A sequence counter, so that old frames are output first @@ -33876,7 +34465,7 @@ index 0000000000..ea08308be2 + // Entry no in DPB - can be used as a small unique + // frame identifier (within the current thread) + uint8_t dpb_no; -+} HEVCFrame; ++} HEVCRpiFrame; + +typedef struct HEVCRpiLocalContext { + HEVCRpiTransformUnit tu; @@ -33948,6 +34537,18 @@ index 0000000000..ea08308be2 + * of the deblocking filter */ + unsigned int boundary_flags; + ++#define IPM_TAB_SIZE (HEVC_MAX_CTB_SIZE >> LOG2_MIN_PU_SIZE) ++ uint8_t ipm_left[IPM_TAB_SIZE]; ++ uint8_t ipm_up[IPM_TAB_SIZE]; ++ ++//#define MVF_STASH_WIDTH 128 ++#define MVF_STASH_WIDTH 64 ++#define MVF_STASH_HEIGHT 64 ++#define MVF_STASH_WIDTH_PU (MVF_STASH_WIDTH >> LOG2_MIN_PU_SIZE) ++#define MVF_STASH_HEIGHT_PU (MVF_STASH_HEIGHT >> LOG2_MIN_PU_SIZE) ++ HEVCRpiMvField mvf_ul[1]; ++ HEVCRpiMvField mvf_stash[MVF_STASH_WIDTH_PU * MVF_STASH_HEIGHT_PU]; ++ + /* +7 is for subpixel interpolation, *2 for high bit depths */ +// 
DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; + /* The extended size between the new edge emu buffer is abused by SAO */ @@ -34199,6 +34800,7 @@ index 0000000000..ea08308be2 + /** 1 if the independent slice segment header was successfully parsed */ + uint8_t slice_initialized; + char used_for_ref; // rpi ++ char is_irap; + char offload_recon; + uint8_t eos; ///< current packet contains an EOS/EOB NAL + uint8_t last_eos; ///< last packet contains an EOS/EOB NAL @@ -34242,14 +34844,14 @@ index 0000000000..ea08308be2 + uint8_t *sao_pixel_buffer_h[3]; + uint8_t *sao_pixel_buffer_v[3]; + -+ AVBufferPool *tab_mvf_pool; -+ AVBufferPool *rpl_tab_pool; ++ unsigned int col_mvf_stride; ++ AVBufferPool *col_mvf_pool; + + RpiSAOParams *sao; + DBParams *deblock; + enum HEVCNALUnitType nal_unit_type; + int temporal_id; ///< temporal_id_plus1 - 1 -+ HEVCFrame *ref; ++ HEVCRpiFrame *ref; + int poc; + int pocTid0; + int slice_idx; ///< number of the slice being currently decoded @@ -34265,12 +34867,27 @@ index 0000000000..ea08308be2 + uint8_t *bsf_stash_up; + uint8_t *bsf_stash_left; + -+ int32_t *tab_slice_address; ++#if HEVC_RPI_MAX_CTBS >= 0xffff ++#define TAB_SLICE_ADDR_BROKEN ~(uint32_t)0 ++ uint32_t *tab_slice_address; ++#else ++#define TAB_SLICE_ADDR_BROKEN ~(uint16_t)0 ++ uint16_t *tab_slice_address; ++#endif ++ ++ // Bitfield 1 bit per 8 pels (min pcm size) ++ uint8_t *is_pcm; ++ // Bitfield 1 bit per 8 pels (min cb size) ++ // Only needed for CIP as CIP processing is async to the main thread ++ uint8_t *is_intra; + + // PU -+ uint8_t *tab_ipm; ++ HEVCRpiMvField *mvf_up; ++ HEVCRpiMvField *mvf_left; + -+ uint8_t *is_pcm; ++ const RefPicList **rpl_up; ++ const RefPicList **rpl_left; ++ RefPicList * refPicList; + + // CTB-level flags affecting loop filter operation + uint8_t *filter_slice_edges; @@ -34297,6 +34914,11 @@ index 0000000000..ea08308be2 + + struct AVMD5 *md5_ctx; + ++ RefPicListTab * rpl_tab; ++ unsigned int 
rpl_tab_size; ++ ++ uint8_t *is_intra_store; ++ + RpiSliceHeader sh; + + HEVCRpiParamSets ps; @@ -34304,7 +34926,7 @@ index 0000000000..ea08308be2 + HEVCRpiLocalContext *HEVClc; + HEVCRpiLocalContext *HEVClcList[MAX_NB_THREADS]; + -+ HEVCFrame DPB[HEVC_DPB_ELS]; ++ HEVCRpiFrame DPB[HEVC_DPB_ELS]; + + ///< candidate references for the current frame + RefPicList rps[5]; @@ -34337,9 +34959,6 @@ index 0000000000..ea08308be2 + */ +void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s); + -+const RefPicList *ff_hevc_rpi_get_ref_list(const HEVCRpiContext * const s, const HEVCFrame * const ref, -+ int x0, int y0); -+ +/** + * Construct the reference picture sets for the current frame. + */ @@ -34366,7 +34985,7 @@ index 0000000000..ea08308be2 + +void ff_hevc_rpi_bump_frame(HEVCRpiContext *s); + -+void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCFrame *frame, int flags); ++void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags); + +unsigned int ff_hevc_rpi_tb_avail_flags( + const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, @@ -34374,11 +34993,13 @@ index 0000000000..ea08308be2 + +void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW, + int nPbH, int log2_cb_size, int part_idx, -+ int merge_idx, MvField * const mv); -+void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext *lc, int x0, int y0, int nPbW, -+ int nPbH, int log2_cb_size, const unsigned int avail, int part_idx, -+ int merge_idx, MvField * const mv, -+ int mvp_lx_flag, int LX); ++ int merge_idx, HEVCRpiMvField * const mv); ++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int nPbW, const unsigned int nPbH, ++ const unsigned int avail, ++ HEVCRpiMvField * const mv, ++ const unsigned int mvp_lx_flag, const unsigned int LX); +void ff_hevc_rpi_set_qPy(const 
HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase); +void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, + const unsigned int x0, const unsigned int y0, @@ -34398,14 +35019,14 @@ index 0000000000..ea08308be2 +#endif + +void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const HEVCFrame * const ref, const int val, const int field); ++ const HEVCRpiFrame * const ref, const int val, const int field); + +void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field); + +// All of these expect that s->threads_type == FF_THREAD_FRAME + +static inline void ff_hevc_rpi_progress_wait_mv(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const HEVCFrame * const ref, const int y) ++ const HEVCRpiFrame * const ref, const int y) +{ + if (s->threads_type != 0) + ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1); @@ -34418,7 +35039,7 @@ index 0000000000..ea08308be2 +} + +static inline void ff_hevc_rpi_progress_wait_recon(const HEVCRpiContext * const s, HEVCRpiJob * const jb, -+ const HEVCFrame * const ref, const int y) ++ const HEVCRpiFrame * const ref, const int y) +{ + ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0); +} @@ -34440,7 +35061,7 @@ index 0000000000..ea08308be2 + +// Set all done - signal nothing (used in missing refs) +// Works for both rpi & non-rpi -+static inline void ff_hevc_rpi_progress_set_all_done(HEVCFrame * const ref) ++static inline void ff_hevc_rpi_progress_set_all_done(HEVCRpiFrame * const ref) +{ + if (ref->tf.progress != NULL) + { @@ -34494,13 +35115,113 @@ index 0000000000..ea08308be2 +#define RPI_ZC_SAND128_ONLY 1 +#endif + ++#ifndef ff_hevc_rpi_copy_vert ++static inline void ff_hevc_rpi_copy_vert(uint8_t *dst, const uint8_t *src, ++ int pixel_shift, int height, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src) ++{ ++ int i; ++ switch (pixel_shift) ++ { ++ case 2: ++ for (i = 
0; i < height; i++) { ++ *(uint32_t *)dst = *(uint32_t *)src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; ++ case 1: ++ for (i = 0; i < height; i++) { ++ *(uint16_t *)dst = *(uint16_t *)src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; ++ default: ++ for (i = 0; i < height; i++) { ++ *dst = *src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; ++ } ++} ++#endif ++ ++ ++#if MVF_STASH_WIDTH == 64 ++static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x, const unsigned int y) ++{ ++ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); ++ return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE)); ++} ++ ++static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int x, const unsigned int y) ++{ ++ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); ++ const unsigned int x0_ctb = x0 & mask_cs_hi; ++ const unsigned int y0_ctb = y0 & mask_cs_hi; ++ ++ return (HEVCRpiMvField *)((y < y0_ctb) ? ++ (x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE)) : ++ (x < x0_ctb ? s->mvf_left + (y >> LOG2_MIN_PU_SIZE) : ++ lc->mvf_stash + ++ ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ++ ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE))); ++} ++ ++static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s, ++ const unsigned int x0, ++ const unsigned int x) ++{ ++ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); ++ const unsigned int x0_ctb = x0 & mask_cs_hi; ++ return x < x0_ctb ? 
1 : MVF_STASH_WIDTH_PU; ++} ++ ++#else ++static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x, const unsigned int y) ++{ ++ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); ++ return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1))); ++} ++ ++static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int x, const unsigned int y) ++{ ++ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); ++ ++ const unsigned int x0_ctb = x0 & mask_cs_hi; ++ const unsigned int y0_ctb = y0 & mask_cs_hi; ++ ++ // If not in the same CTB for Y assume up ++ if (y < y0_ctb) { ++ // If not in the same CTB for X too assume up-left ++ return (HEVCRpiMvField *)(x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE)); ++ } ++ return mvf_stash_ptr(s, lc, x, y); ++} ++ ++static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s, ++ const unsigned int x0, ++ const unsigned int x) ++{ ++ return MVF_STASH_WIDTH_PU; ++} ++#endif ++ +#endif /* AVCODEC_RPI_HEVCDEC_H */ diff --git a/libavcodec/rpi_hevcdsp.c b/libavcodec/rpi_hevcdsp.c new file mode 100644 -index 0000000000..b041e0fd3f +index 0000000000..ac29789e7f --- /dev/null +++ b/libavcodec/rpi_hevcdsp.c -@@ -0,0 +1,444 @@ +@@ -0,0 +1,450 @@ +/* + * HEVC video decoder + * @@ -34526,6 +35247,7 @@ index 0000000000..b041e0fd3f + */ + +#include "rpi_hevcdsp.h" ++#include "rpi_hevc_mv.h" + +static const int8_t transform[32][32] = { + { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, @@ -34626,9 +35348,9 @@ index 0000000000..b041e0fd3f +#include "rpi_hevcdsp_template.c" +#undef BIT_DEPTH + -+static uint32_t hevc_deblocking_boundary_strengths(int pus, int dup, const MvField *curr, const 
MvField *neigh, ++static uint32_t hevc_deblocking_boundary_strengths(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh, + const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ int in_inc) ++ int in_inc0, int in_inc1) +{ + int shift = 32; + uint32_t bs = 0; @@ -34636,8 +35358,13 @@ index 0000000000..b041e0fd3f + int strength, out; + int curr_refL0 = curr_rpl0[curr->ref_idx[0]]; + int curr_refL1 = curr_rpl1[curr->ref_idx[1]]; -+ int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]]; -+ int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]]; ++ int nr_idx0 = neigh->ref_idx[0]; ++ int nr_idx1 = neigh->ref_idx[1]; ++ int neigh_refL0 = neigh_rpl0[nr_idx0]; ++ int neigh_refL1 = neigh_rpl1[nr_idx1]; ++ ++ av_assert0(nr_idx0 >= 0 && nr_idx0 <=31); ++ av_assert0(nr_idx1 >= 0 && nr_idx1 <=31); + +#if 1 // This more directly matches the original implementation + if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) { @@ -34645,24 +35372,24 @@ index 0000000000..b041e0fd3f + if (curr_refL0 == neigh_refL0 && + curr_refL0 == curr_refL1 && + neigh_refL0 == neigh_refL1) { -+ if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 || -+ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) && -+ (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 || -+ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)) ++ if ((FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 || ++ FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) && ++ (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 || ++ FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4)) + strength = 1; + else + strength = 
0; + } else if (neigh_refL0 == curr_refL0 && + neigh_refL1 == curr_refL1) { -+ if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 || -+ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) ++ if (FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 || ++ FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) + strength = 1; + else + strength = 0; + } else if (neigh_refL1 == curr_refL0 && + neigh_refL0 == curr_refL1) { -+ if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 || -+ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4) ++ if (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 || ++ FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4) + strength = 1; + else + strength = 0; @@ -34670,24 +35397,24 @@ index 0000000000..b041e0fd3f + strength = 1; + } + } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV -+ Mv curr_mv0, neigh_mv0; ++ MvXY curr_mv0, neigh_mv0; + + if (curr->pred_flag & 1) { -+ curr_mv0 = curr->mv[0]; ++ curr_mv0 = curr->xy[0]; + } else { -+ curr_mv0 = curr->mv[1]; ++ curr_mv0 = curr->xy[1]; + curr_refL0 = curr_refL1; + } + + if (neigh->pred_flag & 1) { -+ neigh_mv0 = neigh->mv[0]; ++ neigh_mv0 = neigh->xy[0]; + } else { -+ neigh_mv0 = neigh->mv[1]; ++ neigh_mv0 = neigh->xy[1]; + neigh_refL0 = neigh_refL1; + } + + if (curr_refL0 == neigh_refL0) { -+ if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4) ++ if (FFABS(MV_X(curr_mv0) - MV_X(neigh_mv0)) >= 4 || FFABS(MV_Y(curr_mv0) - MV_Y(neigh_mv0)) >= 4) + strength = 1; + else + strength = 0; @@ -34696,10 +35423,10 @@ index 0000000000..b041e0fd3f + } else + strength = 1; +#else // 
This has exactly the same effect, but is more suitable for vectorisation -+ Mv curr_mv[2]; -+ Mv neigh_mv[2]; -+ memcpy(curr_mv, curr->mv, sizeof curr_mv); -+ memcpy(neigh_mv, neigh->mv, sizeof neigh_mv); ++ MvXY curr_mv[2]; ++ MvXY neigh_mv[2]; ++ memcpy(curr_mv, curr->xy, sizeof curr_mv); ++ memcpy(neigh_mv, neigh->xy, sizeof neigh_mv); + + if (!(curr->pred_flag & 2)) { + curr_mv[1] = curr_mv[0]; @@ -34721,18 +35448,18 @@ index 0000000000..b041e0fd3f + strength = 1; + + strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) | -+ (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) | -+ (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4); ++ (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[0])) >= 4) | ++ (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[1])) >= 4); + + strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) | -+ (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) | -+ (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4); ++ (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[0])) >= 4) | ++ (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[1])) >= 4); + + strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2); +#endif + -+ curr += in_inc / sizeof (MvField); -+ neigh += in_inc / sizeof (MvField); ++ curr += in_inc0 / sizeof (HEVCRpiMvField); ++ neigh += in_inc1 / sizeof (HEVCRpiMvField); + + for (out = dup; out > 0; out--) + { @@ -34947,10 +35674,10 @@ index 0000000000..b041e0fd3f +} diff --git a/libavcodec/rpi_hevcdsp.h b/libavcodec/rpi_hevcdsp.h new file mode 100644 -index 0000000000..0b532f874b +index 0000000000..5a7cdeeb66 --- /dev/null +++ b/libavcodec/rpi_hevcdsp.h -@@ -0,0 +1,185 @@ +@@ -0,0 
+1,177 @@ +/* + * HEVC video decoder + * @@ -34981,6 +35708,8 @@ index 0000000000..0b532f874b +#include "hevc.h" +#include "get_bits.h" + ++struct HEVCRpiMvField; ++ +#define MAX_PB_SIZE 64 + +#define RPI_HEVC_SAO_BUF_STRIDE 160 @@ -34995,16 +35724,6 @@ index 0000000000..0b532f874b + +} RpiSAOParams; + -+typedef struct Mv { -+ int16_t x; ///< horizontal component of motion vector -+ int16_t y; ///< vertical component of motion vector -+} Mv; -+ -+typedef struct MvField { -+ DECLARE_ALIGNED(4, Mv, mv)[2]; -+ int8_t ref_idx[2]; -+ int8_t pred_flag; -+} MvField; + +// This controls how many sao dsp functions there are +// N=5 has width = 8, 16, 32, 48, 64 @@ -35119,9 +35838,9 @@ index 0000000000..0b532f874b + uint8_t * src_l, + unsigned int no_f); + -+ uint32_t (*hevc_deblocking_boundary_strengths)(int pus, int dup, const MvField *curr, const MvField *neigh, ++ uint32_t (*hevc_deblocking_boundary_strengths)(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh, + const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ int in_inc); ++ int in_inc0, int inc_inc1); + + void (* cpy_blk)(uint8_t * dst, unsigned int dst_stride, const uint8_t * src, unsigned int src_stride, unsigned int width, unsigned int height); +} HEVCDSPContext; @@ -37721,10 +38440,10 @@ index 0000000000..6e594277c0 +#endif /* AVCODEC_RPI_HEVCPRED_H */ diff --git a/libavcodec/rpi_hevcpred_template.c b/libavcodec/rpi_hevcpred_template.c new file mode 100644 -index 0000000000..23835a320e +index 0000000000..2f710626cf --- /dev/null +++ b/libavcodec/rpi_hevcpred_template.c -@@ -0,0 +1,1487 @@ +@@ -0,0 +1,1522 @@ +/* + * HEVC video decoder + * @@ -37755,7 +38474,6 @@ index 0000000000..23835a320e +#include "rpi_hevcdec.h" +#include "rpi_hevcpred.h" + -+ +#define DUMP_PRED 0 + +#define POS(x, y) src[(x) + stride * (y)] @@ -37889,32 +38607,78 @@ index 0000000000..23835a320e + +// Beware that this inverts the avail ordering +// For CIP it 
seems easier this way round -+static unsigned int cip_avail(const MvField * mvf, const int mvf_stride, const unsigned int log2_pu_size, const unsigned int avail, unsigned int size, -+ unsigned int s0, unsigned int s1) ++static unsigned int cip_avail_l(const uint8_t * is_intra, const int i_stride, const unsigned int i_mask, ++ const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size, ++ unsigned int s0, unsigned int odd_s) +{ -+ const unsigned int n = 1 << (log2_pu_size - 2); ++ const unsigned int n = 1 << log2_intra_bits; + unsigned int fa = 0; -+ unsigned int i = 0; ++ unsigned int i; + + size >>= 2; // Now in 4-pel units + s0 >>= 2; -+ s1 >>= 2; + -+ if ((avail & 4) != 0) ++ if ((avail & AVAIL_DL) != 0) + fa |= ((1 << s0) - 1) << (size - s0); -+ if ((avail & 2) != 0) -+ fa |= ((1 << s1) - 1) << size; -+ if ((avail & 1) != 0) ++ if ((avail & AVAIL_L) != 0) ++ fa |= ((1 << size) - 1) << size; ++ if ((avail & AVAIL_UL) != 0) + fa |= 1 << (size << 1); + -+ for (i = 0; (fa >> i) != 0; i += n, mvf += mvf_stride) { -+ if ((fa & (((1 << n) - 1) << i)) != 0 && mvf->pred_flag != PF_INTRA) -+ fa &= ~(((1 << n) - 1) << i); ++ if (odd_s) { ++ if ((fa & 1) != 0 && (*is_intra & i_mask) == 0) ++ fa &= ~1; ++ is_intra += i_stride; ++ } ++ ++ for (i = odd_s; (fa >> i) != 0; i += n, is_intra += i_stride) { ++ const unsigned int m = ((1 << n) - 1) << i; ++ if ((fa & m) != 0 && (*is_intra & i_mask) == 0) ++ fa &= ~m; + } + + return fa; +} + ++static unsigned int cip_avail_u(const uint8_t * is_intra, unsigned int i_shift, ++ const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size, ++ unsigned int s1, unsigned int odd_s) ++{ ++ if ((avail & (AVAIL_U | AVAIL_UR)) == 0) ++ { ++ return 0; ++ } ++ else ++ { ++ const unsigned int n = 1 << log2_intra_bits; ++ unsigned int fa = 0; ++ unsigned int i; ++ unsigned int im = ((is_intra[1] << 8) | (is_intra[0])) >> i_shift; ++ ++ size >>= 2; // Now in 4-pel units ++ s1 >>= 2; ++ ++ if ((avail & 
AVAIL_U) != 0) ++ fa |= ((1 << size) - 1); ++ if ((avail & AVAIL_UR) != 0) ++ fa |= ((1 << s1) - 1) << size; ++ ++ if (odd_s) { ++ fa &= im | ~1; ++ im >>= 1; ++ } ++ ++ for (i = odd_s; (fa >> i) != 0; i += n, im >>= 1) { ++ const unsigned int m = ((1 << n) - 1) << i; ++ if ((im & 1) == 0) ++ fa &= ~m; ++ } ++ return fa; ++ } ++} ++ ++ ++ +static inline unsigned int rmbd(unsigned int x) +{ +#if 1 @@ -38053,14 +38817,6 @@ index 0000000000..23835a320e +#define EXTEND(ptr, val, len) extend_32(ptr, val, len) +#endif + -+ -+#define PU(x) \ -+ ((x) >> s->ps.sps->log2_min_pu_size) -+#define MVF(x, y) \ -+ (s->ref->tab_mvf[(x) + (y) * s->ps.sps->min_pu_width]) -+#define MVF_PU(x, y) \ -+ MVF(PU(x0 + ((x) * (1 << hshift))), PU(y0 + ((y) * (1 << vshift)))) -+ +// Reqs: +// +// Planar: DL[0], L, ul, U, UR[0] @@ -38560,24 +39316,31 @@ index 0000000000..23835a320e + src_ur += stripe_adj; + } + ++ // Can deal with I-slices in 'normal' code even if CIP ++ // This also means that we don't need to generate (elsewhere) is_intra ++ // for IRAP frames + if (s->ps.pps->constrained_intra_pred_flag == 1 && -+ s->sh.slice_type != HEVC_SLICE_I) // Can deal with I-slices in 'normal' code ++ s->sh.slice_type != HEVC_SLICE_I) + { -+ const unsigned int l2_pu_s = FFMAX(s->ps.sps->log2_min_pu_size - hshift, 2); -+ const unsigned int l2_pu_stride_s = l2_pu_s - (s->ps.sps->log2_min_pu_size - hshift); -+ -+ unsigned int avail_l = cip_avail(&MVF_PU(-1, size * 2 - 1), -+ -(int)(s->ps.sps->min_pu_width << l2_pu_stride_s), -+ l2_pu_s, -+ avail >> AVAIL_S_UL, -+ size, -+ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size), size); -+ unsigned int avail_u = cip_avail(&MVF_PU(0, -1), -+ 1 << l2_pu_stride_s, -+ l2_pu_s, -+ avail << 1, -+ size, -+ size, FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size)); ++ // * If we ever actually care about CIP performance then we should ++ // special case out size 4 stuff (can be done by 'normal') and ++ // have 8-pel avail masks ++ unsigned int avail_l = 
cip_avail_l(s->is_intra + ((y + size * 2 - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + ((x - 1) >> (6 - hshift)), ++ -(int)(s->ps.sps->pcm_width), ++ 1 << (((x - 1) >> (3 - hshift)) & 7), ++ 1 - hshift, ++ avail, ++ size, ++ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size), ++ vshift != 0 ? 0 : (y >> 2) & 1); ++ ++ unsigned int avail_u = cip_avail_u(s->is_intra + ((y - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + (x >> (6 - hshift)), ++ (x >> (3 - hshift)) & 7, ++ 1 - hshift, ++ avail, ++ size, ++ FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size), ++ hshift != 0 ? 0 : (x >> 2) & 1); + + // Anything left? + if ((avail_l | avail_u) == 0) @@ -39190,16 +39953,7 @@ index 0000000000..23835a320e +#undef c_src_ptr_t +#undef c_dst_ptr_t + -+#undef EXTEND_LEFT_CIP -+#undef EXTEND_RIGHT_CIP -+#undef EXTEND_UP_CIP -+#undef EXTEND_DOWN_CIP -+#undef IS_INTRA -+#undef MVF_PU -+#undef MVF -+#undef PU +#undef EXTEND -+#undef MIN_TB_ADDR_ZS +#undef POS +#undef PW + @@ -41519,7 +42273,7 @@ index 0000000000..26fb3be999 +#endif + diff --git a/libavfilter/Makefile b/libavfilter/Makefile -index 3a9fb02556..32e56f6b15 100644 +index bcd5d437ff..ccb49ec8c0 100644 --- a/libavfilter/Makefile +++ b/libavfilter/Makefile @@ -346,6 +346,7 @@ OBJS-$(CONFIG_TONEMAP_FILTER) += vf_tonemap.o @@ -41932,10 +42686,10 @@ index 0000000000..64578b7ac4 +}; + diff --git a/libavformat/utils.c b/libavformat/utils.c -index f2f2cc4239..f152a3bcc2 100644 +index c25eab4d49..4db44315c7 100644 --- a/libavformat/utils.c +++ b/libavformat/utils.c -@@ -2996,6 +2996,40 @@ static int has_codec_parameters(AVStream *st, const char **errmsg_ptr) +@@ -3005,6 +3005,40 @@ static int has_codec_parameters(AVStream *st, const char **errmsg_ptr) return 1; } @@ -41976,7 +42730,7 @@ index f2f2cc4239..f152a3bcc2 100644 /* returns 1 or 0 if or if not decoded data was returned, or a negative error */ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt, AVDictionary **options) -@@ -3030,7 +3064,11 
@@ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt, +@@ -3039,7 +3073,11 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt, av_dict_set(options ? options : &thread_opt, "threads", "1", 0); if (s->codec_whitelist) av_dict_set(options ? options : &thread_opt, "codec_whitelist", s->codec_whitelist, 0); @@ -41989,7 +42743,7 @@ index f2f2cc4239..f152a3bcc2 100644 if (!options) av_dict_free(&thread_opt); if (ret < 0) { -@@ -3061,6 +3099,14 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt, +@@ -3070,6 +3108,14 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt, if (avctx->codec_type == AVMEDIA_TYPE_VIDEO || avctx->codec_type == AVMEDIA_TYPE_AUDIO) { ret = avcodec_send_packet(avctx, &pkt); @@ -42004,7 +42758,7 @@ index f2f2cc4239..f152a3bcc2 100644 if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF) break; if (ret >= 0) -@@ -3654,9 +3700,20 @@ FF_ENABLE_DEPRECATION_WARNINGS +@@ -3663,9 +3709,20 @@ FF_ENABLE_DEPRECATION_WARNINGS // Try to just open decoders, in case this is enough to get parameters. 
if (!has_codec_parameters(st, NULL) && st->request_probe <= 0) { if (codec && !avctx->codec) @@ -43290,7 +44044,7 @@ index 0000000000..59c0d3959e +# -Wa,-ahls diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh new file mode 100755 -index 0000000000..66c455539d +index 0000000000..40549a35e5 --- /dev/null +++ b/pi-util/conf_pi2.sh @@ -0,0 +1,32 @@ @@ -43313,7 +44067,7 @@ index 0000000000..66c455539d + --disable-thumb\ + --enable-mmal\ + --enable-rpi\ -+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\ ++ --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\ + --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\ + --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ + --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ @@ -43509,10 +44263,10 @@ index 0000000000..e9556f0837 + diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py new file mode 100755 -index 0000000000..27cc453963 +index 0000000000..8bb326943f --- /dev/null +++ b/pi-util/ffperf.py -@@ -0,0 +1,124 @@ +@@ -0,0 +1,125 @@ +#!/usr/bin/env python3 + +import time @@ -43583,6 +44337,7 @@ index 0000000000..27cc453963 + argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename") + argp.add_argument("--csv_in", help="CSV input filename") + argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).") ++ argp.add_argument("--repeat", default=3, type=int, help="Run repeat count") + + args = argp.parse_args() + @@ -43617,7 +44372,7 @@ index 0000000000..27cc453963 + print ("====", f) + + t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999}) -+ for i in range(3): ++ for i in range(args.repeat): + t = tstats.time_file(f, prefix) + print ("...", t.times_str()) + if t0 > t: