diff --git a/packages/addons/addon-depends/ffmpegx/package.mk b/packages/addons/addon-depends/ffmpegx/package.mk
index 7992ac12d79..695f1c7aa7e 100644
--- a/packages/addons/addon-depends/ffmpegx/package.mk
+++ b/packages/addons/addon-depends/ffmpegx/package.mk
@@ -149,6 +149,8 @@ configure_target() {
 \
 `#General options` \
 --enable-avresample \
+ --disable-lzma \
+ --disable-alsa \
 \
 `#Toolchain options` \
 --arch="$TARGET_ARCH" \
diff --git a/packages/compress/libarchive/package.mk b/packages/compress/libarchive/package.mk
new file mode 100644
index 00000000000..b493f8cc3f7
--- /dev/null
+++ b/packages/compress/libarchive/package.mk
@@ -0,0 +1,35 @@
+################################################################################
+# This file is part of LibreELEC - https://libreelec.tv
+# Copyright (C) 2017-present Team LibreELEC
+#
+# LibreELEC is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 2 of the License, or
+# (at your option) any later version.
+#
+# LibreELEC is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with LibreELEC. If not, see <http://www.gnu.org/licenses/>.
+################################################################################
+
+PKG_NAME="libarchive"
+PKG_VERSION="3.3.2"
+PKG_SHA256="ed2dbd6954792b2c054ccf8ec4b330a54b85904a80cef477a1c74643ddafa0ce"
+PKG_ARCH="any"
+PKG_LICENSE="GPL"
+PKG_SITE="https://www.libarchive.org"
+PKG_URL="https://www.libarchive.org/downloads/$PKG_NAME-$PKG_VERSION.tar.gz"
+PKG_DEPENDS_HOST="toolchain"
+PKG_DEPENDS_TARGET="toolchain"
+PKG_SECTION="compress"
+PKG_SHORTDESC="libarchive: multi-format archive and compression library"
+
+PKG_CMAKE_OPTS_TARGET="-DENABLE_SHARED=0 -DENABLE_STATIC=1 -DCMAKE_POSITION_INDEPENDENT_CODE=1 -DENABLE_EXPAT=0 -DENABLE_ICONV=0 -DENABLE_LIBXML2=0 -DENABLE_LZO=1 -DENABLE_TEST=0 -DENABLE_COVERAGE=0"
+
+post_makeinstall_target() {
+  rm -rf $INSTALL
+}
diff --git a/packages/compress/libarchive/patches/libarchive-01-die-Werror.patch b/packages/compress/libarchive/patches/libarchive-01-die-Werror.patch
new file mode 100644
index 00000000000..9a831f2a2ec
--- /dev/null
+++ b/packages/compress/libarchive/patches/libarchive-01-die-Werror.patch
@@ -0,0 +1,37 @@
+From f3c2f0ca7916288c72da07a2c3352b85b8f96e55 Mon Sep 17 00:00:00 2001
+From: Arne Morten Kvarving
+Date: Sat, 11 Nov 2017 23:42:40 +0100
+Subject: [PATCH] die Werror
+
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 73bf07b..08e8f49 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -96,7 +96,7 @@ IF (CMAKE_C_COMPILER_ID MATCHES "^GNU$")
+ #################################################################
+ # Set compile flags for debug build.
+ # This is added into CMAKE_C_FLAGS when CMAKE_BUILD_TYPE is "Debug"
+- SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Werror")
++ #SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Werror")
+ SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wextra")
+ SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wunused")
+ SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wshadow")
+@@ -112,7 +112,7 @@ IF (CMAKE_C_COMPILER_ID MATCHES "^Clang$")
+ # Set compile flags for debug build.
+ # This is added into CMAKE_C_FLAGS when CMAKE_BUILD_TYPE is "Debug"
+ SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -g")
+- SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Werror")
++ #SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Werror")
+ SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wextra")
+ SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wunused")
+ SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -Wshadow")
+@@ -133,7 +133,7 @@
+ SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -qflag=w:w")
+ SET(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -qinfo=pro:use")
+ ENDIF(CMAKE_C_COMPILER_ID MATCHES "^XL$")
+-IF (MSVC)
++IF (0)
+ #################################################################
+ # Set compile flags for debug build.
+ # This is added into CMAKE_C_FLAGS when CMAKE_BUILD_TYPE is "Debug"
diff --git a/packages/compress/libarchive/patches/libarchive-02-static-please.patch b/packages/compress/libarchive/patches/libarchive-02-static-please.patch
new file mode 100644
index 00000000000..fb2bbc275a2
--- /dev/null
+++ b/packages/compress/libarchive/patches/libarchive-02-static-please.patch
@@ -0,0 +1,14 @@
+--- libarchive/libarchive/CMakeLists.txt
++++ libarchive/libarchive/CMakeLists.txt
+@@ -224,9 +224,9 @@
+ ENDIF()
+
+ # Libarchive is a shared library
+-ADD_LIBRARY(archive SHARED ${libarchive_SOURCES} ${include_HEADERS})
++ADD_LIBRARY(archive STATIC ${libarchive_SOURCES} ${include_HEADERS})
+ TARGET_LINK_LIBRARIES(archive ${ADDITIONAL_LIBS})
+-SET_TARGET_PROPERTIES(archive PROPERTIES SOVERSION ${SOVERSION})
++SET_TARGET_PROPERTIES(archive PROPERTIES COMPILE_DEFINITIONS LIBARCHIVE_STATIC)
+
+ # archive_static is a static library
+ ADD_LIBRARY(archive_static STATIC ${libarchive_SOURCES} ${include_HEADERS})
diff --git a/packages/compress/lz4/package.mk b/packages/compress/lz4/package.mk
new file mode 100644
index 00000000000..359797af93e
--- /dev/null
+++ b/packages/compress/lz4/package.mk
@@ -0,0 +1,37 @@
+################################################################################
+# This file is part of LibreELEC - https://libreelec.tv
+# Copyright (C) 2017-present Team LibreELEC
+#
+# LibreELEC is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 2 of the License, or
+# (at your option) any later version.
+#
+# LibreELEC is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with LibreELEC. If not, see <http://www.gnu.org/licenses/>.
+################################################################################ + +PKG_NAME="lz4" +PKG_VERSION="1.8.0" +PKG_SHA256="2ca482ea7a9bb103603108b5a7510b7592b90158c151ff50a28f1ca8389fccf6" +PKG_ARCH="any" +PKG_LICENSE="GPL" +PKG_SITE="https://github.com/lz4/lz4" +PKG_URL="https://github.com/lz4/lz4/archive/v$PKG_VERSION.tar.gz" +PKG_DEPENDS_HOST="toolchain" +PKG_DEPENDS_TARGET="toolchain" +PKG_SECTION="compress" +PKG_SHORTDESC="lz4 data compressor/decompressor" + +PKG_CMAKE_SCRIPT="$PKG_BUILD/contrib/cmake_unofficial/CMakeLists.txt" + +PKG_CMAKE_OPTS_TARGET="-DBUILD_SHARED_LIBS=0 -DCMAKE_POSITION_INDEPENDENT_CODE=0" + +post_makeinstall_target() { + rm -rf $INSTALL +} diff --git a/packages/compress/xz/package.mk b/packages/compress/xz/package.mk index 225b7b61bdc..fbc70f8b4fa 100644 --- a/packages/compress/xz/package.mk +++ b/packages/compress/xz/package.mk @@ -36,3 +36,9 @@ PKG_CONFIGURE_OPTS_HOST="--disable-shared --enable-static \ --enable-lzma-links \ --disable-scripts \ --disable-nls" + +PKG_CONFIGURE_OPTS_TARGET="--disable-shared --enable-static" + +post_makeinstall_target() { + rm -rf $INSTALL +} diff --git a/packages/compress/xz/patches/xz-01-init-uninitialized-variables.patch b/packages/compress/xz/patches/xz-01-init-uninitialized-variables.patch new file mode 100644 index 00000000000..0d88880702a --- /dev/null +++ b/packages/compress/xz/patches/xz-01-init-uninitialized-variables.patch @@ -0,0 +1,27 @@ +From 5cd389f1fe1fe095cdf555194df875ee3ab445cf Mon Sep 17 00:00:00 2001 +From: MilhouseVH +Date: Sun, 26 Nov 2017 22:21:15 +0000 +Subject: [PATCH] uninitialized variables build error + +--- + src/liblzma/lzma/lzma_encoder.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/src/liblzma/lzma/lzma_encoder.c b/src/liblzma/lzma/lzma_encoder.c +index ba9ce69..08e8c87 100644 +--- a/src/liblzma/lzma/lzma_encoder.c ++++ b/src/liblzma/lzma/lzma_encoder.c +@@ -359,8 +359,8 @@ lzma_lzma_encode(lzma_lzma1_encoder *restrict coder, lzma_mf *restrict mf, + // - UINT32_MAX: not a match but a literal + // Value ranges for len: + // - [MATCH_LEN_MIN, MATCH_LEN_MAX] +- uint32_t len; +- uint32_t back; ++ uint32_t len = 0; ++ uint32_t back = 0; + + if (coder->fast_mode) + lzma_lzma_optimum_fast(coder, mf, &back, &len); +-- +2.14.1 + diff --git a/packages/devel/libplist/package.mk b/packages/devel/libplist/package.mk index 02cf133f3d2..523e932c5d6 100644 --- a/packages/devel/libplist/package.mk +++ b/packages/devel/libplist/package.mk @@ -17,13 +17,13 @@ ################################################################################ PKG_NAME="libplist" -PKG_VERSION="1.12" -PKG_SHA256="0effdedcb3de128c4930d8c03a3854c74c426c16728b8ab5f0a5b6bdc0b644be" +PKG_VERSION="2.0.0" +PKG_SHA256="3a7e9694c2d9a85174ba1fa92417cfabaea7f6d19631e544948dc7e17e82f602" PKG_ARCH="any" PKG_LICENSE="GPL" -PKG_SITE="http://matt.colyer.name/projects/iphone-linux/" +PKG_SITE="http://www.libimobiledevice.org/" PKG_URL="http://www.libimobiledevice.org/downloads/$PKG_NAME-$PKG_VERSION.tar.bz2" -PKG_DEPENDS_TARGET="toolchain libxml2 glib" +PKG_DEPENDS_TARGET="toolchain glib" PKG_SECTION="devel" PKG_SHORTDESC="libplist: a library for manipulating Apple Binary and XML Property Lists" PKG_LONGDESC="libplist is a library for manipulating Apple Binary and XML Property Lists" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.modplug/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.modplug/package.mk index b19c93e02e0..12279bfe00f 100644 --- 
a/packages/mediacenter/kodi-binary-addons/audiodecoder.modplug/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.modplug/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="audiodecoder.modplug" -PKG_VERSION="72018cd" -PKG_SHA256="e799c0a7405c4df89058b91b0925f0e7860d750c1613e3ef38e141f12fa78904" +PKG_VERSION="63c6715" +PKG_SHA256="95a001229ff68420f0f0bd8424067b1daca44c312abf22739425d272fe167729" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.organya/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.organya/package.mk index 5cbeeb3ff27..aa264ddcb96 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.organya/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.organya/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="audiodecoder.organya" -PKG_VERSION="ff7ab78" -PKG_SHA256="21b363e4fd72ae9d696d18ee0728f5c53413634cfb6464d68ed1eb42427b0874" +PKG_VERSION="0f3d367" +PKG_SHA256="4e0125900881ab6a438e0e9d14bde5c8ed756ef845c5d5288cc00fedd581d99b" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.timidity/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.timidity/package.mk index e240dc79a3e..718c75cfc81 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.timidity/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.timidity/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="audiodecoder.timidity" -PKG_VERSION="ed61c04" -PKG_SHA256="5378463a7f30869d0f3ef659396fbd5d8cf6e62f3226307882293524899b80db" +PKG_VERSION="1e13049" +PKG_SHA256="bac11b90751d241bc191840b48327312b607dcac2149ef1d2855a09a84332a60" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audioencoder.flac/package.mk b/packages/mediacenter/kodi-binary-addons/audioencoder.flac/package.mk index ca82adda097..df513be7728 100644 --- a/packages/mediacenter/kodi-binary-addons/audioencoder.flac/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audioencoder.flac/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="audioencoder.flac" -PKG_VERSION="add8481" -PKG_SHA256="0afb2faeb025bec534df099b497dd085f05cb66e237d8259aa9c577dd14cfb05" +PKG_VERSION="817e0de" +PKG_SHA256="c122f4e09d38cfde167386376ed55760414ad4a742ea56f62e99d8306fe9194b" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/imagedecoder.raw/package.mk b/packages/mediacenter/kodi-binary-addons/imagedecoder.raw/package.mk index 2060310f81e..d2c7f9f42a9 100644 --- a/packages/mediacenter/kodi-binary-addons/imagedecoder.raw/package.mk +++ b/packages/mediacenter/kodi-binary-addons/imagedecoder.raw/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="imagedecoder.raw" -PKG_VERSION="aa45f0a" -PKG_SHA256="5883d0f49e0f88e00a13dfcccf622032f0e0df5b9f67e99747d98fd500bbffb8" +PKG_VERSION="87449b5" +PKG_SHA256="0254f48d67204a85bea6c1c310b61ce01e5f2cf970608d3f58d7e42c474e0804" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk 
b/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk index 12d8dec86b4..62c8000616e 100644 --- a/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk +++ b/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="inputstream.adaptive" -PKG_VERSION="853144d" -PKG_SHA256="cadc38ee93894b37603af30eb71f248fcf2df056d3cff9b840de1e895679d6e4" +PKG_VERSION="d2081b2" +PKG_SHA256="3032079ede0f234781b7cf929010ce63d8af458389dd188c36be925eb301b669" PKG_LICENSE="GPL" PKG_SITE="http://www.kodi.tv" PKG_URL="https://github.com/peak3d/inputstream.adaptive/archive/$PKG_VERSION.tar.gz" diff --git a/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk b/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk index d8af5a1e348..620197f5c07 100644 --- a/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk +++ b/packages/mediacenter/kodi-binary-addons/inputstream.rtmp/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="inputstream.rtmp" -PKG_VERSION="0702f7e" -PKG_SHA256="ab7a8d36c39dc7f5dd1925da2f5f94f5ee5bff9c24a14f9477ebcd761654de22" +PKG_VERSION="c772497" +PKG_SHA256="7408e26e43b08f9b57adb660ac56c6313bec65f01178c78b30280050d2e58a9d" PKG_LICENSE="GPL" PKG_SITE="http://www.kodi.tv" PKG_URL="https://github.com/notspiff/inputstream.rtmp/archive/$PKG_VERSION.tar.gz" diff --git a/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk b/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk index 3315e061055..cefcd1e69ea 100644 --- a/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk +++ b/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="peripheral.joystick" -PKG_VERSION="a5cc154" -PKG_SHA256="0582603842c82fcaecb66c0bf78e134a1be8cdd08f19d275b5217fbdc0963499" +PKG_VERSION="33b43ce" +PKG_SHA256="1554469f4fbcbb2a37de9c1ece6b1b41c9e71f1087a48cb7f9ec3ae7d425dc41" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.argustv/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.argustv/package.mk index aaac08fc1fb..dcdce4b2728 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.argustv/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.argustv/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.argustv" -PKG_VERSION="23cc0e8" -PKG_SHA256="e965a98240f6c7dee277a1a705ac5e26b138b2f1572572aea50bbfef92a54bf3" +PKG_VERSION="1a48789" +PKG_SHA256="236a55371cae180ec755be055238d7edb145aab9e9e918bd8b797344aa74709a" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.demo/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.demo/package.mk index ccacf0d41cd..fe8c8d9c585 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.demo/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.demo/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.demo" -PKG_VERSION="122dedd" -PKG_SHA256="6422a64924ae219dc6c12e1c12d04247aea92c9143d784a8535c75e0990e3934" +PKG_VERSION="94c4817" 
+PKG_SHA256="a2511806f593d8281631b2ac745d091ff9dd1f0b84e26c2e56f0fb41c9c5487c" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk index 3005edb2a11..03c7f5de8a9 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.dvbviewer/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.dvbviewer" -PKG_VERSION="afe584a" -PKG_SHA256="1df0174100a30df460351fa9d5e21a1d46d474234cdaf24d19ec7975c0b0defb" +PKG_VERSION="6129441" +PKG_SHA256="c924922900c4d7982ca826c666c467f541b311e8a61afe66a224f6c88690afd3" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.filmon/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.filmon/package.mk index 833c1999859..3c1c23c2f88 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.filmon/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.filmon/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.filmon" -PKG_VERSION="f67f905" -PKG_SHA256="2829be270876a460c7ec47af04b5217d2a3bfc63f92acae312bb030c96a719ca" +PKG_VERSION="2dee2ca" +PKG_SHA256="caf3bd4f31863584a72f60e176e3e07443a1ee748908bdf3e955b023a6caebbe" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.hdhomerun/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.hdhomerun/package.mk index 3f4ef8f2c7d..99712510108 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.hdhomerun/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.hdhomerun/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.hdhomerun" -PKG_VERSION="ddfe2cb" -PKG_SHA256="8f464cc4df525371c7d67426424857fc823170c82a38cb80b2d6f9ca2f70117d" +PKG_VERSION="3af3e91" +PKG_SHA256="d27003e108dea71d80f71649f9b2b98634d9093eb532f185e152c33719a648a9" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk index 0883c518682..d71bd2b94cd 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.hts" -PKG_VERSION="295893c" -PKG_SHA256="513d8b4c969915b17fee7b79da212a52fca1a9e8aa1d8c84c171a239728ee952" +PKG_VERSION="67fe2df" +PKG_SHA256="cd5bc330522a1a0d92bb7191dd296779ef2cddbb6f426235eed7eb359e1596e0" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk index ad2860b3c72..fa8b29c75cb 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.mediaportal.tvserver/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.mediaportal.tvserver" -PKG_VERSION="fa1069d" -PKG_SHA256="b998eb70fee844105c6a5dfe5663d4641be82d4da0f18050a02a62f6478e8e3a" +PKG_VERSION="6c35e88" 
+PKG_SHA256="6355e47381023aed857c63c43c54dd610a123a077c19b74991e004879569f113" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk index 5d6df221dbe..8f4b77e5d19 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.mythtv" -PKG_VERSION="d5a6aa3" -PKG_SHA256="21af28423221148cdd15a5a69261717e9847ee0c976d2da3fba37eea20153584" +PKG_VERSION="ffaa1b5" +PKG_SHA256="997b44e35aa1b422bf7306449fe480d3996eaf40fe82286740c4561132d937e8" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk index c9d75732e99..04e0ad8d32b 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.nextpvr" -PKG_VERSION="5b7caa2" -PKG_SHA256="9781a1b90287e146c4fed57a3604799dbad791cac7046827f449828a39d2077d" +PKG_VERSION="938bb48" +PKG_SHA256="5ccd5d2acef47e6fdffaabe0caee2d5ecc19e577682201dc8da5ca34e4f7e48c" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk index fd2cf0c53b6..b95736ebecc 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.njoy/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.njoy" -PKG_VERSION="cc1cb56" -PKG_SHA256="35425e762e780fc19759cdbc504a25f23be15e0da25a58c30056aeb9709061c1" +PKG_VERSION="4a5efef" +PKG_SHA256="dd03dc9882a127053a0b255eaaedd467ff4822362d2d08eb0d51908192fb42be" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.octonet/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.octonet/package.mk index 9d005042e57..3df8a4ec6f4 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.octonet/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.octonet/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.octonet" -PKG_VERSION="f1f8d44" -PKG_SHA256="eb2e90750ec648d2fbecfcf982f7e4a55368f2e129da3df60d5fd8bf0b85d802" +PKG_VERSION="c4af00a" +PKG_SHA256="4a62a84c957517044e1d44a40ae5db93576e8c0244acebf9c3a37203920202f9" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.pctv/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.pctv/package.mk index 00f35a7900c..30795a062ee 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.pctv/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.pctv/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.pctv" -PKG_VERSION="c6d10cb" -PKG_SHA256="686c63860a0a94bbd27a703debe5fcbcf866b171fd9ce89bc97fc1547c4542a2" +PKG_VERSION="6484615" +PKG_SHA256="a5d475207b504e59190d0659b5477d3a3ec0430c1c0c1c6420d51a500ac27d6d" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git 
a/packages/mediacenter/kodi-binary-addons/pvr.stalker/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.stalker/package.mk index 146f0d7f1e7..e8fdf12f5a7 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.stalker/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.stalker/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.stalker" -PKG_VERSION="5f53d18" -PKG_SHA256="a73dc965734eb5fd17580f3d1f8e27f87da1c3f5fc490eaa998a7dbf0856e44f" +PKG_VERSION="44025a1" +PKG_SHA256="70f279c473eb2b2432908760e9003cdf05d32037b64bb7eacf6429caa390a31e" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.teleboy/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.teleboy/package.mk index ad87bf1656f..67ec3e22f56 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.teleboy/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.teleboy/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.teleboy" -PKG_VERSION="2d092c4" -PKG_SHA256="9866fc70520bec037f6df27a74e64e36c8aaaa9807ad92b45ec0735298ecc89f" +PKG_VERSION="e431126" +PKG_SHA256="c4945fb73890b3be738985f74c0a5b6e0f99ca337e2ba0d97397f23f6ec7423d" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk index 627e44ad2d2..fe121a3643c 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.vbox/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.vbox" -PKG_VERSION="56052da" -PKG_SHA256="96c5cd2e3fb2d0d8cdb6f6c9d1afa3cfccb65604407eaac531ca784f01387a9b" +PKG_VERSION="619c32b" +PKG_SHA256="d9b4a4f1053dad95fc44fbfeaf69b719931a0eb20cd6f3ca4dc911f76d483780" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk index 2c1087a7d7d..06cbb9793c6 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.vdr.vnsi/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.vdr.vnsi" -PKG_VERSION="92e4c64" -PKG_SHA256="5083e0763c6724809762c36ac22df08080bde5bfd1ec73dfef2b74ea647210b3" +PKG_VERSION="0ec3e77" +PKG_SHA256="f77fe1049233de6b7ff0d883783db0da9c98705221ba62cea208a86cf656993c" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk index f6fd567697a..404dea96bac 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="pvr.vuplus" -PKG_VERSION="eb0d16d" -PKG_SHA256="1aaf83aa6ebfbb22d8deca7b856a794eeaeda5e738c06a7cd972a37419816689" +PKG_VERSION="7ea6b21" +PKG_SHA256="48ea86488ea9a7faf8baeed4cf0340dfb6c16c3ee2e6a1d1366d3c208dd712bd" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk index 
12fc511840e..b8e872a742c 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.wmc/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="pvr.wmc"
-PKG_VERSION="02edba6"
-PKG_SHA256="7c7f526e8ff6d19a019f01e5b7a979ef7cbdd909ebe3a13f9e666fc282db9be5"
+PKG_VERSION="55e701b"
+PKG_SHA256="9e3dcc8d96934bc2959ebc2e9e89dbcce8f664b2a6cfdbdce8512fa68307d590"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk
index b07f8c4b46f..20b888099f9 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.zattoo/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="pvr.zattoo"
-PKG_VERSION="70fafbe"
-PKG_SHA256="acdc5b65a9f97bd1957e5ca7cae4d401a8621dd70e1880c27ede1602704fa5a7"
+PKG_VERSION="23d9993"
+PKG_SHA256="4c9caad94059093f16a59865b72645d88d91357cb067db570d1c94ea274fb673"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/screensaver.pingpong/package.mk b/packages/mediacenter/kodi-binary-addons/screensaver.pingpong/package.mk
index dbd3d3873ac..8d550ea364b 100644
--- a/packages/mediacenter/kodi-binary-addons/screensaver.pingpong/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/screensaver.pingpong/package.mk
@@ -17,8 +17,8 @@
 ################################################################################
 
 PKG_NAME="screensaver.pingpong"
-PKG_VERSION="3a27396"
-PKG_SHA256="e87d270e05b446174a937b0e1d468812476f332ed0c194387adbbdf2df1c2163"
+PKG_VERSION="ce794e9"
+PKG_SHA256="ab5cbd929f5125127474447a1a9c9848aa0a3186f8f7dc7cfeee97a5f2658e06"
 PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/vfs.libarchive/package.mk b/packages/mediacenter/kodi-binary-addons/vfs.libarchive/package.mk
new file mode 100644
index 00000000000..f63014029f1
--- /dev/null
+++ b/packages/mediacenter/kodi-binary-addons/vfs.libarchive/package.mk
@@ -0,0 +1,33 @@
+################################################################################
+# This file is part of LibreELEC - https://libreelec.tv
+# Copyright (C) 2017-present Team LibreELEC
+#
+# LibreELEC is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 2 of the License, or
+# (at your option) any later version.
+#
+# LibreELEC is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with LibreELEC. If not, see <http://www.gnu.org/licenses/>.
+################################################################################ + +PKG_NAME="vfs.libarchive" +PKG_VERSION="d3f3953" +PKG_SHA256="df094f8f217f25b9c288556cf1ca30c822d35d3e9dfc4973dd994e5f40508edd" +PKG_REV="1" +PKG_ARCH="any" +PKG_LICENSE="GPL" +PKG_SITE="http://www.kodi.tv" +PKG_URL="https://github.com/notspiff/vfs.libarchive/archive/$PKG_VERSION.tar.gz" +PKG_DEPENDS_TARGET="toolchain kodi-platform bzip2 libarchive lz4 lzo xz zlib" +PKG_SECTION="" +PKG_SHORTDESC="vfs.libarchive" +PKG_LONGDESC="vfs.libarchive" + +PKG_IS_ADDON="yes" +PKG_ADDON_TYPE="kodi.vfs" diff --git a/packages/mediacenter/kodi-binary-addons/vfs.rar/package.mk b/packages/mediacenter/kodi-binary-addons/vfs.rar/package.mk index 125daaa21f6..2b7b5cc91d2 100644 --- a/packages/mediacenter/kodi-binary-addons/vfs.rar/package.mk +++ b/packages/mediacenter/kodi-binary-addons/vfs.rar/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="vfs.rar" -PKG_VERSION="4eacaec" -PKG_SHA256="96d162e295c786d0e07cced1b7377c6ba07ea691389d5fac02aba7a12974d8b5" +PKG_VERSION="2904d06" +PKG_SHA256="0de99949939acad8753156cf5bc87a33a2cdc1459ee15ea450c5fee75dd73b70" PKG_REV="2" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/visualization.vsxu/package.mk b/packages/mediacenter/kodi-binary-addons/visualization.vsxu/package.mk index dd4779cf48b..c4fbf657b3b 100644 --- a/packages/mediacenter/kodi-binary-addons/visualization.vsxu/package.mk +++ b/packages/mediacenter/kodi-binary-addons/visualization.vsxu/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="visualization.vsxu" -PKG_VERSION="caedcbc" -PKG_SHA256="a471095637e8c24d68b37a8c13ced75b66ef4fba7c9dab8a25defefe1c1dc807" +PKG_VERSION="c3d8264" +PKG_SHA256="cc9a0e287cd272e83e99003d60599b1546265299c8e4f7a5c061cb3f8d4348cd" PKG_REV="2" PKG_ARCH="i386 x86_64" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi/config/guisettings.xml b/packages/mediacenter/kodi/config/guisettings.xml index 39d322489e2..0cf1b761475 100755 --- a/packages/mediacenter/kodi/config/guisettings.xml +++ b/packages/mediacenter/kodi/config/guisettings.xml @@ -1,4 +1,4 @@ - + 2 diff --git a/packages/mediacenter/kodi/package.mk b/packages/mediacenter/kodi/package.mk index 25598955bbe..4593d1f4f52 100644 --- a/packages/mediacenter/kodi/package.mk +++ b/packages/mediacenter/kodi/package.mk @@ -17,8 +17,8 @@ ################################################################################ PKG_NAME="kodi" -PKG_VERSION="055f6ee" -PKG_SHA256="bbdca237bafcdcf28c0d4a3e6309ca616ab9ca4a35f3fcef5de8bac3ad2f6cc6" +PKG_VERSION="c356fa8" +PKG_SHA256="a2ee06b44d6d3e6306aef3df15c57e1a14022bb80a0cb25f2c98caa4cdf4fe56" PKG_ARCH="any" PKG_LICENSE="GPL" PKG_SITE="http://www.kodi.tv" @@ -220,6 +220,7 @@ PKG_CMAKE_OPTS_TARGET="-DNATIVEPREFIX=$TOOLCHAIN \ -DENABLE_EVENTCLIENTS=ON \ -DENABLE_LDGOLD=ON \ -DENABLE_DEBUGFISSION=OFF \ + -DENABLE_APP_AUTONAME=OFF \ $KODI_ARCH \ $KODI_NEON \ $KODI_VDPAU \ diff --git a/packages/mediacenter/kodi/patches/kodi-100.07-disable-minimize.patch b/packages/mediacenter/kodi/patches/kodi-100.07-disable-minimize.patch index 9c140bbed4b..877605afd89 100644 --- a/packages/mediacenter/kodi/patches/kodi-100.07-disable-minimize.patch +++ b/packages/mediacenter/kodi/patches/kodi-100.07-disable-minimize.patch @@ -15,7 +15,7 @@ index 95ae98c..9aca1e3 100644 void CApplication::Minimize() { -- g_Windowing.Minimize(); +- 
CServiceBroker::GetWinSystem().Minimize(); } PLAYERCOREID CApplication::GetCurrentPlayer() diff --git a/packages/mediacenter/kodi/patches/kodi-999.99-PR12936.patch b/packages/mediacenter/kodi/patches/kodi-999.99-PR12936.patch deleted file mode 100644 index 09cb77d55bb..00000000000 --- a/packages/mediacenter/kodi/patches/kodi-999.99-PR12936.patch +++ /dev/null @@ -1,178 +0,0 @@ -From 64b82bba85398be323e61cba3d559997752f9e6f Mon Sep 17 00:00:00 2001 -From: wsnipex -Date: Thu, 19 Oct 2017 11:47:12 +0200 -Subject: [PATCH 1/2] [cmake] support building with ninja on posix platforms - ---- - addons/xbmc.json/addon.xml.in | 2 +- - cmake/scripts/common/GenerateVersionedFiles.cmake | 10 +++++----- - cmake/scripts/common/Macros.cmake | 10 ++++++++-- - xbmc/interfaces/json-rpc/schema/CMakeLists.txt | 1 + - xbmc/interfaces/json-rpc/schema/GenerateAddonXml.cmake | 5 +++-- - xbmc/interfaces/json-rpc/schema/version.txt | 2 +- - 6 files changed, 19 insertions(+), 11 deletions(-) - -diff --git a/addons/xbmc.json/addon.xml.in b/addons/xbmc.json/addon.xml.in -index fbd2923a8b02..6f09fa24126d 100644 ---- a/addons/xbmc.json/addon.xml.in -+++ b/addons/xbmc.json/addon.xml.in -@@ -1,5 +1,5 @@ - -- -+ - - - -diff --git a/cmake/scripts/common/GenerateVersionedFiles.cmake b/cmake/scripts/common/GenerateVersionedFiles.cmake -index 90b2173e6a4d..011f4956f1fe 100644 ---- a/cmake/scripts/common/GenerateVersionedFiles.cmake -+++ b/cmake/scripts/common/GenerateVersionedFiles.cmake -@@ -13,12 +13,11 @@ endfunction() - - # add-on xml's - file(GLOB ADDON_XML_IN_FILE ${CORE_SOURCE_DIR}/addons/*/addon.xml.in) --foreach(loop_var ${ADDON_XML_IN_FILE}) -- # prevent 'xbmc.json'; will be obtained from 'xbmc/interfaces/json-rpc/schema/CMakeLists.txt'. -- if(loop_var MATCHES "xbmc.json") -- continue() -- endif() - -+# remove 'xbmc.json', will be created from 'xbmc/interfaces/json-rpc/schema/CMakeLists.txt' -+list(REMOVE_ITEM ADDON_XML_IN_FILE xbmc.json) -+ -+foreach(loop_var ${ADDON_XML_IN_FILE}) - list(GET loop_var 0 xml_name) - - string(REPLACE "/addon.xml.in" "" source_dir ${xml_name}) -@@ -35,4 +34,5 @@ foreach(loop_var ${ADDON_XML_IN_FILE}) - unset(xml_name) - endforeach() - -+ - generate_versioned_file(xbmc/CompileInfo.cpp.in ${CORE_BUILD_DIR}/xbmc/CompileInfo.cpp) -diff --git a/cmake/scripts/common/Macros.cmake b/cmake/scripts/common/Macros.cmake -index 205117720c83..10ed15163678 100644 ---- a/cmake/scripts/common/Macros.cmake -+++ b/cmake/scripts/common/Macros.cmake -@@ -643,6 +643,7 @@ endfunction() - # APP_VERSION - the app version (${APP_VERSION_MAJOR}.${APP_VERSION_MINOR}-${APP_VERSION_TAG}) - # APP_ADDON_API - the addon API version in the form of 16.9.702 - # FILE_VERSION - file version in the form of 16,9,702,0 - Windows only -+# JSONRPC_VERSION - the json api version in the form of 8.3.0 - # - # Set various variables defined in "versions.h" - macro(core_find_versions) -@@ -657,8 +658,9 @@ macro(core_find_versions) - - include(CMakeParseArguments) - core_file_read_filtered(version_list ${CORE_SOURCE_DIR}/version.txt) -- string(REPLACE " " ";" version_list "${version_list}") -- cmake_parse_arguments(APP "" "APP_NAME;COMPANY_NAME;WEBSITE;VERSION_MAJOR;VERSION_MINOR;VERSION_TAG;VERSION_CODE;ADDON_API;APP_PACKAGE" "" ${version_list}) -+ core_file_read_filtered(json_version ${CORE_SOURCE_DIR}/xbmc/interfaces/json-rpc/schema/version.txt) -+ string(REPLACE " " ";" version_list "${version_list} ${json_version}") -+ cmake_parse_arguments(APP "" 
"APP_NAME;COMPANY_NAME;WEBSITE;VERSION_MAJOR;VERSION_MINOR;VERSION_TAG;VERSION_CODE;ADDON_API;APP_PACKAGE;JSONRPC_VERSION" "" ${version_list}) - - if(NOT ${APP_VERSION_CODE} MATCHES "^[0-9]+\\.[0-9][0-9]?\\.[0-9][0-9]?[0-9]?$") - message(FATAL_ERROR "VERSION_CODE was set to ${APP_VERSION_CODE} in version.txt, but it has to match '^\\d+\\.\\d{1,2}\\.\\d{1,3}$'") -@@ -674,6 +676,7 @@ macro(core_find_versions) - string(TOLOWER ${APP_VERSION_TAG} APP_VERSION_TAG_LC) - endif() - string(REPLACE "." "," FILE_VERSION ${APP_ADDON_API}.0) -+ set(JSONRPC_VERSION ${APP_JSONRPC_VERSION}) - - # Set defines used in addon.xml.in and read from versions.h to set add-on - # version parts automatically -@@ -698,6 +701,9 @@ macro(core_find_versions) - if(NOT DEFINED APP_VERSION_MAJOR OR NOT DEFINED APP_VERSION_MINOR) - message(FATAL_ERROR "Could not determine app version! Make sure that ${CORE_SOURCE_DIR}/version.txt exists") - endif() -+ if(NOT DEFINED JSONRPC_VERSION) -+ message(FATAL_ERROR "Could not determine json-rpc version! Make sure that ${CORE_SOURCE_DIR}/xbmc/interfaces/json-rpc/schema/version.txt exists") -+ endif() - endmacro() - - # add-on xml's -diff --git a/xbmc/interfaces/json-rpc/schema/CMakeLists.txt b/xbmc/interfaces/json-rpc/schema/CMakeLists.txt -index aa6142bc7718..a4d5583fdbed 100644 ---- a/xbmc/interfaces/json-rpc/schema/CMakeLists.txt -+++ b/xbmc/interfaces/json-rpc/schema/CMakeLists.txt -@@ -14,6 +14,7 @@ add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/addons/xbmc.json/addon.xml - COMMAND ${CMAKE_COMMAND} - -DCMAKE_SOURCE_DIR=${CMAKE_SOURCE_DIR} - -DCORE_BINARY_DIR=${CMAKE_BINARY_DIR} -+ -DCORE_SYSTEM_NAME=${CORE_SYSTEM_NAME} - -P ${CMAKE_CURRENT_SOURCE_DIR}/GenerateAddonXml.cmake - WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} - DEPENDS ${JSON_SRCS} ${CMAKE_SOURCE_DIR}/addons/xbmc.json/addon.xml.in -diff --git a/xbmc/interfaces/json-rpc/schema/GenerateAddonXml.cmake b/xbmc/interfaces/json-rpc/schema/GenerateAddonXml.cmake -index 53afaf8272f3..7f0817b6a801 100644 ---- a/xbmc/interfaces/json-rpc/schema/GenerateAddonXml.cmake -+++ b/xbmc/interfaces/json-rpc/schema/GenerateAddonXml.cmake -@@ -1,5 +1,6 @@ --file(STRINGS ${CMAKE_SOURCE_DIR}/xbmc/interfaces/json-rpc/schema/version.txt jsonrpc_version) -+include(${CMAKE_SOURCE_DIR}/cmake/scripts/common/Macros.cmake) -+core_find_versions() - --execute_process(COMMAND ${CMAKE_COMMAND} -E remove ${CORE_BINARY_DIR}/addons/xbmc.json/addon.xml) -+file(REMOVE ${CORE_BINARY_DIR}/addons/xbmc.json/addon.xml) - configure_file(${CMAKE_SOURCE_DIR}/addons/xbmc.json/addon.xml.in - ${CORE_BINARY_DIR}/addons/xbmc.json/addon.xml @ONLY) -diff --git a/xbmc/interfaces/json-rpc/schema/version.txt b/xbmc/interfaces/json-rpc/schema/version.txt -index 2bf50aaf17a6..7b9e4ea4acce 100644 ---- a/xbmc/interfaces/json-rpc/schema/version.txt -+++ b/xbmc/interfaces/json-rpc/schema/version.txt -@@ -1 +1 @@ --8.3.0 -+JSONRPC_VERSION 8.3.0 - -From 1cea4c73d24af3ed22789ece095379c66269fa6c Mon Sep 17 00:00:00 2001 -From: wsnipex -Date: Wed, 1 Nov 2017 20:20:19 +0100 -Subject: [PATCH 2/2] [JsonSchemabuilder] adjust version parsing - ---- - tools/depends/native/JsonSchemaBuilder/CMakeLists.txt | 2 ++ - tools/depends/native/JsonSchemaBuilder/src/JsonSchemaBuilder.cpp | 3 ++- - tools/depends/native/JsonSchemaBuilder/src/Makefile.am | 1 + - 3 files changed, 5 insertions(+), 1 deletion(-) - -diff --git a/tools/depends/native/JsonSchemaBuilder/CMakeLists.txt b/tools/depends/native/JsonSchemaBuilder/CMakeLists.txt -index 4fe8fdce40e8..783b8a3c5801 100644 ---- 
a/tools/depends/native/JsonSchemaBuilder/CMakeLists.txt -+++ b/tools/depends/native/JsonSchemaBuilder/CMakeLists.txt -@@ -1,3 +1,5 @@ - set(SOURCES src/JsonSchemaBuilder.cpp) - -+set(CMAKE_CXX_STANDARD 11) -+set(CMAKE_CXX_STANDARD_REQUIRED ON) - add_executable(JsonSchemaBuilder ${SOURCES}) -diff --git a/tools/depends/native/JsonSchemaBuilder/src/JsonSchemaBuilder.cpp b/tools/depends/native/JsonSchemaBuilder/src/JsonSchemaBuilder.cpp -index a267fd9d346e..9a8acdbd9bf5 100644 ---- a/tools/depends/native/JsonSchemaBuilder/src/JsonSchemaBuilder.cpp -+++ b/tools/depends/native/JsonSchemaBuilder/src/JsonSchemaBuilder.cpp -@@ -22,6 +22,7 @@ - #include - #include - #include -+#include - - using namespace std; - -@@ -29,7 +30,7 @@ void print_version(ifstream &in, ofstream &out) - { - string line; - if (getline(in, line)) -- out << line; -+ out << regex_replace(line, regex("(\\s+)?JSONRPC_VERSION\\s+|(\\s+)?#.*"), ""); - } - - void print_license(ifstream &in, ofstream &out) -diff --git a/tools/depends/native/JsonSchemaBuilder/src/Makefile.am b/tools/depends/native/JsonSchemaBuilder/src/Makefile.am -index 1d5e071bcaab..99454a1005e8 100644 ---- a/tools/depends/native/JsonSchemaBuilder/src/Makefile.am -+++ b/tools/depends/native/JsonSchemaBuilder/src/Makefile.am -@@ -1,3 +1,4 @@ - bin_PROGRAMS = JsonSchemaBuilder - JsonSchemaBuilder_SOURCES = JsonSchemaBuilder.cpp -+AM_CXXFLAGS = -O2 -std=c++11 - diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk index 0b58ddb9d57..c203364dc18 100644 --- a/packages/multimedia/ffmpeg/package.mk +++ b/packages/multimedia/ffmpeg/package.mk @@ -18,8 +18,8 @@ PKG_NAME="ffmpeg" # Current branch is: release/3.4-kodi -PKG_VERSION="d056a4c" -PKG_SHA256="c041ac2837473fdafbcbc2605d4104f7a3b9ba4d19e21a27487e3eb8581f7b6c" +PKG_VERSION="d413620" +PKG_SHA256="c02de2197f8b70544f018e83f48c1bed2a1b47e1a1aa34ef59d9167fb0d2090a" PKG_ARCH="any" PKG_LICENSE="LGPLv2.1+" PKG_SITE="https://ffmpeg.org" @@ -87,8 +87,11 @@ pre_configure_target() { strip_gold if [ "$KODIPLAYER_DRIVER" = "bcm2835-driver" ]; then - CFLAGS="-I$SYSROOT_PREFIX/usr/include/interface/vcos/pthreads -I$SYSROOT_PREFIX/usr/include/interface/vmcs_host/linux -DRPI=1 $CFLAGS" + CFLAGS="-I$SYSROOT_PREFIX/usr/include/interface/vcos/pthreads -I$SYSROOT_PREFIX/usr/include/interface/vmcs_host/linux $CFLAGS" FFMPEG_LIBS="-lbcm_host -lvcos -lvchiq_arm -lmmal -lmmal_core -lmmal_util -lvcsm" + FFMPEG_RPI="--enable-rpi" + else + FFMPEG_RPI="--disable-rpi" fi } @@ -149,6 +152,7 @@ configure_target() { --disable-crystalhd \ $FFMPEG_VAAPI \ $FFMPEG_VDPAU \ + $FFMPEG_RPI \ --disable-dxva2 \ --enable-runtime-cpudetect \ $FFMPEG_TABLES \ @@ -175,6 +179,8 @@ configure_target() { --enable-filters \ --disable-avisynth \ --enable-bzlib \ + --disable-lzma \ + --disable-alsa \ --disable-frei0r \ --disable-libopencore-amrnb \ --disable-libopencore-amrwb \ diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-00.0000-fix-version-script.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-00.0000-fix-version-script.patch deleted file mode 100644 index 18e6501a884..00000000000 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-00.0000-fix-version-script.patch +++ /dev/null @@ -1,14 +0,0 @@ -diff --git a/ffbuild/version.sh b/ffbuild/version.sh -index edc4dd3..d2b90a9 100755 ---- a/ffbuild/version.sh -+++ b/ffbuild/version.sh -@@ -16,6 +16,9 @@ fi - test "$revision" || revision=$(cd "$1" && - git log -1 --pretty=format:"git-%cd-%h" --date=short 2> /dev/null) - -+# ignore any current revision, which is just that of the 
LibreELEC repository -+revision= -+ - # Snapshots from gitweb are in a directory called ffmpeg-hhhhhhh or - # ffmpeg-HEAD-hhhhhhh. - if [ -z "$revision" ]; then diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch similarity index 100% rename from packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch rename to packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch deleted file mode 100644 index 5240cf58ce4..00000000000 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch +++ /dev/null @@ -1,24 +0,0 @@ -From 73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105 Mon Sep 17 00:00:00 2001 -From: Hendrik Leppkes -Date: Mon, 1 Sep 2014 11:39:09 +0200 -Subject: [PATCH] h264_parser: force grabing a new timestamp until a frame - start was found - ---- - libavcodec/h264_parser.c | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c -index 2fd3f2b..7165652 100644 ---- a/libavcodec/h264_parser.c -+++ b/libavcodec/h264_parser.c -@@ -525,6 +525,9 @@ static int h264_parse(AVCodecParserContext *s, - } else { - next = h264_find_frame_end(p, buf, buf_size); - -+ if (next == END_NOT_FOUND && pc->frame_start_found == 0) -+ s->fetch_timestamp = 1; -+ - if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) { - *poutbuf = NULL; - *poutbuf_size = 0; diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch new file mode 100644 index 00000000000..c3c09d6325d --- /dev/null +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch @@ -0,0 +1,48 @@ +From 214a8ccc1489db28ce6cec2739365d7eebbdb0f9 Mon Sep 17 00:00:00 2001 +From: popcornmix +Date: Fri, 5 Jun 2015 22:48:33 +0100 +Subject: [PATCH] mpeg4video: Signal unsupported GMC with more than one warp + point + +--- + libavcodec/avcodec.h | 1 + + libavcodec/mpeg4videodec.c | 4 ++++ + 2 files changed, 5 insertions(+) + +diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h +index c207d3a784..08aa8112b1 100644 +--- a/libavcodec/avcodec.h ++++ b/libavcodec/avcodec.h +@@ -2967,6 +2967,7 @@ typedef struct AVCodecContext { + #define FF_BUG_MS 8192 ///< Work around various bugs in Microsoft's broken decoders. + #define FF_BUG_TRUNCATED 16384 + #define FF_BUG_IEDGE 32768 ++#define FF_BUG_GMC_UNSUPPORTED (1<<30) + + /** + * strictly follow the standard (MPEG-4, ...). 
+diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c +index cd39131d55..d8c8227cb4 100644 +--- a/libavcodec/mpeg4videodec.c ++++ b/libavcodec/mpeg4videodec.c +@@ -2250,6 +2250,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) + + if (ctx->divx_version >= 0) + s->workaround_bugs |= FF_BUG_HPEL_CHROMA; ++ ++ if (ctx->num_sprite_warping_points > 1) ++ s->workaround_bugs |= FF_BUG_GMC_UNSUPPORTED; + } + + if (s->workaround_bugs & FF_BUG_STD_QPEL) { +@@ -2274,6 +2277,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) + s->workaround_bugs, ctx->lavc_build, ctx->xvid_build, + ctx->divx_version, ctx->divx_build, s->divx_packed ? "p" : ""); + ++ avctx->workaround_bugs = s->workaround_bugs; + if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 && + s->codec_id == AV_CODEC_ID_MPEG4 && + avctx->idct_algo == FF_IDCT_AUTO) { +-- +2.14.1 + diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch similarity index 60% rename from packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch rename to packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch index 7caefad8a7e..5104bfd261b 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch @@ -18,23 +18,53 @@ index dabb51762d..0b1f739d22 100644 /ffmpeg /ffplay /ffprobe +diff --git a/configure b/configure +index 18d80ee87a..d519af9074 100755 +--- a/configure ++++ b/configure +@@ -313,6 +313,7 @@ External library support: + --enable-libmfx enable Intel MediaSDK (AKA Quick Sync Video) code via libmfx [no] + --enable-libnpp enable Nvidia Performance Primitives-based code [no] + --enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no] ++ --enable-rpi enable other rpi specific stuff [no] + --disable-nvenc disable Nvidia video encoding code [autodetect] + --enable-omx enable OpenMAX IL code [no] + --enable-omx-rpi enable OpenMAX IL code for Raspberry Pi [no] +@@ -1682,6 +1683,7 @@ FEATURE_LIST=" + gray + hardcoded_tables + omx_rpi ++ rpi + runtime_cpudetect + safe_bitstream_reader + shared +@@ -2500,6 +2502,8 @@ hap_decoder_select="snappy texturedsp" + hap_encoder_deps="libsnappy" + hap_encoder_select="texturedspenc" + hevc_decoder_select="bswapdsp cabac golomb hevcparse videodsp" ++hevc_rpi_decoder_deps="rpi" ++hevc_rpi_decoder_select="hevc_decoder" + huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp" + huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp" + iac_decoder_select="imc_decoder" diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c -index 3ee31473dc..aac5b2f6c3 100644 +index 3ee31473dc..312864d737 100644 --- a/fftools/ffmpeg.c +++ b/fftools/ffmpeg.c -@@ -23,6 +23,11 @@ - * multimedia converter based on the FFmpeg libraries +@@ -24,6 +24,12 @@ */ -+#ifdef RPI + #include "config.h" ++ ++#if CONFIG_RPI +#define RPI_DISPLAY +#define RPI_DISPLAY_ALL 0 +#endif + - #include "config.h" #include #include -@@ -43,6 +48,7 @@ + #include +@@ -43,6 +49,7 @@ #include "libavformat/avformat.h" #include "libavdevice/avdevice.h" #include "libswresample/swresample.h" @@ -42,7 +72,7 @@ index 3ee31473dc..aac5b2f6c3 100644 #include "libavutil/opt.h" #include "libavutil/channel_layout.h" #include "libavutil/parseutils.h" -@@ -69,6 +75,25 @@ +@@ -69,6 +76,25 @@ # include "libavfilter/buffersrc.h" # include 
"libavfilter/buffersink.h" @@ -68,7 +98,7 @@ index 3ee31473dc..aac5b2f6c3 100644 #if HAVE_SYS_RESOURCE_H #include #include -@@ -165,6 +190,241 @@ static int restore_tty; +@@ -165,6 +191,241 @@ static int restore_tty; static void free_input_threads(void); #endif @@ -310,7 +340,7 @@ index 3ee31473dc..aac5b2f6c3 100644 /* sub2video hack: Convert subtitles to video with alpha to insert them in filter graphs. This is a temporary solution until libavfilter gets real subtitles support. -@@ -575,6 +835,11 @@ static void ffmpeg_cleanup(int ret) +@@ -575,6 +836,11 @@ static void ffmpeg_cleanup(int ret) avformat_close_input(&input_files[i]->ctx); av_freep(&input_files[i]); } @@ -322,7 +352,7 @@ index 3ee31473dc..aac5b2f6c3 100644 for (i = 0; i < nb_input_streams; i++) { InputStream *ist = input_streams[i]; -@@ -586,7 +851,9 @@ static void ffmpeg_cleanup(int ret) +@@ -586,7 +852,9 @@ static void ffmpeg_cleanup(int ret) av_freep(&ist->filters); av_freep(&ist->hwaccel_device); av_freep(&ist->dts_buffer); @@ -333,7 +363,7 @@ index 3ee31473dc..aac5b2f6c3 100644 avcodec_free_context(&ist->dec_ctx); av_freep(&input_streams[i]); -@@ -617,6 +884,7 @@ static void ffmpeg_cleanup(int ret) +@@ -617,6 +885,7 @@ static void ffmpeg_cleanup(int ret) } term_exit(); ffmpeg_exited = 1; @@ -341,7 +371,7 @@ index 3ee31473dc..aac5b2f6c3 100644 } void remove_avoptions(AVDictionary **a, AVDictionary *b) -@@ -1052,6 +1320,15 @@ static void do_video_out(OutputFile *of, +@@ -1052,6 +1321,15 @@ static void do_video_out(OutputFile *of, if (ost->source_index >= 0) ist = input_streams[ost->source_index]; @@ -357,7 +387,7 @@ index 3ee31473dc..aac5b2f6c3 100644 frame_rate = av_buffersink_get_frame_rate(filter); if (frame_rate.num > 0 && frame_rate.den > 0) duration = 1/(av_q2d(frame_rate) * av_q2d(enc->time_base)); -@@ -2165,8 +2442,8 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame) +@@ -2165,8 +2443,8 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame) ifilter->channel_layout != frame->channel_layout; break; case AVMEDIA_TYPE_VIDEO: @@ -368,7 +398,7 @@ index 3ee31473dc..aac5b2f6c3 100644 break; } -@@ -2896,6 +3173,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) +@@ -2896,6 +3174,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) ist->dec_ctx->opaque = ist; ist->dec_ctx->get_format = get_format; ist->dec_ctx->get_buffer2 = get_buffer; @@ -396,52 +426,78 @@ index aacc185059..33c054294c 100644 ifilter->sample_aspect_ratio = frame->sample_aspect_ratio; ifilter->sample_rate = frame->sample_rate; +diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c +index 100fa76e46..93a1b8edaf 100644 +--- a/fftools/ffmpeg_opt.c ++++ b/fftools/ffmpeg_opt.c +@@ -706,11 +706,19 @@ static AVCodec *choose_decoder(OptionsContext *o, AVFormatContext *s, AVStream * + + MATCH_PER_STREAM_OPT(codec_names, str, codec_name, s, st); + if (codec_name) { ++ if (strcmp("hevc_rpi", codec_name) == 0) { ++ return avcodec_find_decoder_by_id_and_fmt(AV_CODEC_ID_HEVC, st->codecpar->format); ++ } + AVCodec *codec = find_codec_or_die(codec_name, st->codecpar->codec_type, 0); + st->codecpar->codec_id = codec->id; + return codec; + } else ++ { ++ if (st->codecpar->codec_id == AV_CODEC_ID_HEVC) { ++ return avcodec_find_decoder_by_id_and_fmt(st->codecpar->codec_id, st->codecpar->format); ++ } + return avcodec_find_decoder(st->codecpar->codec_id); ++ } + } + + /* Add all the streams from the given input file to the global diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index 
c4ec09b1c4..54297da0ea 100644 +index c4ec09b1c4..3b94d47e9a 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile -@@ -4,6 +4,15 @@ DESC = FFmpeg codec library +@@ -4,6 +4,7 @@ DESC = FFmpeg codec library HEADERS = avcodec.h \ avdct.h \ avfft.h \ -+ rpi_qpu.h \ -+ rpi_shader.h \ -+ rpi_shader_cmd.h \ -+ rpi_shader_template.h \ -+ rpi_shader_template_fn.h \ -+ rpi_mailbox.h \ -+ rpi_hevc_transform8.h \ -+ rpi_hevc_transform10.h \ + rpi_zc.h \ d3d11va.h \ dirac.h \ dv_profile.h \ -@@ -48,6 +57,11 @@ OBJS = allcodecs.o \ - resample.o \ - resample2.o \ - utils.o \ -+ rpi_qpu.o \ -+ rpi_shader.o \ -+ rpi_shader_template.o \ -+ rpi_mailbox.o \ -+ rpi_zc.o \ - vorbis_parser.o \ - xiph.o \ - -@@ -1143,3 +1157,30 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h +@@ -123,6 +124,7 @@ OBJS-$(CONFIG_QSVDEC) += qsvdec.o + OBJS-$(CONFIG_QSVENC) += qsvenc.o + OBJS-$(CONFIG_RANGECODER) += rangecoder.o + OBJS-$(CONFIG_RDFT) += rdft.o ++OBJS-$(CONFIG_RPI) += rpi_qpu.o rpi_mailbox.o rpi_zc.o + OBJS-$(CONFIG_RV34DSP) += rv34dsp.o + OBJS-$(CONFIG_SHARED) += log2_tab.o reverse.o + OBJS-$(CONFIG_SINEWIN) += sinewin.o sinewin_fixed.o +@@ -351,6 +353,12 @@ OBJS-$(CONFIG_HAP_ENCODER) += hapenc.o hap.o + OBJS-$(CONFIG_HEVC_DECODER) += hevcdec.o hevc_mvs.o \ + hevc_cabac.o hevc_refs.o hevcpred.o \ + hevcdsp.o hevc_filter.o hevc_data.o ++OBJS-$(CONFIG_HEVC_RPI_DECODER) += rpi_hevcdec.o rpi_hevc_mvs.o \ ++ rpi_hevc_cabac.o rpi_hevc_refs.o rpi_hevcpred.o \ ++ rpi_hevcdsp.o rpi_hevc_filter.o rpi_hevc_data.o \ ++ rpi_hevc_shader.o rpi_hevc_shader_template.o \ ++ rpi_hevc_parse.o h2645_parse.o rpi_hevc_ps.o \ ++ rpi_hevc_sei.o rpi_hevc_data.o + OBJS-$(CONFIG_HEVC_CUVID_DECODER) += cuvid.o + OBJS-$(CONFIG_HEVC_MEDIACODEC_DECODER) += mediacodecdec.o + OBJS-$(CONFIG_HEVC_NVENC_ENCODER) += nvenc_hevc.o +@@ -1143,3 +1151,31 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h endif + ++ifdef CONFIG_HEVC_RPI_DECODER +QASM_PY := ../local/bin/qasm.py +VASMVIDCORE := ../local/bin/vasmvidcore_std + +ifneq ("$(wildcard $(QASM_PY))","") -+$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm -+ $(QASM_PY) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@ ++$(SUBDIR)rpi_hevc_shader.c: $(SUBDIR)rpi_hevc_shader.qasm ++ $(QASM_PY) -mc_c:rpi_hevc_shader,rpi_hevc_shader,rpi_hevc_shader $< > $@ + -+$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm -+ $(QASM_PY) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ ++$(SUBDIR)rpi_hevc_shader.h: $(SUBDIR)rpi_hevc_shader.qasm ++ $(QASM_PY) -mc_h:rpi_hevc_shader,rpi_shader,rpi_hevc_shader $< > $@ +endif + +ifneq ("$(wildcard $(VASMVIDCORE))","") @@ -454,40 +510,52 @@ index c4ec09b1c4..54297da0ea 100644 + python pi-util/make_array.py $< +$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin + python pi-util/make_array.py $< -+ +endif + +$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h -+$(SUBDIR)hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h ++$(SUBDIR)rpi_hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_shader.h ++endif diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c -index 4f34312e67..bba96efade 100644 +index 4f34312e67..5361a22141 100644 --- a/libavcodec/allcodecs.c +++ b/libavcodec/allcodecs.c -@@ -731,6 +731,7 @@ static void register_all(void) - REGISTER_PARSER(H261, h261); - REGISTER_PARSER(H263, h263); - REGISTER_PARSER(H264, h264); -+ REGISTER_PARSER(H264_MVC, h264_mvc); - 
REGISTER_PARSER(HEVC, hevc); - REGISTER_PARSER(MJPEG, mjpeg); - REGISTER_PARSER(MLP, mlp); +@@ -222,6 +222,7 @@ static void register_all(void) + REGISTER_DECODER(H264_VDPAU, h264_vdpau); + #endif + REGISTER_ENCDEC (HAP, hap); ++ REGISTER_DECODER(HEVC_RPI, hevc_rpi); + REGISTER_DECODER(HEVC, hevc); + REGISTER_DECODER(HEVC_QSV, hevc_qsv); + REGISTER_DECODER(HEVC_RKMPP, hevc_rkmpp); diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile -index 1eeac5449e..7e23777f5d 100644 +index 1eeac5449e..64aca64e52 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile -@@ -134,9 +134,14 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ +@@ -40,6 +40,7 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \ + arm/sbrdsp_init_arm.o + OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o + OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o ++OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_arm.o + OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o + OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o + OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_init_arm.o +@@ -134,9 +135,18 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ -+ arm/hevc_misc_neon.o \ ++ arm/hevcdsp_idct_neon.o \ arm/hevcdsp_deblock_neon.o \ -+ arm/hevcdsp_epel_neon.o \ - arm/hevcdsp_idct_neon.o \ -- arm/hevcdsp_qpel_neon.o -+ arm/hevcdsp_cres_neon.o \ -+ arm/hevcdsp_res16_neon.o \ -+ arm/hevcdsp_qpel_neon.o \ -+ arm/hevcdsp_sao_neon.o +- arm/hevcdsp_idct_neon.o \ + arm/hevcdsp_qpel_neon.o ++NEON-OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_neon.o \ ++ arm/rpi_hevc_misc_neon.o \ ++ arm/rpi_hevcdsp_deblock_neon.o \ ++ arm/rpi_hevcdsp_epel_neon.o \ ++ arm/rpi_hevcdsp_idct_neon.o \ ++ arm/rpi_hevcdsp_res16_neon.o \ ++ arm/rpi_hevcdsp_qpel_neon.o \ ++ arm/rpi_hevcdsp_sao_neon.o \ ++ arm/rpi_hevcdsp_cres_neon.o NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ arm/rv40dsp_neon.o @@ -673,11 +741,11 @@ index fdbf86b45e..0a3980a1ef 100644 #endif /* HAVE_ARMV6T2_INLINE */ #endif /* AVCODEC_ARM_CABAC_H */ -diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h +diff --git a/libavcodec/arm/rpi_hevc_cabac.h b/libavcodec/arm/rpi_hevc_cabac.h new file mode 100644 index 0000000000..31d3c59205 --- /dev/null -+++ b/libavcodec/arm/hevc_cabac.h ++++ b/libavcodec/arm/rpi_hevc_cabac.h @@ -0,0 +1,491 @@ +/* + * This file is part of FFmpeg. 
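The registrations above are what make the new decoder reachable at run time, and order matters: REGISTER_DECODER(HEVC_RPI, ...) is inserted ahead of the stock hevc entry, so ID-based lookups should prefer the Pi decoder, while the ffmpeg_opt.c hunk earlier resolves an explicit -c:v hevc_rpi through the avcodec_find_decoder_by_id_and_fmt() helper added by this patch set. An application-level sketch of the same preference order, using only the public 3.x-era API (avcodec_register_all() must already have run):

    #include <libavcodec/avcodec.h>

    /* Sketch only: prefer the Pi HEVC decoder when the patched build
     * offers it, fall back to the stock software decoder otherwise. */
    static AVCodec *pick_hevc_decoder(void)
    {
        AVCodec *c = avcodec_find_decoder_by_name("hevc_rpi");
        if (!c)
            c = avcodec_find_decoder(AV_CODEC_ID_HEVC);
        return c;
    }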
@@ -1170,11 +1238,11 @@ index 0000000000..31d3c59205 +#endif /* HAVE_ARMV6T2_INLINE */ + +#endif /* AVCODEC_ARM_HEVC_CABAC_H */ -diff --git a/libavcodec/arm/hevc_idct_fn_neon.S b/libavcodec/arm/hevc_idct_fn_neon.S +diff --git a/libavcodec/arm/rpi_hevc_idct_fn_neon.S b/libavcodec/arm/rpi_hevc_idct_fn_neon.S new file mode 100644 -index 0000000000..380d3c8d3b +index 0000000000..91a7bd4f4f --- /dev/null -+++ b/libavcodec/arm/hevc_idct_fn_neon.S ++++ b/libavcodec/arm/rpi_hevc_idct_fn_neon.S @@ -0,0 +1,224 @@ +@ Included multiple times from hevc_idct_neon.S +@ Macros defined there @@ -1183,7 +1251,7 @@ index 0000000000..380d3c8d3b +#define DC_ADD (1 | (1 << (14 - BIT_DEPTH))) +#define TRN_SHIFT (20 - BIT_DEPTH) + -+function JOIN(ff_hevc_idct_4x4_dc_neon_, BIT_DEPTH), export=1 ++function JOIN(ff_hevc_rpi_idct_4x4_dc_neon_, BIT_DEPTH), export=1 + ldrsh r1, [r0] + add r1, #DC_ADD + asr r1, #DC_SHIFT @@ -1193,7 +1261,7 @@ index 0000000000..380d3c8d3b + bx lr +endfunc + -+function JOIN(ff_hevc_idct_8x8_dc_neon_, BIT_DEPTH), export=1 ++function JOIN(ff_hevc_rpi_idct_8x8_dc_neon_, BIT_DEPTH), export=1 + ldrsh r1, [r0] + add r1, #DC_ADD + asr r1, #DC_SHIFT @@ -1209,7 +1277,7 @@ index 0000000000..380d3c8d3b + bx lr +endfunc + -+function JOIN(ff_hevc_idct_16x16_dc_neon_, BIT_DEPTH), export=1 ++function JOIN(ff_hevc_rpi_idct_16x16_dc_neon_, BIT_DEPTH), export=1 + ldrsh r1, [r0] + add r1, #DC_ADD + asr r1, #DC_SHIFT @@ -1228,7 +1296,7 @@ index 0000000000..380d3c8d3b + bx lr +endfunc + -+function JOIN(ff_hevc_idct_32x32_dc_neon_, BIT_DEPTH), export=1 ++function JOIN(ff_hevc_rpi_idct_32x32_dc_neon_, BIT_DEPTH), export=1 + ldrsh r1, [r0] + add r1, #DC_ADD + asr r1, #DC_SHIFT @@ -1248,7 +1316,7 @@ index 0000000000..380d3c8d3b +endfunc + + -+function JOIN(ff_hevc_transform_4x4_neon_, BIT_DEPTH), export=1 ++function JOIN(ff_hevc_rpi_transform_4x4_neon_, BIT_DEPTH), export=1 + vpush {d8-d15} + vld1.16 {q14, q15}, [r0] // coeffs + ldr r3, =0x00240053 // 36 and 83 @@ -1273,7 +1341,7 @@ index 0000000000..380d3c8d3b + + + -+function JOIN(ff_hevc_transform_luma_4x4_neon_, BIT_DEPTH), export=1 ++function JOIN(ff_hevc_rpi_transform_luma_4x4_neon_, BIT_DEPTH), export=1 + vpush {d8-d15} + vld1.16 {q14, q15}, [r0] // coeffs + ldr r3, =0x4a // 74 @@ -1301,7 +1369,7 @@ index 0000000000..380d3c8d3b + + + -+function JOIN(ff_hevc_transform_8x8_neon_, BIT_DEPTH), export=1 ++function JOIN(ff_hevc_rpi_transform_8x8_neon_, BIT_DEPTH), export=1 + push {r4-r8} + vpush {d8-d15} + mov r5, #16 @@ -1400,11 +1468,11 @@ index 0000000000..380d3c8d3b +#undef DC_ADD +#undef TRN_SHIFT + -diff --git a/libavcodec/arm/hevc_misc_neon.S b/libavcodec/arm/hevc_misc_neon.S +diff --git a/libavcodec/arm/rpi_hevc_misc_neon.S b/libavcodec/arm/rpi_hevc_misc_neon.S new file mode 100644 index 0000000000..373576b4cb --- /dev/null -+++ b/libavcodec/arm/hevc_misc_neon.S ++++ b/libavcodec/arm/rpi_hevc_misc_neon.S @@ -0,0 +1,62 @@ +#include "libavutil/arm/asm.S" +#include "neon.S" @@ -1468,11 +1536,43 @@ index 0000000000..373576b4cb + +endfunc + -diff --git a/libavcodec/arm/hevcdsp_cres_neon.S b/libavcodec/arm/hevcdsp_cres_neon.S +diff --git a/libavcodec/arm/rpi_hevcdsp_arm.h b/libavcodec/arm/rpi_hevcdsp_arm.h +new file mode 100644 +index 0000000000..62b9326532 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_arm.h +@@ -0,0 +1,26 @@ ++/* ++ * This file is part of FFmpeg. 
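The idct_NxN_dc functions in rpi_hevc_idct_fn_neon.S above all follow one scalar recipe: round the lone DC coefficient once (DC_ADD and DC_SHIFT effectively fold the two rounding stages of the full inverse transform into a single shift) and splat the result across the block. A C model of what the NEON code computes, 8-bit case:

    #include <stdint.h>

    #define BIT_DEPTH 8
    #define DC_SHIFT  (15 - BIT_DEPTH)              /* 7 for 8-bit */
    #define DC_ADD    (1 | (1 << (14 - BIT_DEPTH))) /* rounding bias */

    /* n is 4, 8, 16 or 32; mirrors the ldrsh/add/asr prologue and the
     * vdup/vst fill loop of the assembly above. */
    static void idct_dc_ref(int16_t *coeffs, int n)
    {
        int dc = (coeffs[0] + DC_ADD) >> DC_SHIFT;
        for (int i = 0; i < n * n; i++)
            coeffs[i] = (int16_t)dc;
    }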
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_ARM_HEVCDSP_ARM_H ++#define AVCODEC_ARM_HEVCDSP_ARM_H ++ ++#include "libavcodec/rpi_hevcdsp.h" ++ ++void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth); ++ ++#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */ +diff --git a/libavcodec/arm/rpi_hevcdsp_cres_neon.S b/libavcodec/arm/rpi_hevcdsp_cres_neon.S new file mode 100644 -index 0000000000..bafefd4318 +index 0000000000..883cde35dc --- /dev/null -+++ b/libavcodec/arm/hevcdsp_cres_neon.S ++++ b/libavcodec/arm/rpi_hevcdsp_cres_neon.S @@ -0,0 +1,296 @@ +#include "libavutil/arm/asm.S" +#include "neon.S" @@ -1493,7 +1593,7 @@ index 0000000000..bafefd4318 +@ ptrdiff_t stride, [r2] +@ int dc_v) [r3] + -+function ff_hevc_add_residual_4x4_u_neon_8, export=1 ++function ff_hevc_rpi_add_residual_4x4_u_neon_8, export=1 + vld1.8 {d16}, [r0, :64], r2 + vld1.8 {d17}, [r0, :64], r2 + vld1.8 {d18}, [r0, :64], r2 @@ -1529,7 +1629,7 @@ index 0000000000..bafefd4318 +@ ptrdiff_t stride) [r2] +@ int dc_v) [r3] + -+function ff_hevc_add_residual_8x8_u_neon_8, export=1 ++function ff_hevc_rpi_add_residual_8x8_u_neon_8, export=1 + mov r12, #4 + vdup.16 q15, r3 +1: @@ -1560,7 +1660,7 @@ index 0000000000..bafefd4318 +@ ptrdiff_t stride) [r2] +@ int dc_v) [r3] + -+function ff_hevc_add_residual_16x16_u_neon_8, export=1 ++function ff_hevc_rpi_add_residual_16x16_u_neon_8, export=1 + mov r12, #16 + vdup.16 q15, r3 +1: @@ -1590,7 +1690,7 @@ index 0000000000..bafefd4318 +@ const int16_t *res, [r1] +@ ptrdiff_t stride) [r2] + -+function ff_hevc_add_residual_4x4_v_neon_8, export=1 ++function ff_hevc_rpi_add_residual_4x4_v_neon_8, export=1 + vld1.8 {d16}, [r0, :64], r2 + vld1.8 {d17}, [r0, :64], r2 + vld1.8 {d18}, [r0, :64], r2 @@ -1625,7 +1725,7 @@ index 0000000000..bafefd4318 +@ const int16_t *res, [r1] +@ ptrdiff_t stride) [r2] + -+function ff_hevc_add_residual_8x8_v_neon_8, export=1 ++function ff_hevc_rpi_add_residual_8x8_v_neon_8, export=1 + mov r12, #4 + vdup.16 q15, r3 +1: @@ -1655,7 +1755,7 @@ index 0000000000..bafefd4318 +@ const int16_t *res, [r1] +@ ptrdiff_t stride) [r2] + -+function ff_hevc_add_residual_16x16_v_neon_8, export=1 ++function ff_hevc_rpi_add_residual_16x16_v_neon_8, export=1 + mov r12, #16 + vdup.16 q15, r3 +1: @@ -1685,7 +1785,7 @@ index 0000000000..bafefd4318 +@ const int16_t *res, [r1] +@ ptrdiff_t stride) [r2] + -+function ff_hevc_add_residual_4x4_c_neon_8, export=1 ++function ff_hevc_rpi_add_residual_4x4_c_neon_8, export=1 + vld1.8 {d16}, [r0, :64], r2 + vld1.8 {d17}, [r0, :64], r2 + vld1.8 {d18}, [r0, :64], r2 @@ -1718,7 +1818,7 @@ index 0000000000..bafefd4318 +@ const int16_t *res, [r1] +@ ptrdiff_t stride) [r2] + -+function ff_hevc_add_residual_8x8_c_neon_8, export=1 ++function ff_hevc_rpi_add_residual_8x8_c_neon_8, export=1 + mov r12, #8 + add r3, r1, #(8*8*2) @ Offset to V 
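+@ Loop below: 8 rows of interleaved U/V. The U residuals stream in
+@ from r1 while the V residuals sit a fixed 8*8*2 = 128 bytes further
+@ on (r3), and each row is written back with saturating narrowing.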
+1: @@ -1742,7 +1842,7 @@ index 0000000000..bafefd4318 +@ const int16_t *res, [r1] +@ ptrdiff_t stride) [r2] + -+function ff_hevc_add_residual_16x16_c_neon_8, export=1 ++function ff_hevc_rpi_add_residual_16x16_c_neon_8, export=1 + mov r12, #16 + add r3, r1, #(16*16*2) @ Offset to V +1: @@ -1770,41 +1870,44 @@ index 0000000000..bafefd4318 +@ 32x32 chroma never occurs so NIF + +@ ============================================================================ -diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S -index 166bddb104..15c4329cdb 100644 ---- a/libavcodec/arm/hevcdsp_deblock_neon.S -+++ b/libavcodec/arm/hevcdsp_deblock_neon.S -@@ -15,7 +15,7 @@ - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software -- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +diff --git a/libavcodec/arm/rpi_hevcdsp_deblock_neon.S b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S +new file mode 100644 +index 0000000000..d691cda836 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S +@@ -0,0 +1,1483 @@ ++/* ++ * Copyright (c) 2014 Seppo Tomperi ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1 - */ - - -@@ -24,70 +24,238 @@ - - .macro hevc_loop_filter_chroma_start - ldr r12, [r2] -- ldr r3, [r2, #4] -- add r2, r3, r12 -- cmp r2, #0 ++ */ ++ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++.macro hevc_loop_filter_chroma_start ++ ldr r12, [r2] + ldr r2, [r2, #4] + orrs r2, r12, r2, lsl #16 - it eq - bxeq lr - .endm - --.macro hevc_loop_filter_chroma_body -- vsubl.u8 q3, d4, d2 -- vsubl.u8 q11, d18, d19 -- vshl.i16 q3, #2 -- vadd.i16 q11, q3 -- vdup.16 d0, r12 -- vdup.16 d1, r3 -- vrshr.s16 q11, q11, #3 -- vneg.s16 q12, q0 ++ it eq ++ bxeq lr ++.endm ++ +@ Uses: d2, d4, d18, d19 +@ Returns: d2, d4 +@ Modifies: d0-d7, d22-d25, r12 @@ -1886,13 +1989,7 @@ index 166bddb104..15c4329cdb 100644 + @ r2[0:7] -> d4.16 (all), r2[8:15] -> d5.16(all) + vrshr.s16 q0, #3 + vdup.16 d6, r2 - vmovl.u8 q2, d4 -- vmin.s16 q11, q11, q0 -- vmax.s16 q11, q11, q12 -- vaddw.u8 q1, q11, d2 -- vsub.i16 q2, q11 -- vqmovun.s16 d2, q1 -- vqmovun.s16 d4, q2 ++ vmovl.u8 q2, d4 + vmovl.u8 q3, d6 + vuzp.16 d4, d5 + vrshr.s16 q1, #3 @@ -1916,8 +2013,8 @@ index 166bddb104..15c4329cdb 100644 + + vqmovun.s16 \Q0u, q2 + vqmovun.s16 \Q0v, q3 - .endm - ++.endm ++ +@ Preserves r12 +@ Clobbers r2 +.macro hevc_loop_filter_uv_body2_16 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v, bit_depth @@ -1967,27 +2064,14 @@ index 166bddb104..15c4329cdb 100644 + + + - .macro hevc_loop_filter_luma_start - ldr r12, [r3] - ldr r3, [r3, #4] -- lsl r3, #16 -- orr r3, r12 -- cmp r3, #0 ++.macro hevc_loop_filter_luma_start ++ ldr r12, [r3] ++ ldr r3, [r3, #4] + orrs r3, r12, r3, lsl #16 - it eq - 
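+@ The two tc values were just OR-ed into one flag-setting word: if
+@ neither 4-pixel segment has a non-zero tc there is nothing to do,
+@ and the conditional return below exits before any loads happen.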
bxeq lr -- lsr r3, #16 - .endm - --.macro hevc_loop_filter_luma_body -- vmovl.u8 q8, d16 -- vmovl.u8 q9, d18 -- vmovl.u8 q10, d20 -- vmovl.u8 q11, d22 -- vmovl.u8 q12, d24 -- vmovl.u8 q13, d26 -- vmovl.u8 q14, d28 -- vmovl.u8 q15, d30 ++ it eq ++ bxeq lr ++.endm ++ +@ Uses: r2, r3, r12 +@ Modifies: r5, r6, r7, r8, r9 + @@ -2009,7 +2093,7 @@ index 166bddb104..15c4329cdb 100644 +@ r10[16:23] no_q[0] +@ r10[24:31] no_q[1] + - ++ +.macro m_filter_luma bit_depth +.if \bit_depth == 8 + vmovl.u8 q15, d23 @@ -2021,78 +2105,225 @@ index 166bddb104..15c4329cdb 100644 + vmovl.u8 q9, d17 + vmovl.u8 q8, d16 +.endif - vadd.i16 q7, q9, q11 ++ vadd.i16 q7, q9, q11 +.if \bit_depth > 8 + lsl r2, r2, #(\bit_depth - 8) +.endif - vadd.i16 q6, q14, q12 ++ vadd.i16 q6, q14, q12 +.if \bit_depth > 8 + lsl r3, r3, #(\bit_depth - 8) +.endif - vsub.i16 q7, q10 ++ vsub.i16 q7, q10 + ldr r5, [sp, #96] @ Bolt no_x values together into r10 - vsub.i16 q6, q13 - vabd.s16 q7, q7, q10 - vabd.s16 q6, q6, q13 -- ++ vsub.i16 q6, q13 ++ vabd.s16 q7, q7, q10 ++ vabd.s16 q6, q6, q13 + ldrh r10, [r5] - - vdup.16 q0, r2 - vmov q4, q7 - vmov q5, q6 -- vdup.16 d4, r12 ++ ++ vdup.16 q0, r2 ++ vmov q4, q7 ++ vmov q5, q6 + ldr r5, [sp, #100] + vdup.16 d4, r3 + lsr r3, r3, #16 - vtrn.16 q7, q4 ++ vtrn.16 q7, q4 + ldrh r5, [r5] - vtrn.16 q6, q5 - - vshl.u64 q7, #32 - vshr.u64 q4, #32 - vshl.u64 q6, #32 ++ vtrn.16 q6, q5 ++ ++ vshl.u64 q7, #32 ++ vshr.u64 q4, #32 ++ vshl.u64 q6, #32 + orr r10, r10, r5, lsl #16 - vshr.u64 q5, #32 - vshr.u64 q7, #32 - vshr.u64 q6, #32 -@@ -152,7 +320,7 @@ - - and r9, r8, r7 - cmp r9, #0 -- beq weakfilter_\@ ++ vshr.u64 q5, #32 ++ vshr.u64 q7, #32 ++ vshr.u64 q6, #32 ++ vshl.u64 q5, #32 ++ vshl.u64 q4, #32 ++ vorr q6, q5 ++ vorr q7, q4 ++ vdup.16 d5, r3 ++ vadd.i16 q5, q7, q6 ++ ++ vmov q4, q5 ++ vmov q3, q5 ++ vtrn.32 q3, q4 ++ ++ vadd.i16 q4, q3 ++ ++ vshl.s16 q5, q5, #1 ++ vcgt.s16 q3, q0, q4 ++ ++ vmovn.i16 d6, q3 ++ vshr.s16 q1, q0, #2 ++ vmovn.i16 d6, q3 ++ vcgt.s16 q5, q1, q5 ++ vmov r7, s12 ++ cmp r7, #0 ++ beq bypasswrite ++ ++ vpadd.i32 d0, d14, d12 ++ vpadd.i32 d1, d15, d13 ++ vmov q4, q2 ++ vshl.s16 q2, #2 ++ vshr.s16 q1, q1, #1 ++ vrhadd.s16 q2, q4 ++ ++ vabd.s16 q7, q8, q11 ++ vaba.s16 q7, q15, q12 ++ ++ vmovn.i32 d0, q0 ++ vmov r5, r6, s0, s1 ++ vcgt.s16 q6, q1, q7 ++ vand q5, q5, q6 ++ vabd.s16 q7, q11, q12 ++ vcgt.s16 q6, q2, q7 ++ vand q5, q5, q6 ++ ++ vmov q2, q5 ++ vtrn.s16 q5, q2 ++ vshr.u64 q2, #32 ++ vshl.u64 q5, #32 ++ vshl.u64 q2, #32 ++ vshr.u64 q5, #32 ++ vorr q5, q2 ++ ++ vmov q2, q5 ++ vshl.i16 q7, q4, #1 ++ vtrn.32 q2, q5 ++ vand q5, q2 ++ vneg.s16 q6, q7 ++ vmovn.i16 d4, q5 ++ vmovn.i16 d4, q2 ++ vmov r8, s8 ++ ++ and r9, r8, r7 ++ cmp r9, #0 + beq 1f - - vadd.i16 q2, q11, q12 - vadd.i16 q4, q9, q8 -@@ -210,11 +378,11 @@ - vbit q13, q3, q5 - vbit q14, q2, q5 - --weakfilter_\@: ++ ++ vadd.i16 q2, q11, q12 ++ vadd.i16 q4, q9, q8 ++ vadd.i16 q1, q2, q10 ++ vdup.16 d10, r9 ++ vadd.i16 q0, q1, q9 ++ vshl.i16 q4, #1 ++ lsr r9, #16 ++ vadd.i16 q1, q0 ++ vrshr.s16 q3, q0, #2 ++ vadd.i16 q1, q13 ++ vadd.i16 q4, q0 ++ vsub.i16 q3, q10 ++ vrshr.s16 q1, #3 ++ vrshr.s16 q4, #3 ++ vmax.s16 q3, q6 ++ vsub.i16 q1, q11 ++ vsub.i16 q4, q9 ++ vmin.s16 q3, q7 ++ vmax.s16 q4, q6 ++ vmax.s16 q1, q6 ++ vadd.i16 q3, q10 ++ vmin.s16 q4, q7 ++ vmin.s16 q1, q7 ++ vdup.16 d11, r9 ++ vadd.i16 q4, q9 ++ vadd.i16 q1, q11 ++ vbit q9, q4, q5 ++ vadd.i16 q4, q2, q13 ++ vbit q11, q1, q5 ++ vadd.i16 q0, q4, q14 ++ vadd.i16 q2, q15, q14 ++ vadd.i16 q4, q0 ++ ++ vshl.i16 q2, #1 ++ vadd.i16 q4, q10 ++ vbit q10, q3, q5 
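+@ Still inside the strong-filter branch: each candidate value below is
+@ rounded (vrshr), clamped to +/- 2*tc around the original sample, and
+@ merged into the row registers under the decision mask held in q5.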
++ vrshr.s16 q4, #3 ++ vadd.i16 q2, q0 ++ vrshr.s16 q3, q0, #2 ++ vsub.i16 q4, q12 ++ vrshr.s16 q2, #3 ++ vsub.i16 q3, q13 ++ vmax.s16 q4, q6 ++ vsub.i16 q2, q14 ++ vmax.s16 q3, q6 ++ vmin.s16 q4, q7 ++ vmax.s16 q2, q6 ++ vmin.s16 q3, q7 ++ vadd.i16 q4, q12 ++ vmin.s16 q2, q7 ++ vadd.i16 q3, q13 ++ vbit q12, q4, q5 ++ vadd.i16 q2, q14 ++ vbit q13, q3, q5 ++ vbit q14, q2, q5 ++ +1: - mvn r8, r8 - and r9, r8, r7 - cmp r9, #0 -- beq ready_\@ ++ mvn r8, r8 ++ and r9, r8, r7 ++ cmp r9, #0 + beq 2f - - vdup.16 q4, r2 - -@@ -275,111 +443,1041 @@ weakfilter_\@: - vbit q11, q0, q5 - vbit q12, q4, q5 - --ready_\@: ++ ++ vdup.16 q4, r2 ++ ++ vdup.16 d10, r9 ++ lsr r9, #16 ++ vmov q1, q4 ++ vdup.16 d11, r9 ++ vshr.s16 q1, #1 ++ vsub.i16 q2, q12, q11 ++ vadd.i16 q4, q1 ++ vshl.s16 q0, q2, #3 ++ vshr.s16 q4, #3 ++ vadd.i16 q2, q0 ++ vsub.i16 q0, q13, q10 ++ vsub.i16 q2, q0 ++ vshl.i16 q0, q0, #1 ++ vsub.i16 q2, q0 ++ vshl.s16 q1, q7, 2 ++ vrshr.s16 q2, q2, #4 ++ vadd.i16 q1, q7 ++ vabs.s16 q3, q2 ++ vshr.s16 q6, q6, #1 ++ vcgt.s16 q1, q1, q3 ++ vand q5, q1 ++ vshr.s16 q7, q7, #1 ++ vmax.s16 q2, q2, q6 ++ vmin.s16 q2, q2, q7 ++ ++ vshr.s16 q7, q7, #1 ++ vrhadd.s16 q3, q9, q11 ++ vneg.s16 q6, q7 ++ vsub.s16 q3, q10 ++ vdup.16 d2, r5 ++ vhadd.s16 q3, q2 ++ vdup.16 d3, r6 ++ vmax.s16 q3, q3, q6 ++ vcgt.s16 q1, q4, q1 ++ vmin.s16 q3, q3, q7 ++ vand q1, q5 ++ vadd.i16 q3, q10 ++ lsr r5, #16 ++ lsr r6, #16 ++ vbit q10, q3, q1 ++ ++ vrhadd.s16 q3, q14, q12 ++ vdup.16 d2, r5 ++ vsub.s16 q3, q13 ++ vdup.16 d3, r6 ++ vhsub.s16 q3, q2 ++ vcgt.s16 q1, q4, q1 ++ vmax.s16 q3, q3, q6 ++ vand q1, q5 ++ vmin.s16 q3, q3, q7 ++ vadd.i16 q3, q13 ++ vbit q13, q3, q1 ++ vadd.i16 q0, q11, q2 ++ vsub.i16 q4, q12, q2 ++ vbit q11, q0, q5 ++ vbit q12, q4, q5 ++ +2: +.if \bit_depth == 8 - vqmovun.s16 d16, q8 -- vqmovun.s16 d18, q9 -- vqmovun.s16 d20, q10 -- vqmovun.s16 d22, q11 -- vqmovun.s16 d24, q12 -- vqmovun.s16 d26, q13 -- vqmovun.s16 d28, q14 -- vqmovun.s16 d30, q15 ++ vqmovun.s16 d16, q8 + cmp r10, #0 + vqmovun.s16 d17, q9 + vqmovun.s16 d18, q10 @@ -2121,14 +2352,14 @@ index 166bddb104..15c4329cdb 100644 + vmin.s16 q14, q1 +.endif + mov pc, lr - .endm - ++.endm ++ +function hevc_loop_filter_luma_body + m_filter_luma 8 +endfunc + -+@ ff_hevc_v_loop_filter_luma2_neon(src (r0), stride (r1), beta (r2), tc (r3), np_p (sp[0]), no_q (sp[4]), src2 (sp[8])) -+function ff_hevc_v_loop_filter_luma2_neon_8, export=1 ++@ ff_hevc_rpi_v_loop_filter_luma2_neon(src (r0), stride (r1), beta (r2), tc (r3), np_p (sp[0]), no_q (sp[4]), src2 (sp[8])) ++function ff_hevc_rpi_v_loop_filter_luma2_neon_8, export=1 + hevc_loop_filter_luma_start + push {r4-r10,lr} @ 8 regs = 32 bytes + @@ -2137,7 +2368,7 @@ index 166bddb104..15c4329cdb 100644 +endfunc + + -+@ void ff_hevc_v_loop_filter_luma_neon( ++@ void ff_hevc_rpi_v_loop_filter_luma_neon( +@ uint8_t *_pix, [r0] +@ ptrdiff_t _stride, [r1] +@ int _beta, [r2] @@ -2146,35 +2377,13 @@ index 166bddb104..15c4329cdb 100644 +@ uint8_t *_no_q) [sp+4] + + - function ff_hevc_v_loop_filter_luma_neon, export=1 - hevc_loop_filter_luma_start -- push {r5-r11} ++function ff_hevc_rpi_v_loop_filter_luma_neon, export=1 ++ hevc_loop_filter_luma_start + push {r4-r10,lr} + + sub r4, r0, #4 +v_loop_luma_common: - vpush {d8-d15} -- sub r0, #4 -- vld1.8 {d16}, [r0], r1 -- vld1.8 {d18}, [r0], r1 -- vld1.8 {d20}, [r0], r1 -- vld1.8 {d22}, [r0], r1 -- vld1.8 {d24}, [r0], r1 -- vld1.8 {d26}, [r0], r1 -- vld1.8 {d28}, [r0], r1 -- vld1.8 {d30}, [r0], r1 -- sub r0, r0, r1, lsl #3 -- transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30 
-- hevc_loop_filter_luma_body -- transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30 -- vst1.8 {d16}, [r0], r1 -- vst1.8 {d18}, [r0], r1 -- vst1.8 {d20}, [r0], r1 -- vst1.8 {d22}, [r0], r1 -- vst1.8 {d24}, [r0], r1 -- vst1.8 {d26}, [r0], r1 -- vst1.8 {d28}, [r0], r1 -- vst1.8 {d30}, [r0] ++ vpush {d8-d15} + + @ Uses slightly fewer instructions to do laned loads than unlaned + @ and transpose. This also means that we can use the same code for @@ -2242,12 +2451,10 @@ index 166bddb104..15c4329cdb 100644 + vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32] +1: +bypasswrite: - vpop {d8-d15} -- pop {r5-r11} -- bx lr ++ vpop {d8-d15} + pop {r4-r10,pc} - endfunc - ++endfunc ++ +.macro m_filter_v_luma_common_16 bit_depth + vpush {d8-d15} + @@ -2332,42 +2539,25 @@ index 166bddb104..15c4329cdb 100644 +@ +@ Src should always be on 8 byte boundry & all in the same slice + - function ff_hevc_h_loop_filter_luma_neon, export=1 - hevc_loop_filter_luma_start -- push {r5-r11} ++function ff_hevc_rpi_h_loop_filter_luma_neon, export=1 ++ hevc_loop_filter_luma_start + push {r4-r10,lr} + - vpush {d8-d15} - sub r0, r0, r1, lsl #2 ++ vpush {d8-d15} ++ sub r0, r0, r1, lsl #2 + - vld1.8 {d16}, [r0], r1 ++ vld1.8 {d16}, [r0], r1 + vld1.8 {d17}, [r0], r1 - vld1.8 {d18}, [r0], r1 ++ vld1.8 {d18}, [r0], r1 + vld1.8 {d19}, [r0], r1 - vld1.8 {d20}, [r0], r1 ++ vld1.8 {d20}, [r0], r1 + vld1.8 {d21}, [r0], r1 - vld1.8 {d22}, [r0], r1 -- vld1.8 {d24}, [r0], r1 -- vld1.8 {d26}, [r0], r1 -- vld1.8 {d28}, [r0], r1 -- vld1.8 {d30}, [r0], r1 -- sub r0, r0, r1, lsl #3 -- add r0, r1 -- hevc_loop_filter_luma_body -- vst1.8 {d18}, [r0], r1 -- vst1.8 {d20}, [r0], r1 -- vst1.8 {d22}, [r0], r1 -- vst1.8 {d24}, [r0], r1 -- vst1.8 {d26}, [r0], r1 -- vst1.8 {d28}, [r0] --bypasswrite: ++ vld1.8 {d22}, [r0], r1 + vld1.8 {d23}, [r0] + + bl hevc_loop_filter_luma_body + - vpop {d8-d15} -- pop {r5-r11} -- bx lr ++ vpop {d8-d15} + + neg r1, r1 + add r0, r0, r1 @@ -2497,13 +2687,13 @@ index 166bddb104..15c4329cdb 100644 +.endm + + -+@ void ff_hevc_h_loop_filter_uv_neon(uint8_t * src_r, // r0 ++@ void ff_hevc_rpi_h_loop_filter_uv_neon(uint8_t * src_r, // r0 +@ unsigned int stride, // r1 +@ uint32_t tc4, // r2 +@ unsigned int no_f); // r3 +@ +@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] -+function ff_hevc_h_loop_filter_uv_neon_8, export=1 ++function ff_hevc_rpi_h_loop_filter_uv_neon_8, export=1 + sub r0, r0, r1, lsl #1 + vld2.8 {d16,d17}, [r0], r1 + vld2.8 {d18,d19}, [r0], r1 @@ -2543,7 +2733,7 @@ index 166bddb104..15c4329cdb 100644 +endfunc + + -+@ void ff_hevc_h_loop_filter_uv_neon_10(uint8_t * src_r, // r0 ++@ void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src_r, // r0 +@ unsigned int stride, // r1 +@ uint32_t tc4, // r2 +@ unsigned int no_f); // r3 @@ -2593,7 +2783,7 @@ index 166bddb104..15c4329cdb 100644 +.endm + + -+@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 ++@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 +@ unsigned int stride, // r1 +@ uint32_t tc4, // r2 +@ uint8_t * src_l, // r3 @@ -2601,7 +2791,7 @@ index 166bddb104..15c4329cdb 100644 +@ +@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] + -+function ff_hevc_v_loop_filter_uv2_neon_8, export=1 ++function ff_hevc_rpi_v_loop_filter_uv2_neon_8, export=1 + vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3], r1 + vld4.8 {d20[0], d21[0], d22[0], d23[0]}, [r0], r1 + sub r12, r0, r3 @@ -2681,10 +2871,10 @@ index 166bddb104..15c4329cdb 100644 + vst2.8 {d18[1], d19[1]}, [r2], r1 + vst2.8 {d18[0], d19[0]}, [r2] + bx lr - endfunc - ++endfunc + -+@ void 
ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 ++ ++@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 +@ unsigned int stride, // r1 +@ uint32_t tc4, // r2 +@ uint8_t * src_l, // r3 @@ -2775,8 +2965,8 @@ index 166bddb104..15c4329cdb 100644 + + + - function ff_hevc_v_loop_filter_chroma_neon, export=1 - hevc_loop_filter_chroma_start ++function ff_hevc_rpi_v_loop_filter_chroma_neon, export=1 ++ hevc_loop_filter_chroma_start + + sub r0, #2 + vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r0], r1 @@ -2851,7 +3041,7 @@ index 166bddb104..15c4329cdb 100644 +.macro m_filter_v_chroma_16 bit_depth + hevc_loop_filter_chroma_start + - sub r0, #4 ++ sub r0, #4 + vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r0], r1 + vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0], r1 + vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r0], r1 @@ -2920,40 +3110,24 @@ index 166bddb104..15c4329cdb 100644 +.endm + + -+@ void ff_hevc_h_loop_filter_chroma_neon( ++@ void ff_hevc_rpi_h_loop_filter_chroma_neon( +@ uint8_t *_pix, [r0] +@ ptrdiff_t _stride, [r1] +@ int *_tc, [r2] +@ uint8_t *_no_p, [r3] +@ uint8_t *_no_q); [sp+0] + -+function ff_hevc_h_loop_filter_chroma_neon, export=1 ++function ff_hevc_rpi_h_loop_filter_chroma_neon, export=1 + hevc_loop_filter_chroma_start + sub r0, r0, r1, lsl #1 - vld1.8 {d16}, [r0], r1 - vld1.8 {d17}, [r0], r1 - vld1.8 {d18}, [r0], r1 -- vld1.8 {d2}, [r0], r1 -- vld1.8 {d4}, [r0], r1 -- vld1.8 {d19}, [r0], r1 -- vld1.8 {d20}, [r0], r1 -- vld1.8 {d21}, [r0], r1 -- sub r0, r0, r1, lsl #3 -- transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21 -- hevc_loop_filter_chroma_body -- transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21 -- vst1.8 {d16}, [r0], r1 ++ vld1.8 {d16}, [r0], r1 ++ vld1.8 {d17}, [r0], r1 ++ vld1.8 {d18}, [r0], r1 + vld1.8 {d19}, [r0] + sub r0, r0, r1, lsl #1 + hevc_loop_filter_chroma_body d16, d17, d18, d19 + bne 1f @ Partial write - vst1.8 {d17}, [r0], r1 -- vst1.8 {d18}, [r0], r1 -- vst1.8 {d2}, [r0], r1 -- vst1.8 {d4}, [r0], r1 -- vst1.8 {d19}, [r0], r1 -- vst1.8 {d20}, [r0], r1 -- vst1.8 {d21}, [r0] ++ vst1.8 {d17}, [r0], r1 + vst1.8 {d18}, [r0] + bx lr +1: @@ -2974,25 +3148,17 @@ index 166bddb104..15c4329cdb 100644 + it eq + streq r3, [r0, #4] + - bx lr - endfunc - --function ff_hevc_h_loop_filter_chroma_neon, export=1 ++ bx lr ++endfunc ++ +.macro m_filter_h_chroma_16 bit_depth - hevc_loop_filter_chroma_start - sub r0, r0, r1, lsl #1 -- vld1.8 {d18}, [r0], r1 -- vld1.8 {d2}, [r0], r1 -- vld1.8 {d4}, [r0], r1 -- vld1.8 {d19}, [r0] ++ hevc_loop_filter_chroma_start ++ sub r0, r0, r1, lsl #1 + vld1.16 {q8}, [r0], r1 + vld1.16 {q9}, [r0], r1 + vld1.16 {q10}, [r0], r1 + vld1.16 {q11}, [r0] - sub r0, r0, r1, lsl #1 -- hevc_loop_filter_chroma_body -- vst1.8 {d2}, [r0], r1 -- vst1.8 {d4}, [r0] ++ sub r0, r0, r1, lsl #1 + hevc_loop_filter_chroma_body_16 q8, q9, q10, q11, \bit_depth + bne 1f @ Partial write + vst1.16 {q9}, [r0], r1 @@ -3020,15 +3186,15 @@ index 166bddb104..15c4329cdb 100644 + add r0, #8 + vst1.16 {d21}, [r0] + - bx lr ++ bx lr +.endm + + -+/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_i ++/* ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, int in_i + * int *curr_rpl0, int *curr_ + * MvField *curr, MvField *ne + */ -+function ff_hevc_deblocking_boundary_strengths_neon, export=1 ++function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1 + add ip, sp, #4*4 + push {a2-a4,v1-v8,lr} + ldmia ip, {v5-v7} @@ -3156,11 +3322,11 @@ index 166bddb104..15c4329cdb 100644 + m_filter_luma 10 +endfunc + -+function 
ff_hevc_h_loop_filter_luma_neon_10, export=1 ++function ff_hevc_rpi_h_loop_filter_luma_neon_10, export=1 + m_filter_h_luma_16 10 +endfunc + -+function ff_hevc_v_loop_filter_luma2_neon_10, export=1 ++function ff_hevc_rpi_v_loop_filter_luma2_neon_10, export=1 + hevc_loop_filter_luma_start + push {r4-r10,lr} @ 8 regs = 32 bytes + @@ -3168,7 +3334,7 @@ index 166bddb104..15c4329cdb 100644 + b v_loop_luma_common_10 +endfunc + -+function ff_hevc_v_loop_filter_luma_neon_10, export=1 ++function ff_hevc_rpi_v_loop_filter_luma_neon_10, export=1 + hevc_loop_filter_luma_start + push {r4-r10,lr} + @@ -3177,27 +3343,27 @@ index 166bddb104..15c4329cdb 100644 + m_filter_v_luma_common_16 10 +endfunc + -+function ff_hevc_h_loop_filter_uv_neon_10, export=1 ++function ff_hevc_rpi_h_loop_filter_uv_neon_10, export=1 + m_filter_h_uv_16 10 +endfunc + -+function ff_hevc_v_loop_filter_uv2_neon_10, export=1 ++function ff_hevc_rpi_v_loop_filter_uv2_neon_10, export=1 + m_filter_v_uv2_16 10 +endfunc + -+function ff_hevc_h_loop_filter_chroma_neon_10, export=1 ++function ff_hevc_rpi_h_loop_filter_chroma_neon_10, export=1 + m_filter_h_chroma_16 10 +endfunc + -+function ff_hevc_v_loop_filter_chroma_neon_10, export=1 ++function ff_hevc_rpi_v_loop_filter_chroma_neon_10, export=1 + m_filter_v_chroma_16 10 - endfunc ++endfunc + -diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S +diff --git a/libavcodec/arm/rpi_hevcdsp_epel_neon.S b/libavcodec/arm/rpi_hevcdsp_epel_neon.S new file mode 100644 -index 0000000000..00eab9eeee +index 0000000000..acc6911091 --- /dev/null -+++ b/libavcodec/arm/hevcdsp_epel_neon.S ++++ b/libavcodec/arm/rpi_hevcdsp_epel_neon.S @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2014 - 2015 Seppo Tomperi @@ -3303,7 +3469,7 @@ index 0000000000..00eab9eeee + vqshrn.s32 d6, q7, #6 +.endm + -+function ff_hevc_put_epel_h_neon_8, export=1 ++function ff_hevc_rpi_put_epel_h_neon_8, export=1 + push {r4-r7} + mov r4, MAX_PB_SIZE + ldr r7, [sp, #16] // mx @@ -3365,7 +3531,7 @@ index 0000000000..00eab9eeee + bx lr +endfunc + -+function ff_hevc_put_epel_v_neon_8, export=1 ++function ff_hevc_rpi_put_epel_v_neon_8, export=1 + push {r4-r7} + mov r4, MAX_PB_SIZE + ldr r7, [sp, #20] // my @@ -3441,7 +3607,7 @@ index 0000000000..00eab9eeee + bx lr +endfunc + -+function ff_hevc_put_epel_hv_neon_8, export=1 ++function ff_hevc_rpi_put_epel_hv_neon_8, export=1 + push {r4-r7} + mov r4, MAX_PB_SIZE + ldr r6, [sp, #16] // mx @@ -3536,104 +3702,113 @@ index 0000000000..00eab9eeee + .byte 4, 28, 46, 6 + .byte 2, 16, 54, 4 + .byte 2, 10, 58, 2 -diff --git a/libavcodec/arm/hevcdsp_idct_neon.S b/libavcodec/arm/hevcdsp_idct_neon.S -index e39d00634b..ee2111f9b2 100644 ---- a/libavcodec/arm/hevcdsp_idct_neon.S -+++ b/libavcodec/arm/hevcdsp_idct_neon.S -@@ -21,82 +21,6 @@ - #include "libavutil/arm/asm.S" - #include "neon.S" - --function ff_hevc_idct_4x4_dc_neon_8, export=1 -- ldrsh r1, [r0] -- ldr r2, =0x20 -- add r1, #1 -- asr r1, #1 -- add r1, r2 -- asr r1, #6 -- vdup.16 q0, r1 -- vdup.16 q1, r1 -- vst1.16 {q0, q1}, [r0] -- bx lr --endfunc -- --function ff_hevc_idct_8x8_dc_neon_8, export=1 -- ldrsh r1, [r0] -- ldr r2, =0x20 -- add r1, #1 -- asr r1, #1 -- add r1, r2 -- asr r1, #6 -- vdup.16 q8, r1 -- vdup.16 q9, r1 -- vmov.16 q10, q8 -- vmov.16 q11, q8 -- vmov.16 q12, q8 -- vmov.16 q13, q8 -- vmov.16 q14, q8 -- vmov.16 q15, q8 -- vstm r0, {q8-q15} -- bx lr --endfunc -- --function ff_hevc_idct_16x16_dc_neon_8, export=1 -- ldrsh r1, [r0] -- ldr r2, =0x20 -- add r1, #1 -- asr r1, #1 -- add r1, r2 -- asr r1, #6 -- 
vdup.16 q8, r1 -- vdup.16 q9, r1 -- vmov.16 q10, q8 -- vmov.16 q11, q8 -- vmov.16 q12, q8 -- vmov.16 q13, q8 -- vmov.16 q14, q8 -- vmov.16 q15, q8 -- vstm r0!, {q8-q15} -- vstm r0!, {q8-q15} -- vstm r0!, {q8-q15} -- vstm r0, {q8-q15} -- bx lr --endfunc -- --function ff_hevc_idct_32x32_dc_neon_8, export=1 -- ldrsh r1, [r0] -- ldr r2, =0x20 -- add r1, #1 -- asr r1, #1 -- add r1, r2 -- asr r1, #6 -- mov r3, #16 -- vdup.16 q8, r1 -- vdup.16 q9, r1 -- vmov.16 q10, q8 -- vmov.16 q11, q8 -- vmov.16 q12, q8 -- vmov.16 q13, q8 -- vmov.16 q14, q8 -- vmov.16 q15, q8 --1: subs r3, #1 -- vstm r0!, {q8-q15} -- bne 1b -- bx lr --endfunc -- - function ff_hevc_add_residual_4x4_neon_8, export=1 - vldm r1, {q0-q1} - vld1.32 d4[0], [r0], r2 -@@ -168,6 +92,131 @@ function ff_hevc_add_residual_32x32_neon_8, export=1 - bx lr - endfunc - +diff --git a/libavcodec/arm/rpi_hevcdsp_idct_neon.S b/libavcodec/arm/rpi_hevcdsp_idct_neon.S +new file mode 100644 +index 0000000000..cd79460984 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_idct_neon.S +@@ -0,0 +1,379 @@ ++/* ++ * Copyright (c) 2014 Seppo Tomperi ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++function ff_hevc_rpi_add_residual_4x4_neon_8, export=1 ++ vldm r1, {q0-q1} ++ vld1.32 d4[0], [r0], r2 ++ vld1.32 d4[1], [r0], r2 ++ vld1.32 d5[0], [r0], r2 ++ vld1.32 d5[1], [r0], r2 ++ sub r0, r0, r2, lsl #2 ++ vmovl.u8 q8, d4 ++ vmovl.u8 q9, d5 ++ vqadd.s16 q0, q0, q8 ++ vqadd.s16 q1, q1, q9 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst1.32 d0[0], [r0], r2 ++ vst1.32 d0[1], [r0], r2 ++ vst1.32 d1[0], [r0], r2 ++ vst1.32 d1[1], [r0], r2 ++ bx lr ++endfunc ++ ++function ff_hevc_rpi_add_residual_8x8_neon_8, export=1 ++ mov r3, #8 ++1: subs r3, #1 ++ vld1.16 {q0}, [r1]! ++ vld1.8 d16, [r0] ++ vmovl.u8 q8, d16 ++ vqadd.s16 q0, q8 ++ vqmovun.s16 d0, q0 ++ vst1.32 d0, [r0], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++function ff_hevc_rpi_add_residual_16x16_neon_8, export=1 ++ mov r3, #16 ++1: subs r3, #1 ++ vld1.16 {q0, q1}, [r1]! 
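+@ One row per pass: the 16 coefficients are already in q0/q1; the
+@ destination row is loaded next, widened, added with saturation and
+@ narrowed back to bytes before the strided store.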
++ vld1.8 {q8}, [r0] ++ vmovl.u8 q9, d16 ++ vmovl.u8 q10, d17 ++ vqadd.s16 q0, q9 ++ vqadd.s16 q1, q10 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst1.8 {q0}, [r0], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++function ff_hevc_rpi_add_residual_32x32_neon_8, export=1 ++ mov r3, #32 ++1: subs r3, #1 ++ vldm r1!, {q0-q3} ++ vld1.8 {q8, q9}, [r0] ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqadd.s16 q2, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vqmovun.s16 d2, q2 ++ vqmovun.s16 d3, q3 ++ vst1.8 {q0, q1}, [r0], r2 ++ bne 1b ++ bx lr ++endfunc ++ + -+@ ff_hevc_add_residual_4x4_dc_neon_8( ++@ ff_hevc_rpi_add_residual_4x4_dc_neon_8( +@ uint8_t * dst, // [r0] +@ unsigned int stride, // [r1] +@ int dc) // [r2] + -+function ff_hevc_add_residual_4x4_dc_neon_8, export=1 ++function ff_hevc_rpi_add_residual_4x4_dc_neon_8, export=1 + vdup.16 q15, r2 + + vld1.32 d4[0], [r0], r1 @@ -3653,23 +3828,23 @@ index e39d00634b..ee2111f9b2 100644 +endfunc + + -+@ ff_hevc_add_residual_4x4_dc_c_neon_8( ++@ ff_hevc_rpi_add_residual_4x4_dc_c_neon_8( +@ uint8_t * dst, // [r0] +@ unsigned int stride, // [r1] +@ int dc) // [r2] + -+function ff_hevc_add_residual_4x4_dc_c_neon_8, export=1 ++function ff_hevc_rpi_add_residual_4x4_dc_c_neon_8, export=1 + vdup.32 q15, r2 + mov r3, #4 + b 1f +endfunc + -+@ ff_hevc_add_residual_8x8_dc_neon_8( ++@ ff_hevc_rpi_add_residual_8x8_dc_neon_8( +@ uint8_t * dst, // [r0] +@ unsigned int stride, // [r1] +@ int dc) // [r2] + -+function ff_hevc_add_residual_8x8_dc_neon_8, export=1 ++function ff_hevc_rpi_add_residual_8x8_dc_neon_8, export=1 + vdup.16 q15, r2 + mov r3, #8 + @@ -3683,23 +3858,23 @@ index e39d00634b..ee2111f9b2 100644 +endfunc + + -+@ ff_hevc_add_residual_8x8_dc_c_neon_8( ++@ ff_hevc_rpi_add_residual_8x8_dc_c_neon_8( +@ uint8_t * dst, // [r0] +@ unsigned int stride, // [r1] +@ int dc) // [r2] + -+function ff_hevc_add_residual_8x8_dc_c_neon_8, export=1 ++function ff_hevc_rpi_add_residual_8x8_dc_c_neon_8, export=1 + vdup.32 q15, r2 + mov r3, #8 + b 1f +endfunc + -+@ ff_hevc_add_residual_16x16_dc_neon_8( ++@ ff_hevc_rpi_add_residual_16x16_dc_neon_8( +@ uint8_t * dst, // [r0] +@ unsigned int stride, // [r1] +@ int dc) // [r2] + -+function ff_hevc_add_residual_16x16_dc_neon_8, export=1 ++function ff_hevc_rpi_add_residual_16x16_dc_neon_8, export=1 + vdup.16 q15, r2 + mov r3, #16 + @@ -3715,23 +3890,23 @@ index e39d00634b..ee2111f9b2 100644 +endfunc + + -+@ ff_hevc_add_residual_16x16_dc_c_neon_8( ++@ ff_hevc_rpi_add_residual_16x16_dc_c_neon_8( +@ uint8_t * dst, // [r0] +@ unsigned int stride, // [r1] +@ int dc) // [r2] + -+function ff_hevc_add_residual_16x16_dc_c_neon_8, export=1 ++function ff_hevc_rpi_add_residual_16x16_dc_c_neon_8, export=1 + vdup.32 q15, r2 + mov r3, #16 + b 1f +endfunc + -+@ ff_hevc_add_residual_32x32_dc_neon_8( ++@ ff_hevc_rpi_add_residual_32x32_dc_neon_8( +@ uint8_t * dst, // [r0] +@ unsigned int stride, // [r1] +@ int dc) // [r2] + -+function ff_hevc_add_residual_32x32_dc_neon_8, export=1 ++function ff_hevc_rpi_add_residual_32x32_dc_neon_8, export=1 + vdup.16 q15, r2 + mov r3, #32 + @@ -3752,512 +3927,640 @@ index e39d00634b..ee2111f9b2 100644 + + + - .macro transpose_16b_8x8 r0, r1, r2, r3, r4, r5, r6, r7 - vtrn.64 \r0, \r4 - vtrn.64 \r1, \r5 -@@ -263,55 +312,6 @@ endfunc - vqrshrn.s32 \r3, q3, \shift - .endm - --function ff_hevc_transform_4x4_neon_8, export=1 -- vpush {d8-d15} -- vld1.16 {q14, q15}, [r0] // coeffs -- ldr r3, =0x00240053 // 36 and 83 -- 
vmov.32 d0[0], r3 -- -- tr4_shift d28, d29, d30, d31, #7 -- -- vtrn.16 d28, d29 -- vtrn.16 d30, d31 -- vtrn.32 q14, q15 -- -- tr4_shift d28, d29, d30, d31, #12 -- -- vtrn.16 d28, d29 -- vtrn.16 d30, d31 -- vtrn.32 q14, q15 -- -- vst1.16 {q14, q15}, [r0] -- vpop {d8-d15} -- bx lr --endfunc -- --function ff_hevc_transform_luma_4x4_neon_8, export=1 -- vpush {d8-d15} -- vld1.16 {q14, q15}, [r0] // coeffs -- ldr r3, =0x4a // 74 -- vmov.32 d0[0], r3 -- ldr r3, =0x1d // 29 -- vmov.32 d0[1], r3 -- ldr r3, =0x37 // 55 -- vmov.32 d1[0], r3 -- -- tr4_luma_shift d28, d29, d30, d31, #7 -- -- vtrn.16 d28, d29 -- vtrn.16 d30, d31 -- vtrn.32 q14, q15 -- -- tr4_luma_shift d28, d29, d30, d31, #12 -- -- vtrn.16 d28, d29 -- vtrn.16 d30, d31 -- vtrn.32 q14, q15 -- vst1.16 {q14, q15}, [r0] -- vpop {d8-d15} -- bx lr --endfunc -- - .macro tr8_begin in0, in1, in2, in3 - vmull.s16 q7, \in0, d1[1] // 89 * src1 - vmull.s16 q8, \in0, d1[0] // 75 * src1 -@@ -356,100 +356,6 @@ endfunc - vqrshrn.s32 d8, q5, \shift - .endm - --function ff_hevc_transform_8x8_neon_8, export=1 -- push {r4-r8} -- vpush {d8-d15} -- mov r5, #16 -- -- adr r3, tr4f -- vld1.16 {d0, d1}, [r3] -- -- // left half -- vld1.16 {d24}, [r0], r5 -- vld1.16 {d25}, [r0], r5 -- vld1.16 {d26}, [r0], r5 -- vld1.16 {d27}, [r0], r5 -- vld1.16 {d28}, [r0], r5 -- vld1.16 {d29}, [r0], r5 -- vld1.16 {d30}, [r0], r5 -- vld1.16 {d31}, [r0], r5 -- sub r0, #128 -- tr8_begin d25, d27, d29, d31 -- tr4 d24, d26, d28, d30 -- tr8_end #7 -- vst1.16 {d2}, [r0], r5 -- vst1.16 {d3}, [r0], r5 -- vst1.16 {d4}, [r0], r5 -- vst1.16 {d5}, [r0], r5 -- vst1.16 {d6}, [r0], r5 -- vst1.16 {d7}, [r0], r5 -- vst1.16 {d8}, [r0], r5 -- vst1.16 {d9}, [r0], r5 -- sub r0, #128 -- //skip right half if col_limit in r1 is less than 4 -- cmp r1, #4 -- blt 1f -- //right half -- add r0, #8 -- vld1.16 {d24}, [r0], r5 -- vld1.16 {d25}, [r0], r5 -- vld1.16 {d26}, [r0], r5 -- vld1.16 {d27}, [r0], r5 -- vld1.16 {d28}, [r0], r5 -- vld1.16 {d29}, [r0], r5 -- vld1.16 {d30}, [r0], r5 -- vld1.16 {d31}, [r0], r5 -- sub r0, #128 -- tr8_begin d25, d27, d29, d31 -- tr4 d24, d26, d28, d30 -- tr8_end #7 -- vst1.16 {d2}, [r0], r5 -- vst1.16 {d3}, [r0], r5 -- vst1.16 {d4}, [r0], r5 -- vst1.16 {d5}, [r0], r5 -- vst1.16 {d6}, [r0], r5 -- vst1.16 {d7}, [r0], r5 -- vst1.16 {d8}, [r0], r5 -- vst1.16 {d9}, [r0], r5 -- sub r0, #136 --1: -- // top half -- vldm r0, {q12-q15} // coeffs -- transpose_16b_4x4 d24, d26, d28, d30 -- transpose_16b_4x4 d25, d27, d29, d31 -- tr8_begin d26, d30, d27, d31 -- tr4 d24, d28, d25, d29 -- tr8_end #12 -- transpose_16b_4x4 d2, d3, d4, d5 -- transpose_16b_4x4 d6, d7, d8, d9 -- vswp d7, d5 -- vswp d7, d8 -- vswp d3, d6 -- vswp d6, d4 -- vstm r0!, {q1-q4} -- -- // bottom half -- vldm r0, {q12-q15} // coeffs -- transpose_16b_4x4 d24, d26, d28, d30 -- transpose_16b_4x4 d25, d27, d29, d31 -- tr8_begin d26, d30, d27, d31 -- tr4 d24, d28, d25, d29 -- tr8_end #12 -- transpose_16b_4x4 d2, d3, d4, d5 -- transpose_16b_4x4 d6, d7, d8, d9 -- vswp d7, d5 -- vswp d7, d8 -- vswp d3, d6 -- vswp d6, d4 -- //vstm r0, {q1-q4} -- vst1.16 {q1-q2}, [r0] -- add r0, #32 -- vst1.16 {q3-q4}, [r0] -- sub r0, #32 -- vpop {d8-d15} -- pop {r4-r8} -- bx lr --endfunc - - .align 4 - tr4f: -@@ -463,3 +369,11 @@ tr16: - .word 0x00500046 // 80, d2[2] = 70 - .word 0x0039002b // 57, d2[0] = 43 - .word 0x00190009 // 25, d2[2] = 9 ++.macro transpose_16b_8x8 r0, r1, r2, r3, r4, r5, r6, r7 ++ vtrn.64 \r0, \r4 ++ vtrn.64 \r1, \r5 ++ vtrn.64 \r2, \r6 ++ vtrn.64 \r3, \r7 ++ vtrn.32 \r0, \r2 ++ vtrn.32 \r1, \r3 ++ vtrn.32 \r4, \r6 ++ vtrn.32 
\r5, \r7 ++ vtrn.16 \r0, \r1 ++ vtrn.16 \r2, \r3 ++ vtrn.16 \r4, \r5 ++ vtrn.16 \r6, \r7 ++.endm + -+#define BIT_DEPTH 8 -+#include "hevc_idct_fn_neon.S" ++// in 4 q regs ++// output 8 d regs ++.macro transpose_16b_4x4 r0, r1, r2, r3 ++ vtrn.32 \r0, \r2 ++ vtrn.32 \r1, \r3 ++ vtrn.16 \r0, \r1 ++ vtrn.16 \r2, \r3 ++.endm + -+#undef BIT_DEPTH -+#define BIT_DEPTH 10 -+#include "hevc_idct_fn_neon.S" -+ -diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c -index 1a3912c609..ad625d067a 100644 ---- a/libavcodec/arm/hevcdsp_init_neon.c -+++ b/libavcodec/arm/hevcdsp_init_neon.c -@@ -22,11 +22,41 @@ - #include "libavutil/arm/cpu.h" - #include "libavcodec/hevcdsp.h" - #include "hevcdsp_arm.h" -+#include "libavcodec/avcodec.h" -+#include "libavcodec/bit_depth_template.c" - - void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); - void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); - void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); - void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -+ -+void ff_hevc_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -+void ff_hevc_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -+void ff_hevc_v_loop_filter_chroma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -+void ff_hevc_h_loop_filter_chroma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -+ -+#ifdef RPI -+void ff_hevc_v_loop_filter_luma2_neon_8(uint8_t * _pix_r, -+ unsigned int _stride, unsigned int beta, const int32_t tc[2], -+ const uint8_t no_p[2], const uint8_t no_q[2], -+ uint8_t * _pix_l); -+void ff_hevc_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4, -+ unsigned int no_f); -+void ff_hevc_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4, -+ uint8_t * src_l, -+ unsigned int no_f); ++/* uses registers q2 - q9 for temp values */ ++/* TODO: reorder */ ++.macro tr4_luma_shift r0, r1, r2, r3, shift ++ vaddl.s16 q5, \r0, \r2 // c0 = src0 + src2 ++ vaddl.s16 q2, \r2, \r3 // c1 = src2 + src3 ++ vsubl.s16 q4, \r0, \r3 // c2 = src0 - src3 ++ vmull.s16 q6, \r1, d0[0] // c3 = 74 * src1 ++ ++ vaddl.s16 q7, \r0, \r3 // src0 + src3 ++ vsubw.s16 q7, q7, \r2 // src0 - src2 + src3 ++ vmul.s32 q7, q7, d0[0] // dst2 = 74 * (src0 - src2 + src3) ++ ++ vmul.s32 q8, q5, d0[1] // 29 * c0 ++ vmul.s32 q9, q2, d1[0] // 55 * c1 ++ vadd.s32 q8, q9 // 29 * c0 + 55 * c1 ++ vadd.s32 q8, q6 // dst0 = 29 * c0 + 55 * c1 + c3 ++ ++ vmul.s32 q2, q2, d0[1] // 29 * c1 ++ vmul.s32 q9, q4, d1[0] // 55 * c2 ++ vsub.s32 q9, q2 // 55 * c2 - 29 * c1 ++ vadd.s32 q9, q6 // dst1 = 55 * c2 - 29 * c1 + c3 ++ ++ vmul.s32 q5, q5, d1[0] // 55 * c0 ++ vmul.s32 q4, q4, d0[1] // 29 * c2 ++ vadd.s32 q5, q4 // 55 * c0 + 29 * c2 ++ vsub.s32 q5, q6 // dst3 = 55 * c0 + 29 * c2 - c3 ++ ++ vqrshrn.s32 \r0, q8, \shift ++ vqrshrn.s32 \r1, q9, \shift ++ vqrshrn.s32 \r2, q7, \shift ++ vqrshrn.s32 \r3, q5, \shift ++.endm + -+void ff_hevc_v_loop_filter_luma2_neon_10(uint8_t * _pix_r, -+ unsigned int _stride, unsigned int beta, const int32_t tc[2], -+ const uint8_t no_p[2], const uint8_t no_q[2], -+ uint8_t * _pix_l); 
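A note on the *_dc prototypes in this block: a DC-only transform block collapses to "add one constant and saturate", and the chroma _dc_c forms evidently pack the U and V constants into the two 16-bit halves of the int32_t argument (the assembly splats them with vdup.32 rather than vdup.16). A scalar sketch of the plain 8-bit helper:

    #include <stddef.h>
    #include <stdint.h>

    static inline uint8_t clip_u8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* Scalar model of the 4x4 DC-only residual add declared nearby. */
    static void add_residual_4x4_dc_ref(uint8_t *dst, ptrdiff_t stride, int dc)
    {
        for (int y = 0; y < 4; y++, dst += stride)
            for (int x = 0; x < 4; x++)
                dst[x] = clip_u8(dst[x] + dc);
    }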
-+void ff_hevc_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4, -+ unsigned int no_f); -+void ff_hevc_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4, -+ uint8_t * src_l, -+ unsigned int no_f); -+#endif ++/* uses registers q2 - q6 for temp values */ ++.macro tr4 r0, r1, r2, r3 ++ vmull.s16 q4, \r1, d0[0] // 83 * src1 ++ vmull.s16 q6, \r1, d0[1] // 36 * src1 ++ vshll.s16 q2, \r0, #6 // 64 * src0 ++ vshll.s16 q3, \r2, #6 // 64 * src2 ++ vadd.s32 q5, q2, q3 // 64 * (src0 + src2) e0 ++ vsub.s32 q2, q2, q3 // 64 * (src0 - src2) e1 ++ vmlal.s16 q4, \r3, d0[1] // 83 * src1 + 36 * src3 o0 ++ vmlsl.s16 q6, \r3, d0[0] // 36 * src1 - 83 * src3 o1 ++ ++ vsub.s32 q3, q5, q4 // e0 - o0 ++ vadd.s32 q4, q5, q4 // e0 + o0 ++ vadd.s32 q5, q2, q6 // e1 + o1 ++ vsub.s32 q6, q2, q6 // e1 - o1 ++.endm + - void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit); - void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit); - void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs); -@@ -34,6 +64,15 @@ void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs); - void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs); - void ff_hevc_idct_32x32_dc_neon_8(int16_t *coeffs); - void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs); -+ -+void ff_hevc_transform_4x4_neon_10(int16_t *coeffs, int col_limit); -+void ff_hevc_transform_8x8_neon_10(int16_t *coeffs, int col_limit); -+void ff_hevc_idct_4x4_dc_neon_10(int16_t *coeffs); -+void ff_hevc_idct_8x8_dc_neon_10(int16_t *coeffs); -+void ff_hevc_idct_16x16_dc_neon_10(int16_t *coeffs); -+void ff_hevc_idct_32x32_dc_neon_10(int16_t *coeffs); -+void ff_hevc_transform_luma_4x4_neon_10(int16_t *coeffs); -+ - void ff_hevc_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); - void ff_hevc_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs, -@@ -43,6 +82,157 @@ void ff_hevc_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, - void ff_hevc_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); - -+void ff_hevc_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++.macro tr4_shift r0, r1, r2, r3, shift ++ vmull.s16 q4, \r1, d0[0] // 83 * src1 ++ vmull.s16 q6, \r1, d0[1] // 36 * src1 ++ vshll.s16 q2, \r0, #6 // 64 * src0 ++ vshll.s16 q3, \r2, #6 // 64 * src2 ++ vadd.s32 q5, q2, q3 // 64 * (src0 + src2) e0 ++ vsub.s32 q2, q2, q3 // 64 * (src0 - src2) e1 ++ vmlal.s16 q4, \r3, d0[1] // 83 * src1 + 36 * src3 o0 ++ vmlsl.s16 q6, \r3, d0[0] // 36 * src1 - 83 * src3 o1 ++ ++ vsub.s32 q3, q5, q4 // e0 - o0 ++ vadd.s32 q4, q5, q4 // e0 + o0 ++ vadd.s32 q5, q2, q6 // e1 + o1 ++ vsub.s32 q6, q2, q6 // e1 - o1 ++ ++ vqrshrn.s32 \r0, q4, \shift ++ vqrshrn.s32 \r1, q5, \shift ++ vqrshrn.s32 \r2, q6, \shift ++ vqrshrn.s32 \r3, q3, \shift ++.endm + ++.macro tr8_begin in0, in1, in2, in3 ++ vmull.s16 q7, \in0, d1[1] // 89 * src1 ++ vmull.s16 q8, \in0, d1[0] // 75 * src1 ++ vmull.s16 q9, \in0, d1[3] // 50 * src1 ++ vmull.s16 q10, \in0, d1[2] // 18 * src1 ++ ++ vmlal.s16 q7, \in1, d1[0] // 75 * src3 ++ vmlsl.s16 q8, \in1, d1[2] //-18 * src3 ++ vmlsl.s16 q9, \in1, d1[1] //-89 * src3 ++ vmlsl.s16 q10, \in1, d1[3] //-50 * src3 ++ ++ vmlal.s16 q7, \in2, d1[3] // 50 * src5 ++ vmlsl.s16 q8, \in2, d1[1] //-89 * src5 ++ vmlal.s16 q9, 
\in2, d1[2] // 18 * src5 ++ vmlal.s16 q10, \in2, d1[0] // 75 * src5 ++ ++ vmlal.s16 q7, \in3, d1[2] // 18 * src7 ++ vmlsl.s16 q8, \in3, d1[3] //-50 * src7 ++ vmlal.s16 q9, \in3, d1[0] // 75 * src7 ++ vmlsl.s16 q10, \in3, d1[1] //-89 * src7 ++.endm + -+void ff_hevc_add_residual_4x4_neon_10(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+void ff_hevc_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+void ff_hevc_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); -+void ff_hevc_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs, -+ ptrdiff_t stride); ++.macro tr8_end shift ++ vadd.s32 q1, q4, q7 // e_8[0] + o_8[0], dst[0] ++ vsub.s32 q4, q4, q7 // e_8[0] - o_8[0], dst[7] ++ ++ vadd.s32 q2, q5, q8 // e_8[1] + o_8[1], dst[1] ++ vsub.s32 q5, q5, q8 // e_8[1] - o_8[1], dst[6] ++ ++ vadd.s32 q11, q6, q9 // e_8[2] + o_8[2], dst[2] ++ vsub.s32 q6, q6, q9 // e_8[2] - o_8[2], dst[5] ++ ++ vadd.s32 q12, q3, q10 // e_8[3] + o_8[3], dst[3] ++ vsub.s32 q3, q3, q10 // e_8[3] - o_8[3], dst[4] ++ vqrshrn.s32 d2, q1, \shift ++ vqrshrn.s32 d3, q2, \shift ++ vqrshrn.s32 d4, q11, \shift ++ vqrshrn.s32 d5, q12, \shift ++ vqrshrn.s32 d6, q3, \shift ++ vqrshrn.s32 d7, q6, \shift ++ vqrshrn.s32 d9, q4, \shift ++ vqrshrn.s32 d8, q5, \shift ++.endm + -+void ff_hevc_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); -+void ff_hevc_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); + ++.align 4 ++tr4f: ++.word 0x00240053 // 36 and d1[0] = 83 ++.word 0x00000000 ++tr8f: ++.word 0x0059004b // 89, d0[0] = 75 ++.word 0x00320012 // 50, d0[2] = 18 ++tr16: ++.word 0x005a0057 // 90, d2[0] = 87 ++.word 0x00500046 // 80, d2[2] = 70 ++.word 0x0039002b // 57, d2[0] = 43 ++.word 0x00190009 // 25, d2[2] = 9 + -+#if RPI_HEVC_SAND -+void ff_hevc_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_u); -+void ff_hevc_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_u); -+void ff_hevc_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_u); -+void ff_hevc_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); -+void ff_hevc_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); -+void ff_hevc_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride); -+void ff_hevc_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); -+void ff_hevc_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); -+void ff_hevc_add_residual_16x16_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++#define BIT_DEPTH 8 ++#include "rpi_hevc_idct_fn_neon.S" + ++#undef BIT_DEPTH ++#define BIT_DEPTH 10 ++#include "rpi_hevc_idct_fn_neon.S" + -+void ff_hevc_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void 
ff_hevc_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_v); -+void ff_hevc_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual, -+ ptrdiff_t stride, int dc_u); -+void ff_hevc_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual, +diff --git a/libavcodec/arm/rpi_hevcdsp_init_arm.c b/libavcodec/arm/rpi_hevcdsp_init_arm.c +new file mode 100644 +index 0000000000..109fa98c29 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_init_arm.c +@@ -0,0 +1,32 @@ ++/* ++ * Copyright (c) 2014 Seppo Tomperi ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/attributes.h" ++#include "libavutil/arm/cpu.h" ++#include "libavcodec/rpi_hevcdsp.h" ++#include "rpi_hevcdsp_arm.h" ++ ++av_cold void ff_hevcdsp_rpi_init_arm(HEVCDSPContext *c, const int bit_depth) ++{ ++ int cpu_flags = av_get_cpu_flags(); ++ ++ if (have_neon(cpu_flags)) ++ ff_hevcdsp_rpi_init_neon(c, bit_depth); ++} +diff --git a/libavcodec/arm/rpi_hevcdsp_init_neon.c b/libavcodec/arm/rpi_hevcdsp_init_neon.c +new file mode 100644 +index 0000000000..472d9d75c9 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c +@@ -0,0 +1,652 @@ ++/* ++ * Copyright (c) 2014 Seppo Tomperi ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "config.h" ++#include "libavutil/attributes.h" ++#include "libavutil/arm/cpu.h" ++#include "libavcodec/rpi_hevcdsp.h" ++#include "rpi_hevcdsp_arm.h" ++#include "libavcodec/avcodec.h" ++#include "libavcodec/bit_depth_template.c" ++ ++void ff_hevc_rpi_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_rpi_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_rpi_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_rpi_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++ ++void ff_hevc_rpi_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_rpi_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_rpi_v_loop_filter_chroma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_rpi_h_loop_filter_chroma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++ ++void ff_hevc_rpi_v_loop_filter_luma2_neon_8(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, const int32_t tc[2], ++ const uint8_t no_p[2], const uint8_t no_q[2], ++ uint8_t * _pix_l); ++void ff_hevc_rpi_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4, ++ unsigned int no_f); ++void ff_hevc_rpi_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f); ++ ++void ff_hevc_rpi_v_loop_filter_luma2_neon_10(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, const int32_t tc[2], ++ const uint8_t no_p[2], const uint8_t no_q[2], ++ uint8_t * _pix_l); ++void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4, ++ unsigned int no_f); ++void ff_hevc_rpi_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f); ++ ++void ff_hevc_rpi_transform_4x4_neon_8(int16_t *coeffs, int col_limit); ++void ff_hevc_rpi_transform_8x8_neon_8(int16_t *coeffs, int col_limit); ++void ff_hevc_rpi_idct_4x4_dc_neon_8(int16_t *coeffs); ++void ff_hevc_rpi_idct_8x8_dc_neon_8(int16_t *coeffs); ++void ff_hevc_rpi_idct_16x16_dc_neon_8(int16_t *coeffs); ++void ff_hevc_rpi_idct_32x32_dc_neon_8(int16_t *coeffs); ++void ff_hevc_rpi_transform_luma_4x4_neon_8(int16_t *coeffs); ++ ++void ff_hevc_rpi_transform_4x4_neon_10(int16_t *coeffs, int col_limit); ++void ff_hevc_rpi_transform_8x8_neon_10(int16_t *coeffs, int col_limit); ++void ff_hevc_rpi_idct_4x4_dc_neon_10(int16_t *coeffs); ++void ff_hevc_rpi_idct_8x8_dc_neon_10(int16_t *coeffs); ++void ff_hevc_rpi_idct_16x16_dc_neon_10(int16_t *coeffs); ++void ff_hevc_rpi_idct_32x32_dc_neon_10(int16_t *coeffs); ++void ff_hevc_rpi_transform_luma_4x4_neon_10(int16_t *coeffs); ++ ++void ff_hevc_rpi_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void 
ff_hevc_rpi_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++ ++void ff_hevc_rpi_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++ ++ ++void ff_hevc_rpi_add_residual_4x4_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++ ++void ff_hevc_rpi_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++ ++ ++void ff_hevc_rpi_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_rpi_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride, int dc_u); -+void ff_hevc_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual, ++void ff_hevc_rpi_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride, int dc_u); -+void ff_hevc_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual, ++void ff_hevc_rpi_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride); -+void ff_hevc_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual, ++void ff_hevc_rpi_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride); -+void ff_hevc_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual, ++void ff_hevc_rpi_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual, + ptrdiff_t stride); -+void ff_hevc_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); -+void ff_hevc_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); -+void ff_hevc_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); -+#endif ++void ff_hevc_rpi_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_rpi_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_rpi_add_residual_16x16_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++ ++ ++void ff_hevc_rpi_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void 
ff_hevc_rpi_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_rpi_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_rpi_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_rpi_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_rpi_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_rpi_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); + -+void ff_hevc_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void ff_hevc_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void ff_hevc_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void ff_hevc_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); + -+void ff_hevc_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void ff_hevc_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void ff_hevc_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); -+void ff_hevc_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); + -+#if RPI_HEVC_SAND -+void ff_hevc_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++void 
ff_hevc_rpi_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, + const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, + int eo, int width, int height); -+void ff_hevc_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++void ff_hevc_rpi_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, + const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, + int eo, int width, int height); -+void ff_hevc_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++void ff_hevc_rpi_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, + const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, + int eo, int width, int height); + -+void ff_hevc_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++void ff_hevc_rpi_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, + const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, + int eo, int width, int height); -+void ff_hevc_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++void ff_hevc_rpi_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, + const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, + int eo, int width, int height); -+void ff_hevc_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++void ff_hevc_rpi_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, + const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, + int eo, int width, int height); + -+void ff_hevc_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ++void ff_hevc_rpi_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height); -+void ff_hevc_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ++void ff_hevc_rpi_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height); -+void ff_hevc_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ++void ff_hevc_rpi_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height); + -+void ff_hevc_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ++void ff_hevc_rpi_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height); -+void ff_hevc_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ++void ff_hevc_rpi_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height); -+void ff_hevc_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ++void ff_hevc_rpi_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + 
const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height); -+#endif + -+void ff_hevc_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++void ff_hevc_rpi_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, + int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++void ff_hevc_rpi_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, + int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++void ff_hevc_rpi_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, + int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++void ff_hevc_rpi_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, + int16_t *sao_offset_val, int sao_left_class, int width, int height); + -+void ff_hevc_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++void ff_hevc_rpi_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, + int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++void ff_hevc_rpi_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, + int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++void ff_hevc_rpi_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, + int16_t *sao_offset_val, int sao_left_class, int width, int height); -+void ff_hevc_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++void ff_hevc_rpi_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, + int16_t *sao_offset_val, int sao_left_class, int width, int height); + + - #define PUT_PIXELS(name) \ - void name(int16_t *dst, uint8_t *src, \ - ptrdiff_t srcstride, int height, \ -@@ -58,6 +248,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8); - PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8); - PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8); - #undef PUT_PIXELS -+void ff_hevc_put_epel_h_neon_8(int16_t *dst, uint8_t *src, ++#define PUT_PIXELS(name) \ ++ void name(int16_t *dst, uint8_t *src, \ ++ ptrdiff_t srcstride, int height, \ ++ intptr_t mx, intptr_t my, int width) ++PUT_PIXELS(ff_hevc_rpi_put_pixels_w2_neon_8); ++PUT_PIXELS(ff_hevc_rpi_put_pixels_w4_neon_8); ++PUT_PIXELS(ff_hevc_rpi_put_pixels_w6_neon_8); ++PUT_PIXELS(ff_hevc_rpi_put_pixels_w8_neon_8); ++PUT_PIXELS(ff_hevc_rpi_put_pixels_w12_neon_8); ++PUT_PIXELS(ff_hevc_rpi_put_pixels_w16_neon_8); ++PUT_PIXELS(ff_hevc_rpi_put_pixels_w24_neon_8); ++PUT_PIXELS(ff_hevc_rpi_put_pixels_w32_neon_8); ++PUT_PIXELS(ff_hevc_rpi_put_pixels_w48_neon_8); ++PUT_PIXELS(ff_hevc_rpi_put_pixels_w64_neon_8); ++#undef PUT_PIXELS ++void ff_hevc_rpi_put_epel_h_neon_8(int16_t *dst, uint8_t *src, + 
ptrdiff_t srcstride, int height, + intptr_t mx, intptr_t my, int width); -+void ff_hevc_put_epel_v_neon_8(int16_t *dst, uint8_t *src, ++void ff_hevc_rpi_put_epel_v_neon_8(int16_t *dst, uint8_t *src, + ptrdiff_t srcstride, int height, + intptr_t mx, intptr_t my, int width); -+void ff_hevc_put_epel_hv_neon_8(int16_t *dst, uint8_t *src, ++void ff_hevc_rpi_put_epel_hv_neon_8(int16_t *dst, uint8_t *src, + ptrdiff_t srcstride, int height, + intptr_t mx, intptr_t my, int width); - - static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, - int height, int width); -@@ -142,14 +341,124 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t - put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE); - } - -+void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, ++ ++static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, ++ int height, int width); ++static void (*put_hevc_qpel_uw_neon[4][4])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int width, int height, int16_t* src2, ptrdiff_t src2stride); ++void ff_hevc_rpi_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride, ++ int height, intptr_t mx, intptr_t my, int width); ++void ff_hevc_rpi_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, ++ int height, intptr_t mx, intptr_t my, int width); ++void ff_hevc_rpi_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width); ++#define QPEL_FUNC(name) \ ++ void name(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, \ ++ int height, int width) ++ ++QPEL_FUNC(ff_hevc_rpi_put_qpel_v1_neon_8); ++QPEL_FUNC(ff_hevc_rpi_put_qpel_v2_neon_8); ++QPEL_FUNC(ff_hevc_rpi_put_qpel_v3_neon_8); ++QPEL_FUNC(ff_hevc_rpi_put_qpel_h1_neon_8); ++QPEL_FUNC(ff_hevc_rpi_put_qpel_h2_neon_8); ++QPEL_FUNC(ff_hevc_rpi_put_qpel_h3_neon_8); ++QPEL_FUNC(ff_hevc_rpi_put_qpel_h1v1_neon_8); ++QPEL_FUNC(ff_hevc_rpi_put_qpel_h1v2_neon_8); ++QPEL_FUNC(ff_hevc_rpi_put_qpel_h1v3_neon_8); ++QPEL_FUNC(ff_hevc_rpi_put_qpel_h2v1_neon_8); ++QPEL_FUNC(ff_hevc_rpi_put_qpel_h2v2_neon_8); ++QPEL_FUNC(ff_hevc_rpi_put_qpel_h2v3_neon_8); ++QPEL_FUNC(ff_hevc_rpi_put_qpel_h3v1_neon_8); ++QPEL_FUNC(ff_hevc_rpi_put_qpel_h3v2_neon_8); ++QPEL_FUNC(ff_hevc_rpi_put_qpel_h3v3_neon_8); ++#undef QPEL_FUNC ++ ++#define QPEL_FUNC_UW_PIX(name) \ ++ void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \ ++ int height, intptr_t mx, intptr_t my, int width); ++QPEL_FUNC_UW_PIX(ff_hevc_rpi_put_qpel_uw_pixels_w4_neon_8); ++QPEL_FUNC_UW_PIX(ff_hevc_rpi_put_qpel_uw_pixels_w8_neon_8); ++QPEL_FUNC_UW_PIX(ff_hevc_rpi_put_qpel_uw_pixels_w16_neon_8); ++QPEL_FUNC_UW_PIX(ff_hevc_rpi_put_qpel_uw_pixels_w24_neon_8); ++QPEL_FUNC_UW_PIX(ff_hevc_rpi_put_qpel_uw_pixels_w32_neon_8); ++QPEL_FUNC_UW_PIX(ff_hevc_rpi_put_qpel_uw_pixels_w48_neon_8); ++QPEL_FUNC_UW_PIX(ff_hevc_rpi_put_qpel_uw_pixels_w64_neon_8); ++#undef QPEL_FUNC_UW_PIX ++ ++#define QPEL_FUNC_UW(name) \ ++ void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \ ++ int width, int height, int16_t* src2, ptrdiff_t src2stride); ++QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_pixels_neon_8); ++QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_v1_neon_8); 
++QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_v2_neon_8); ++QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_v3_neon_8); ++QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h1_neon_8); ++QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h2_neon_8); ++QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h3_neon_8); ++QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h1v1_neon_8); ++QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h1v2_neon_8); ++QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h1v3_neon_8); ++QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h2v1_neon_8); ++QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h2v2_neon_8); ++QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h2v3_neon_8); ++QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h3v1_neon_8); ++QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h3v2_neon_8); ++QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h3v3_neon_8); ++#undef QPEL_FUNC_UW ++ ++void ff_hevc_rpi_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride, ++ int height, intptr_t mx, intptr_t my, int width) { ++ ++ put_hevc_qpel_neon[my][mx](dst, MAX_PB_SIZE, src, srcstride, height, width); ++} ++ ++void ff_hevc_rpi_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, ++ int height, intptr_t mx, intptr_t my, int width) { ++ ++ put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, NULL, 0); ++} ++ ++void ff_hevc_rpi_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) { ++ put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE); ++} ++ ++void ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, + const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, + const MvField *curr, const MvField *neigh, uint8_t *bs); + + -+static void ff_hevc_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++static void ff_hevc_rpi_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) +{ -+ ff_hevc_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); -+ ff_hevc_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height); ++ ff_hevc_rpi_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); ++ ff_hevc_rpi_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height); +} -+static void ff_hevc_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++static void ff_hevc_rpi_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) +{ -+ ff_hevc_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); -+ ff_hevc_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height); ++ ff_hevc_rpi_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); ++ ff_hevc_rpi_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height); +} + -+static void ff_hevc_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++static void ff_hevc_rpi_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, + int16_t *sao_offset_val, int sao_left_class, int width, int height) +{ -+ ff_hevc_sao_band_32_neon_8(_dst, _src, stride_dst, 
stride_src, sao_offset_val, sao_left_class, 32, height); -+ ff_hevc_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++ ff_hevc_rpi_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); ++ ff_hevc_rpi_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); +} -+static void ff_hevc_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++static void ff_hevc_rpi_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, + int16_t *sao_offset_val, int sao_left_class, int width, int height) +{ -+ ff_hevc_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); -+ ff_hevc_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++ ff_hevc_rpi_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); ++ ff_hevc_rpi_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); +} + +#if SAO_FILTER_N == 6 -+static void ff_hevc_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++static void ff_hevc_rpi_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) +{ -+ ff_hevc_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); -+ ff_hevc_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height); ++ ff_hevc_rpi_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); ++ ff_hevc_rpi_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height); +} -+static void ff_hevc_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++static void ff_hevc_rpi_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) +{ -+ ff_hevc_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); -+ ff_hevc_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height); ++ ff_hevc_rpi_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); ++ ff_hevc_rpi_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height); +} + -+static void ff_hevc_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++static void ff_hevc_rpi_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, + int16_t *sao_offset_val, int sao_left_class, int width, int height) +{ -+ ff_hevc_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); -+ ff_hevc_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); ++ ff_hevc_rpi_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++ ff_hevc_rpi_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); +} -+static void ff_hevc_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++static void 
ff_hevc_rpi_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, + int16_t *sao_offset_val, int sao_left_class, int width, int height) +{ -+ ff_hevc_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); -+ ff_hevc_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); ++ ff_hevc_rpi_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++ ff_hevc_rpi_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); +} + -+#if RPI_HEVC_SAND -+static void ff_hevc_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++static void ff_hevc_rpi_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, + const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, + int eo, int width, int height) +{ -+ ff_hevc_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); -+ ff_hevc_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); ++ ff_hevc_rpi_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); ++ ff_hevc_rpi_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); +} -+static void ff_hevc_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++static void ff_hevc_rpi_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, + const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, + int eo, int width, int height) +{ -+ ff_hevc_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); -+ ff_hevc_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); ++ ff_hevc_rpi_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); ++ ff_hevc_rpi_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); +} + -+static void ff_hevc_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ++static void ff_hevc_rpi_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height) +{ -+ ff_hevc_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src, ++ ff_hevc_rpi_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src, + sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); -+ ff_hevc_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, ++ ff_hevc_rpi_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, + sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); +} -+static void ff_hevc_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ++static void ff_hevc_rpi_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height) +{ -+ ff_hevc_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src, ++ ff_hevc_rpi_sao_band_c_16_neon_10(_dst, _src, 
stride_dst, stride_src, + sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); -+ ff_hevc_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, ++ ff_hevc_rpi_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, + sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); +} +#endif -+#endif + + + @@ -4265,787 +4568,1420 @@ index 1a3912c609..ad625d067a 100644 +#error SAO edge src stride not 160 - value used in .S +#endif + - av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) - { - if (bit_depth == 8) { - int x; - c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon; -+ c->hevc_v_loop_filter_luma_c = ff_hevc_v_loop_filter_luma_neon; - c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon; -+ c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon; - c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon; -+ c->hevc_v_loop_filter_chroma_c = ff_hevc_v_loop_filter_chroma_neon; - c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon; -+ c->hevc_h_loop_filter_chroma_c = ff_hevc_h_loop_filter_chroma_neon; -+#ifdef RPI -+ c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_8; -+ c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_8; -+ c->hevc_v_loop_filter_uv2 = ff_hevc_v_loop_filter_uv2_neon_8; -+#endif - c->idct[0] = ff_hevc_transform_4x4_neon_8; - c->idct[1] = ff_hevc_transform_8x8_neon_8; - c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_8; -@@ -160,7 +469,53 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) - c->add_residual[1] = ff_hevc_add_residual_8x8_neon_8; - c->add_residual[2] = ff_hevc_add_residual_16x16_neon_8; - c->add_residual[3] = ff_hevc_add_residual_32x32_neon_8; -+ c->add_residual_dc[0] = ff_hevc_add_residual_4x4_dc_neon_8; -+ c->add_residual_dc[1] = ff_hevc_add_residual_8x8_dc_neon_8; -+ c->add_residual_dc[2] = ff_hevc_add_residual_16x16_dc_neon_8; -+ c->add_residual_dc[3] = ff_hevc_add_residual_32x32_dc_neon_8; -+#if RPI_HEVC_SAND -+ c->add_residual_u[0] = ff_hevc_add_residual_4x4_u_neon_8; -+ c->add_residual_u[1] = ff_hevc_add_residual_8x8_u_neon_8; -+ c->add_residual_u[2] = ff_hevc_add_residual_16x16_u_neon_8; -+ c->add_residual_v[0] = ff_hevc_add_residual_4x4_v_neon_8; -+ c->add_residual_v[1] = ff_hevc_add_residual_8x8_v_neon_8; -+ c->add_residual_v[2] = ff_hevc_add_residual_16x16_v_neon_8; -+ c->add_residual_c[0] = ff_hevc_add_residual_4x4_c_neon_8; -+ c->add_residual_c[1] = ff_hevc_add_residual_8x8_c_neon_8; -+ c->add_residual_c[2] = ff_hevc_add_residual_16x16_c_neon_8; -+ c->add_residual_dc_c[0] = ff_hevc_add_residual_4x4_dc_c_neon_8; -+ c->add_residual_dc_c[1] = ff_hevc_add_residual_8x8_dc_c_neon_8; -+ c->add_residual_dc_c[2] = ff_hevc_add_residual_16x16_dc_c_neon_8; -+#endif - c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_8; -+ c->sao_band_filter[0] = ff_hevc_sao_band_8_neon_8; -+ c->sao_band_filter[1] = ff_hevc_sao_band_16_neon_8; -+ c->sao_band_filter[2] = ff_hevc_sao_band_32_neon_8; -+ c->sao_band_filter[3] = ff_hevc_sao_band_48_neon_8; -+ c->sao_band_filter[4] = ff_hevc_sao_band_64_neon_8; -+ c->sao_edge_filter[0] = ff_hevc_sao_edge_8_neon_8; -+ c->sao_edge_filter[1] = ff_hevc_sao_edge_16_neon_8; -+ c->sao_edge_filter[2] = ff_hevc_sao_edge_32_neon_8; -+ c->sao_edge_filter[3] = ff_hevc_sao_edge_48_neon_8; -+ c->sao_edge_filter[4] = ff_hevc_sao_edge_64_neon_8; ++av_cold void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth) ++{ ++ if (bit_depth == 8) { ++ int 
x; ++ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon; ++ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon; ++ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon; ++ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon; ++ c->hevc_v_loop_filter_chroma = ff_hevc_rpi_v_loop_filter_chroma_neon; ++ c->hevc_v_loop_filter_chroma_c = ff_hevc_rpi_v_loop_filter_chroma_neon; ++ c->hevc_h_loop_filter_chroma = ff_hevc_rpi_h_loop_filter_chroma_neon; ++ c->hevc_h_loop_filter_chroma_c = ff_hevc_rpi_h_loop_filter_chroma_neon; ++ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_8; ++ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_8; ++ c->hevc_v_loop_filter_uv2 = ff_hevc_rpi_v_loop_filter_uv2_neon_8; ++ c->idct[0] = ff_hevc_rpi_transform_4x4_neon_8; ++ c->idct[1] = ff_hevc_rpi_transform_8x8_neon_8; ++ c->idct_dc[0] = ff_hevc_rpi_idct_4x4_dc_neon_8; ++ c->idct_dc[1] = ff_hevc_rpi_idct_8x8_dc_neon_8; ++ c->idct_dc[2] = ff_hevc_rpi_idct_16x16_dc_neon_8; ++ c->idct_dc[3] = ff_hevc_rpi_idct_32x32_dc_neon_8; ++ c->add_residual[0] = ff_hevc_rpi_add_residual_4x4_neon_8; ++ c->add_residual[1] = ff_hevc_rpi_add_residual_8x8_neon_8; ++ c->add_residual[2] = ff_hevc_rpi_add_residual_16x16_neon_8; ++ c->add_residual[3] = ff_hevc_rpi_add_residual_32x32_neon_8; ++ c->add_residual_dc[0] = ff_hevc_rpi_add_residual_4x4_dc_neon_8; ++ c->add_residual_dc[1] = ff_hevc_rpi_add_residual_8x8_dc_neon_8; ++ c->add_residual_dc[2] = ff_hevc_rpi_add_residual_16x16_dc_neon_8; ++ c->add_residual_dc[3] = ff_hevc_rpi_add_residual_32x32_dc_neon_8; ++ c->add_residual_u[0] = ff_hevc_rpi_add_residual_4x4_u_neon_8; ++ c->add_residual_u[1] = ff_hevc_rpi_add_residual_8x8_u_neon_8; ++ c->add_residual_u[2] = ff_hevc_rpi_add_residual_16x16_u_neon_8; ++ c->add_residual_v[0] = ff_hevc_rpi_add_residual_4x4_v_neon_8; ++ c->add_residual_v[1] = ff_hevc_rpi_add_residual_8x8_v_neon_8; ++ c->add_residual_v[2] = ff_hevc_rpi_add_residual_16x16_v_neon_8; ++ c->add_residual_c[0] = ff_hevc_rpi_add_residual_4x4_c_neon_8; ++ c->add_residual_c[1] = ff_hevc_rpi_add_residual_8x8_c_neon_8; ++ c->add_residual_c[2] = ff_hevc_rpi_add_residual_16x16_c_neon_8; ++ c->add_residual_dc_c[0] = ff_hevc_rpi_add_residual_4x4_dc_c_neon_8; ++ c->add_residual_dc_c[1] = ff_hevc_rpi_add_residual_8x8_dc_c_neon_8; ++ c->add_residual_dc_c[2] = ff_hevc_rpi_add_residual_16x16_dc_c_neon_8; ++ c->transform_4x4_luma = ff_hevc_rpi_transform_luma_4x4_neon_8; ++ c->sao_band_filter[0] = ff_hevc_rpi_sao_band_8_neon_8; ++ c->sao_band_filter[1] = ff_hevc_rpi_sao_band_16_neon_8; ++ c->sao_band_filter[2] = ff_hevc_rpi_sao_band_32_neon_8; ++ c->sao_band_filter[3] = ff_hevc_rpi_sao_band_48_neon_8; ++ c->sao_band_filter[4] = ff_hevc_rpi_sao_band_64_neon_8; ++ c->sao_edge_filter[0] = ff_hevc_rpi_sao_edge_8_neon_8; ++ c->sao_edge_filter[1] = ff_hevc_rpi_sao_edge_16_neon_8; ++ c->sao_edge_filter[2] = ff_hevc_rpi_sao_edge_32_neon_8; ++ c->sao_edge_filter[3] = ff_hevc_rpi_sao_edge_48_neon_8; ++ c->sao_edge_filter[4] = ff_hevc_rpi_sao_edge_64_neon_8; +#if SAO_FILTER_N == 6 -+ c->sao_band_filter[5] = ff_hevc_sao_band_24_neon_8; -+ c->sao_edge_filter[5] = ff_hevc_sao_edge_24_neon_8; ++ c->sao_band_filter[5] = ff_hevc_rpi_sao_band_24_neon_8; ++ c->sao_edge_filter[5] = ff_hevc_rpi_sao_edge_24_neon_8; +#endif -+#if RPI_HEVC_SAND -+ c->sao_band_filter_c[0] = ff_hevc_sao_band_c_8_neon_8; -+ c->sao_band_filter_c[1] = ff_hevc_sao_band_c_16_neon_8; -+ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_32_neon_8; ++ 
c->sao_band_filter_c[0] = ff_hevc_rpi_sao_band_c_8_neon_8; ++ c->sao_band_filter_c[1] = ff_hevc_rpi_sao_band_c_16_neon_8; ++ c->sao_band_filter_c[2] = ff_hevc_rpi_sao_band_c_32_neon_8; + -+ c->sao_edge_filter_c[0] = ff_hevc_sao_edge_c_8_neon_8; -+ c->sao_edge_filter_c[1] = ff_hevc_sao_edge_c_16_neon_8; -+ c->sao_edge_filter_c[2] = ff_hevc_sao_edge_c_32_neon_8; ++ c->sao_edge_filter_c[0] = ff_hevc_rpi_sao_edge_c_8_neon_8; ++ c->sao_edge_filter_c[1] = ff_hevc_rpi_sao_edge_c_16_neon_8; ++ c->sao_edge_filter_c[2] = ff_hevc_rpi_sao_edge_c_32_neon_8; + +#if SAO_FILTER_N == 6 -+ c->sao_band_filter_c[5] = ff_hevc_sao_band_c_24_neon_8; -+ c->sao_edge_filter_c[5] = ff_hevc_sao_edge_c_24_neon_8; -+#endif -+#endif - put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8; - put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8; - put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8; -@@ -201,7 +556,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) - c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper; - c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper; - c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper; -+ c->put_hevc_epel[x][1][0] = ff_hevc_put_epel_v_neon_8; -+ c->put_hevc_epel[x][0][1] = ff_hevc_put_epel_h_neon_8; -+ c->put_hevc_epel[x][1][1] = ff_hevc_put_epel_hv_neon_8; - } -+ c->put_hevc_epel[0][0][0] = ff_hevc_put_pixels_w2_neon_8; -+ c->put_hevc_epel[1][0][0] = ff_hevc_put_pixels_w4_neon_8; -+ c->put_hevc_epel[2][0][0] = ff_hevc_put_pixels_w6_neon_8; -+ c->put_hevc_epel[3][0][0] = ff_hevc_put_pixels_w8_neon_8; -+ c->put_hevc_epel[4][0][0] = ff_hevc_put_pixels_w12_neon_8; -+ c->put_hevc_epel[5][0][0] = ff_hevc_put_pixels_w16_neon_8; -+ c->put_hevc_epel[6][0][0] = ff_hevc_put_pixels_w24_neon_8; -+ c->put_hevc_epel[7][0][0] = ff_hevc_put_pixels_w32_neon_8; -+ c->put_hevc_epel[8][0][0] = ff_hevc_put_pixels_w48_neon_8; -+ c->put_hevc_epel[9][0][0] = ff_hevc_put_pixels_w64_neon_8; -+ - c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8; - c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8; - c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8; -@@ -221,4 +590,82 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) - c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8; - c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8; - } ++ c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_8; ++ c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_8; ++#endif ++ put_hevc_qpel_neon[1][0] = ff_hevc_rpi_put_qpel_v1_neon_8; ++ put_hevc_qpel_neon[2][0] = ff_hevc_rpi_put_qpel_v2_neon_8; ++ put_hevc_qpel_neon[3][0] = ff_hevc_rpi_put_qpel_v3_neon_8; ++ put_hevc_qpel_neon[0][1] = ff_hevc_rpi_put_qpel_h1_neon_8; ++ put_hevc_qpel_neon[0][2] = ff_hevc_rpi_put_qpel_h2_neon_8; ++ put_hevc_qpel_neon[0][3] = ff_hevc_rpi_put_qpel_h3_neon_8; ++ put_hevc_qpel_neon[1][1] = ff_hevc_rpi_put_qpel_h1v1_neon_8; ++ put_hevc_qpel_neon[1][2] = ff_hevc_rpi_put_qpel_h2v1_neon_8; ++ put_hevc_qpel_neon[1][3] = ff_hevc_rpi_put_qpel_h3v1_neon_8; ++ put_hevc_qpel_neon[2][1] = ff_hevc_rpi_put_qpel_h1v2_neon_8; ++ put_hevc_qpel_neon[2][2] = ff_hevc_rpi_put_qpel_h2v2_neon_8; ++ put_hevc_qpel_neon[2][3] = ff_hevc_rpi_put_qpel_h3v2_neon_8; ++ put_hevc_qpel_neon[3][1] = ff_hevc_rpi_put_qpel_h1v3_neon_8; ++ put_hevc_qpel_neon[3][2] = ff_hevc_rpi_put_qpel_h2v3_neon_8; ++ put_hevc_qpel_neon[3][3] = ff_hevc_rpi_put_qpel_h3v3_neon_8; ++ put_hevc_qpel_uw_neon[1][0] = ff_hevc_rpi_put_qpel_uw_v1_neon_8; 
++ put_hevc_qpel_uw_neon[2][0] = ff_hevc_rpi_put_qpel_uw_v2_neon_8; ++ put_hevc_qpel_uw_neon[3][0] = ff_hevc_rpi_put_qpel_uw_v3_neon_8; ++ put_hevc_qpel_uw_neon[0][1] = ff_hevc_rpi_put_qpel_uw_h1_neon_8; ++ put_hevc_qpel_uw_neon[0][2] = ff_hevc_rpi_put_qpel_uw_h2_neon_8; ++ put_hevc_qpel_uw_neon[0][3] = ff_hevc_rpi_put_qpel_uw_h3_neon_8; ++ put_hevc_qpel_uw_neon[1][1] = ff_hevc_rpi_put_qpel_uw_h1v1_neon_8; ++ put_hevc_qpel_uw_neon[1][2] = ff_hevc_rpi_put_qpel_uw_h2v1_neon_8; ++ put_hevc_qpel_uw_neon[1][3] = ff_hevc_rpi_put_qpel_uw_h3v1_neon_8; ++ put_hevc_qpel_uw_neon[2][1] = ff_hevc_rpi_put_qpel_uw_h1v2_neon_8; ++ put_hevc_qpel_uw_neon[2][2] = ff_hevc_rpi_put_qpel_uw_h2v2_neon_8; ++ put_hevc_qpel_uw_neon[2][3] = ff_hevc_rpi_put_qpel_uw_h3v2_neon_8; ++ put_hevc_qpel_uw_neon[3][1] = ff_hevc_rpi_put_qpel_uw_h1v3_neon_8; ++ put_hevc_qpel_uw_neon[3][2] = ff_hevc_rpi_put_qpel_uw_h2v3_neon_8; ++ put_hevc_qpel_uw_neon[3][3] = ff_hevc_rpi_put_qpel_uw_h3v3_neon_8; ++ for (x = 0; x < 10; x++) { ++ c->put_hevc_qpel[x][1][0] = ff_hevc_rpi_put_qpel_neon_wrapper; ++ c->put_hevc_qpel[x][0][1] = ff_hevc_rpi_put_qpel_neon_wrapper; ++ c->put_hevc_qpel[x][1][1] = ff_hevc_rpi_put_qpel_neon_wrapper; ++ c->put_hevc_qpel_uni[x][1][0] = ff_hevc_rpi_put_qpel_uni_neon_wrapper; ++ c->put_hevc_qpel_uni[x][0][1] = ff_hevc_rpi_put_qpel_uni_neon_wrapper; ++ c->put_hevc_qpel_uni[x][1][1] = ff_hevc_rpi_put_qpel_uni_neon_wrapper; ++ c->put_hevc_qpel_bi[x][1][0] = ff_hevc_rpi_put_qpel_bi_neon_wrapper; ++ c->put_hevc_qpel_bi[x][0][1] = ff_hevc_rpi_put_qpel_bi_neon_wrapper; ++ c->put_hevc_qpel_bi[x][1][1] = ff_hevc_rpi_put_qpel_bi_neon_wrapper; ++ c->put_hevc_epel[x][1][0] = ff_hevc_rpi_put_epel_v_neon_8; ++ c->put_hevc_epel[x][0][1] = ff_hevc_rpi_put_epel_h_neon_8; ++ c->put_hevc_epel[x][1][1] = ff_hevc_rpi_put_epel_hv_neon_8; ++ } ++ c->put_hevc_epel[0][0][0] = ff_hevc_rpi_put_pixels_w2_neon_8; ++ c->put_hevc_epel[1][0][0] = ff_hevc_rpi_put_pixels_w4_neon_8; ++ c->put_hevc_epel[2][0][0] = ff_hevc_rpi_put_pixels_w6_neon_8; ++ c->put_hevc_epel[3][0][0] = ff_hevc_rpi_put_pixels_w8_neon_8; ++ c->put_hevc_epel[4][0][0] = ff_hevc_rpi_put_pixels_w12_neon_8; ++ c->put_hevc_epel[5][0][0] = ff_hevc_rpi_put_pixels_w16_neon_8; ++ c->put_hevc_epel[6][0][0] = ff_hevc_rpi_put_pixels_w24_neon_8; ++ c->put_hevc_epel[7][0][0] = ff_hevc_rpi_put_pixels_w32_neon_8; ++ c->put_hevc_epel[8][0][0] = ff_hevc_rpi_put_pixels_w48_neon_8; ++ c->put_hevc_epel[9][0][0] = ff_hevc_rpi_put_pixels_w64_neon_8; ++ ++ c->put_hevc_qpel[0][0][0] = ff_hevc_rpi_put_pixels_w2_neon_8; ++ c->put_hevc_qpel[1][0][0] = ff_hevc_rpi_put_pixels_w4_neon_8; ++ c->put_hevc_qpel[2][0][0] = ff_hevc_rpi_put_pixels_w6_neon_8; ++ c->put_hevc_qpel[3][0][0] = ff_hevc_rpi_put_pixels_w8_neon_8; ++ c->put_hevc_qpel[4][0][0] = ff_hevc_rpi_put_pixels_w12_neon_8; ++ c->put_hevc_qpel[5][0][0] = ff_hevc_rpi_put_pixels_w16_neon_8; ++ c->put_hevc_qpel[6][0][0] = ff_hevc_rpi_put_pixels_w24_neon_8; ++ c->put_hevc_qpel[7][0][0] = ff_hevc_rpi_put_pixels_w32_neon_8; ++ c->put_hevc_qpel[8][0][0] = ff_hevc_rpi_put_pixels_w48_neon_8; ++ c->put_hevc_qpel[9][0][0] = ff_hevc_rpi_put_pixels_w64_neon_8; ++ ++ c->put_hevc_qpel_uni[1][0][0] = ff_hevc_rpi_put_qpel_uw_pixels_w4_neon_8; ++ c->put_hevc_qpel_uni[3][0][0] = ff_hevc_rpi_put_qpel_uw_pixels_w8_neon_8; ++ c->put_hevc_qpel_uni[5][0][0] = ff_hevc_rpi_put_qpel_uw_pixels_w16_neon_8; ++ c->put_hevc_qpel_uni[6][0][0] = ff_hevc_rpi_put_qpel_uw_pixels_w24_neon_8; ++ c->put_hevc_qpel_uni[7][0][0] = ff_hevc_rpi_put_qpel_uw_pixels_w32_neon_8; ++ 
c->put_hevc_qpel_uni[8][0][0] = ff_hevc_rpi_put_qpel_uw_pixels_w48_neon_8; ++ c->put_hevc_qpel_uni[9][0][0] = ff_hevc_rpi_put_qpel_uw_pixels_w64_neon_8; ++ } + else if (bit_depth == 10) { -+ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon_10; -+ c->hevc_v_loop_filter_luma_c = ff_hevc_v_loop_filter_luma_neon_10; -+ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon_10; -+ c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon_10; -+ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon_10; -+ c->hevc_v_loop_filter_chroma_c = ff_hevc_v_loop_filter_chroma_neon_10; -+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon_10; -+ c->hevc_h_loop_filter_chroma_c = ff_hevc_h_loop_filter_chroma_neon_10; -+#ifdef RPI -+ c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_10; -+ c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_10; -+ c->hevc_v_loop_filter_uv2 = ff_hevc_v_loop_filter_uv2_neon_10; -+#endif -+ c->idct[0] = ff_hevc_transform_4x4_neon_10; -+ c->idct[1] = ff_hevc_transform_8x8_neon_10; -+ c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_10; -+ c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_10; -+ c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_10; -+ c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_10; -+ c->add_residual[0] = ff_hevc_add_residual_4x4_neon_10; -+ c->add_residual[1] = ff_hevc_add_residual_8x8_neon_10; -+ c->add_residual[2] = ff_hevc_add_residual_16x16_neon_10; -+ c->add_residual[3] = ff_hevc_add_residual_32x32_neon_10; -+ c->add_residual_dc[0] = ff_hevc_add_residual_4x4_dc_neon_10; -+ c->add_residual_dc[1] = ff_hevc_add_residual_8x8_dc_neon_10; -+ c->add_residual_dc[2] = ff_hevc_add_residual_16x16_dc_neon_10; -+ c->add_residual_dc[3] = ff_hevc_add_residual_32x32_dc_neon_10; -+#if RPI_HEVC_SAND -+ c->add_residual_u[0] = ff_hevc_add_residual_4x4_u_neon_10; -+ c->add_residual_u[1] = ff_hevc_add_residual_8x8_u_neon_10; -+ c->add_residual_u[2] = ff_hevc_add_residual_16x16_u_neon_10; -+ c->add_residual_v[0] = ff_hevc_add_residual_4x4_v_neon_10; -+ c->add_residual_v[1] = ff_hevc_add_residual_8x8_v_neon_10; -+ c->add_residual_v[2] = ff_hevc_add_residual_16x16_v_neon_10; -+ c->add_residual_c[0] = ff_hevc_add_residual_4x4_c_neon_10; -+ c->add_residual_c[1] = ff_hevc_add_residual_8x8_c_neon_10; -+ c->add_residual_c[2] = ff_hevc_add_residual_16x16_c_neon_10; -+ c->add_residual_dc_c[0] = ff_hevc_add_residual_4x4_dc_c_neon_10; -+ c->add_residual_dc_c[1] = ff_hevc_add_residual_8x8_dc_c_neon_10; -+ c->add_residual_dc_c[2] = ff_hevc_add_residual_16x16_dc_c_neon_10; -+#endif -+ c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_10; -+ c->sao_band_filter[0] = ff_hevc_sao_band_8_neon_10; -+ c->sao_band_filter[1] = ff_hevc_sao_band_16_neon_10; -+ c->sao_band_filter[2] = ff_hevc_sao_band_32_neon_10; -+ c->sao_band_filter[3] = ff_hevc_sao_band_48_neon_10; -+ c->sao_band_filter[4] = ff_hevc_sao_band_64_neon_10; -+ -+ c->sao_edge_filter[0] = ff_hevc_sao_edge_8_neon_10; -+ c->sao_edge_filter[1] = ff_hevc_sao_edge_16_neon_10; -+ c->sao_edge_filter[2] = ff_hevc_sao_edge_32_neon_10; -+ c->sao_edge_filter[3] = ff_hevc_sao_edge_48_neon_10; -+ c->sao_edge_filter[4] = ff_hevc_sao_edge_64_neon_10; ++ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_10; ++ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_10; ++ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_10; ++ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_10; ++ c->hevc_v_loop_filter_chroma = ff_hevc_rpi_v_loop_filter_chroma_neon_10; 
++ c->hevc_v_loop_filter_chroma_c = ff_hevc_rpi_v_loop_filter_chroma_neon_10; ++ c->hevc_h_loop_filter_chroma = ff_hevc_rpi_h_loop_filter_chroma_neon_10; ++ c->hevc_h_loop_filter_chroma_c = ff_hevc_rpi_h_loop_filter_chroma_neon_10; ++ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_10; ++ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_10; ++ c->hevc_v_loop_filter_uv2 = ff_hevc_rpi_v_loop_filter_uv2_neon_10; ++ c->idct[0] = ff_hevc_rpi_transform_4x4_neon_10; ++ c->idct[1] = ff_hevc_rpi_transform_8x8_neon_10; ++ c->idct_dc[0] = ff_hevc_rpi_idct_4x4_dc_neon_10; ++ c->idct_dc[1] = ff_hevc_rpi_idct_8x8_dc_neon_10; ++ c->idct_dc[2] = ff_hevc_rpi_idct_16x16_dc_neon_10; ++ c->idct_dc[3] = ff_hevc_rpi_idct_32x32_dc_neon_10; ++ c->add_residual[0] = ff_hevc_rpi_add_residual_4x4_neon_10; ++ c->add_residual[1] = ff_hevc_rpi_add_residual_8x8_neon_10; ++ c->add_residual[2] = ff_hevc_rpi_add_residual_16x16_neon_10; ++ c->add_residual[3] = ff_hevc_rpi_add_residual_32x32_neon_10; ++ c->add_residual_dc[0] = ff_hevc_rpi_add_residual_4x4_dc_neon_10; ++ c->add_residual_dc[1] = ff_hevc_rpi_add_residual_8x8_dc_neon_10; ++ c->add_residual_dc[2] = ff_hevc_rpi_add_residual_16x16_dc_neon_10; ++ c->add_residual_dc[3] = ff_hevc_rpi_add_residual_32x32_dc_neon_10; ++ c->add_residual_u[0] = ff_hevc_rpi_add_residual_4x4_u_neon_10; ++ c->add_residual_u[1] = ff_hevc_rpi_add_residual_8x8_u_neon_10; ++ c->add_residual_u[2] = ff_hevc_rpi_add_residual_16x16_u_neon_10; ++ c->add_residual_v[0] = ff_hevc_rpi_add_residual_4x4_v_neon_10; ++ c->add_residual_v[1] = ff_hevc_rpi_add_residual_8x8_v_neon_10; ++ c->add_residual_v[2] = ff_hevc_rpi_add_residual_16x16_v_neon_10; ++ c->add_residual_c[0] = ff_hevc_rpi_add_residual_4x4_c_neon_10; ++ c->add_residual_c[1] = ff_hevc_rpi_add_residual_8x8_c_neon_10; ++ c->add_residual_c[2] = ff_hevc_rpi_add_residual_16x16_c_neon_10; ++ c->add_residual_dc_c[0] = ff_hevc_rpi_add_residual_4x4_dc_c_neon_10; ++ c->add_residual_dc_c[1] = ff_hevc_rpi_add_residual_8x8_dc_c_neon_10; ++ c->add_residual_dc_c[2] = ff_hevc_rpi_add_residual_16x16_dc_c_neon_10; ++ c->transform_4x4_luma = ff_hevc_rpi_transform_luma_4x4_neon_10; ++ c->sao_band_filter[0] = ff_hevc_rpi_sao_band_8_neon_10; ++ c->sao_band_filter[1] = ff_hevc_rpi_sao_band_16_neon_10; ++ c->sao_band_filter[2] = ff_hevc_rpi_sao_band_32_neon_10; ++ c->sao_band_filter[3] = ff_hevc_rpi_sao_band_48_neon_10; ++ c->sao_band_filter[4] = ff_hevc_rpi_sao_band_64_neon_10; ++ ++ c->sao_edge_filter[0] = ff_hevc_rpi_sao_edge_8_neon_10; ++ c->sao_edge_filter[1] = ff_hevc_rpi_sao_edge_16_neon_10; ++ c->sao_edge_filter[2] = ff_hevc_rpi_sao_edge_32_neon_10; ++ c->sao_edge_filter[3] = ff_hevc_rpi_sao_edge_48_neon_10; ++ c->sao_edge_filter[4] = ff_hevc_rpi_sao_edge_64_neon_10; +#if SAO_FILTER_N == 6 -+ c->sao_band_filter[5] = ff_hevc_sao_band_24_neon_10; -+ c->sao_edge_filter[5] = ff_hevc_sao_edge_24_neon_10; ++ c->sao_band_filter[5] = ff_hevc_rpi_sao_band_24_neon_10; ++ c->sao_edge_filter[5] = ff_hevc_rpi_sao_edge_24_neon_10; +#endif -+#if RPI_HEVC_SAND -+ c->sao_band_filter_c[0] = ff_hevc_sao_band_c_8_neon_10; -+ c->sao_band_filter_c[1] = ff_hevc_sao_band_c_16_neon_10; -+ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_32_neon_10; ++ c->sao_band_filter_c[0] = ff_hevc_rpi_sao_band_c_8_neon_10; ++ c->sao_band_filter_c[1] = ff_hevc_rpi_sao_band_c_16_neon_10; ++ c->sao_band_filter_c[2] = ff_hevc_rpi_sao_band_c_32_neon_10; + -+ c->sao_edge_filter_c[0] = ff_hevc_sao_edge_c_8_neon_10; -+ c->sao_edge_filter_c[1] = ff_hevc_sao_edge_c_16_neon_10; -+ 
c->sao_edge_filter_c[2] = ff_hevc_sao_edge_c_32_neon_10; ++ c->sao_edge_filter_c[0] = ff_hevc_rpi_sao_edge_c_8_neon_10; ++ c->sao_edge_filter_c[1] = ff_hevc_rpi_sao_edge_c_16_neon_10; ++ c->sao_edge_filter_c[2] = ff_hevc_rpi_sao_edge_c_32_neon_10; + +#if SAO_FILTER_N == 6 -+ c->sao_band_filter_c[5] = ff_hevc_sao_band_c_24_neon_10; -+ c->sao_edge_filter_c[5] = ff_hevc_sao_edge_c_24_neon_10; -+#endif ++ c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_10; ++ c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_10; +#endif + } + + assert(offsetof(MvField, mv) == 0); + assert(offsetof(MvField, ref_idx) == 8); + assert(offsetof(MvField, pred_flag) == 10); -+ c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon; - } -diff --git a/libavcodec/arm/hevcdsp_res16_neon.S b/libavcodec/arm/hevcdsp_res16_neon.S ++ c->hevc_deblocking_boundary_strengths = ff_hevc_rpi_deblocking_boundary_strengths_neon; ++} +diff --git a/libavcodec/arm/rpi_hevcdsp_qpel_neon.S b/libavcodec/arm/rpi_hevcdsp_qpel_neon.S new file mode 100644 -index 0000000000..7cc5cd5e5c +index 0000000000..86a9dcc377 --- /dev/null -+++ b/libavcodec/arm/hevcdsp_res16_neon.S -@@ -0,0 +1,610 @@ ++++ b/libavcodec/arm/rpi_hevcdsp_qpel_neon.S +@@ -0,0 +1,999 @@ ++/* ++ * Copyright (c) 2014 - 2015 Seppo Tomperi ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ +#include "libavutil/arm/asm.S" +#include "neon.S" + -+#define BIT_DEPTH 10 ++#define MAX_PB_SIZE #64 + -+.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX -+ vmax.s16 \Q0, \Q_MIN -+ vmax.s16 \Q1, \Q_MIN -+ vmax.s16 \Q2, \Q_MIN -+ vmax.s16 \Q3, \Q_MIN -+ vmin.s16 \Q0, \Q_MAX -+ vmin.s16 \Q1, \Q_MAX -+ vmin.s16 \Q2, \Q_MAX -+ vmin.s16 \Q3, \Q_MAX ++.macro regshuffle_d8 ++ vmov d16, d17 ++ vmov d17, d18 ++ vmov d18, d19 ++ vmov d19, d20 ++ vmov d20, d21 ++ vmov d21, d22 ++ vmov d22, d23 +.endm + -+@ add_residual4x4( -+@ uint8_t *_dst, [r0] -+@ int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] -+ -+function JOIN(ff_hevc_add_residual_4x4_neon_, BIT_DEPTH), export=1 -+ vld1.16 {q10, q11}, [r1] -+ movw r3, #(1 << BIT_DEPTH) - 1 -+ vld1.16 {d0}, [r0, :64], r2 -+ vld1.16 {d1}, [r0, :64], r2 -+ vld1.16 {d2}, [r0, :64], r2 -+ vld1.16 {d3}, [r0, :64], r2 -+ vmov.i64 q8, #0 -+ vdup.i16 q9, r3 -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q11 -+ sub r0, r0, r2, lsl #2 -+ vmax.s16 q0, q0, q8 -+ vmax.s16 q1, q1, q8 -+ vmin.s16 q0, q0, q9 -+ vmin.s16 q1, q1, q9 -+ vst1.16 {d0}, [r0, :64], r2 -+ vst1.16 {d1}, [r0, :64], r2 -+ vst1.16 {d2}, [r0, :64], r2 -+ vst1.16 {d3}, [r0, :64], r2 -+ bx lr ++.macro regshuffle_q8 ++ vmov q0, q1 ++ vmov q1, q2 ++ vmov q2, q3 ++ vmov q3, q4 ++ vmov q4, q5 ++ vmov q5, q6 ++ vmov q6, q7 ++.endm + -+endfunc ++.macro vextin8 ++ pld [r2] ++ vld1.8 {q11}, [r2], r3 ++ vext.8 d16, d22, d23, #1 ++ vext.8 d17, d22, d23, #2 ++ vext.8 d18, d22, d23, #3 ++ vext.8 d19, d22, d23, #4 ++ vext.8 d20, d22, d23, #5 ++ vext.8 d21, d22, d23, #6 ++ vext.8 d22, d22, d23, #7 ++.endm + -+@ add_residual4x4( -+@ uint8_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc) [r2] ++.macro loadin8 ++ pld [r2] ++ vld1.8 {d16}, [r2], r3 ++ pld [r2] ++ vld1.8 {d17}, [r2], r3 ++ pld [r2] ++ vld1.8 {d18}, [r2], r3 ++ pld [r2] ++ vld1.8 {d19}, [r2], r3 ++ pld [r2] ++ vld1.8 {d20}, [r2], r3 ++ pld [r2] ++ vld1.8 {d21}, [r2], r3 ++ pld [r2] ++ vld1.8 {d22}, [r2], r3 ++ pld [r2] ++ vld1.8 {d23}, [r2], r3 ++.endm + -+function JOIN(ff_hevc_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1 -+ movw r3, #(1 << BIT_DEPTH) - 1 -+ vdup.i16 q9, r3 -+ vld1.16 {d0}, [r0, :64], r1 -+ vld1.16 {d1}, [r0, :64], r1 -+ vdup.16 q15, r2 -+ vld1.16 {d2}, [r0, :64], r1 -+ vld1.16 {d3}, [r0, :64], r1 -+ vmov.i64 q8, #0 -+ vdup.i16 q9, r3 -+ vqadd.s16 q0, q15 -+ vqadd.s16 q1, q15 -+ sub r0, r0, r1, lsl #2 -+ vmax.s16 q0, q0, q8 -+ vmax.s16 q1, q1, q8 -+ vmin.s16 q0, q0, q9 -+ vmin.s16 q1, q1, q9 -+ vst1.16 {d0}, [r0, :64], r1 -+ vst1.16 {d1}, [r0, :64], r1 -+ vst1.16 {d2}, [r0, :64], r1 -+ vst1.16 {d3}, [r0, :64], r1 -+ bx lr ++.macro qpel_filter_1_32b ++ vmov.i16 d16, #58 ++ vmov.i16 d17, #10 ++ vmull.s16 q9, d6, d16 // 58 * d0 ++ vmull.s16 q10, d7, d16 // 58 * d1 ++ vmov.i16 d16, #17 ++ vmull.s16 q11, d4, d17 // 10 * c0 ++ vmull.s16 q12, d5, d17 // 10 * c1 ++ vmov.i16 d17, #5 ++ vmull.s16 q13, d8, d16 // 17 * e0 ++ vmull.s16 q14, d9, d16 // 17 * e1 ++ vmull.s16 q15, d10, d17 // 5 * f0 ++ vmull.s16 q8, d11, d17 // 5 * f1 ++ vsub.s32 q9, q11 // 58 * d0 - 10 * c0 ++ vsub.s32 q10, q12 // 58 * d1 - 10 * c1 ++ vshll.s16 q11, d2, #2 // 4 * b0 ++ vshll.s16 q12, d3, #2 // 4 * b1 ++ vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 ++ vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 ++ vsubl.s16 q13, d12, d0 // g0 - a0 ++ vsubl.s16 q14, 
d13, d1 // g1 - a1 ++ vadd.s32 q9, q11 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 ++ vadd.s32 q10, q12 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 ++ vsub.s32 q13, q15 // g0 - a0 - 5 * f0 ++ vsub.s32 q14, q8 // g1 - a1 - 5 * f1 ++ vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0 ++ vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1 ++ vqshrn.s32 d16, q9, #6 ++ vqshrn.s32 d17, q10, #6 ++.endm + -+endfunc ++// input q0 - q7 ++// output q8 ++.macro qpel_filter_2_32b ++ vmov.i32 q8, #11 ++ vaddl.s16 q9, d6, d8 // d0 + e0 ++ vaddl.s16 q10, d7, d9 // d1 + e1 ++ vaddl.s16 q11, d4, d10 // c0 + f0 ++ vaddl.s16 q12, d5, d11 // c1 + f1 ++ vmul.s32 q11, q8 // 11 * (c0 + f0) ++ vmul.s32 q12, q8 // 11 * (c1 + f1) ++ vmov.i32 q8, #40 ++ vaddl.s16 q15, d2, d12 // b0 + g0 ++ vmul.s32 q9, q8 // 40 * (d0 + e0) ++ vmul.s32 q10, q8 // 40 * (d1 + e1) ++ vaddl.s16 q8, d3, d13 // b1 + g1 ++ vaddl.s16 q13, d0, d14 // a0 + h0 ++ vaddl.s16 q14, d1, d15 // a1 + h1 ++ vshl.s32 q15, #2 // 4*(b0+g0) ++ vshl.s32 q8, #2 // 4*(b1+g1) ++ vadd.s32 q11, q13 // 11 * (c0 + f0) + a0 + h0 ++ vadd.s32 q12, q14 // 11 * (c1 + f1) + a1 + h1 ++ vadd.s32 q9, q15 // 40 * (d0 + e0) + 4*(b0+g0) ++ vadd.s32 q10, q8 // 40 * (d1 + e1) + 4*(b1+g1) ++ vsub.s32 q9, q11 // 40 * (d0 + e0) + 4*(b0+g0) - (11 * (c0 + f0) + a0 + h0) ++ vsub.s32 q10, q12 // 40 * (d1 + e1) + 4*(b1+g1) - (11 * (c1 + f1) + a1 + h1) ++ vqshrn.s32 d16, q9, #6 ++ vqshrn.s32 d17, q10, #6 ++.endm + ++.macro qpel_filter_3_32b ++ vmov.i16 d16, #58 ++ vmov.i16 d17, #10 ++ vmull.s16 q9, d8, d16 // 58 * d0 ++ vmull.s16 q10, d9, d16 // 58 * d1 ++ vmov.i16 d16, #17 ++ vmull.s16 q11, d10, d17 // 10 * c0 ++ vmull.s16 q12, d11, d17 // 10 * c1 ++ vmov.i16 d17, #5 ++ vmull.s16 q13, d6, d16 // 17 * e0 ++ vmull.s16 q14, d7, d16 // 17 * e1 ++ vmull.s16 q15, d4, d17 // 5 * f0 ++ vmull.s16 q8, d5, d17 // 5 * f1 ++ vsub.s32 q9, q11 // 58 * d0 - 10 * c0 ++ vsub.s32 q10, q12 // 58 * d1 - 10 * c1 ++ vshll.s16 q11, d12, #2 // 4 * b0 ++ vshll.s16 q12, d13, #2 // 4 * b1 ++ vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 ++ vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 ++ vsubl.s16 q13, d2, d14 // g0 - a0 ++ vsubl.s16 q14, d3, d15 // g1 - a1 ++ vadd.s32 q9, q11 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 ++ vadd.s32 q10, q12 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 ++ vsub.s32 q13, q15 // g0 - a0 - 5 * f0 ++ vsub.s32 q14, q8 // g1 - a1 - 5 * f1 ++ vadd.s32 q9, q13 // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0 ++ vadd.s32 q10, q14 // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1 ++ vqshrn.s32 d16, q9, #6 ++ vqshrn.s32 d17, q10, #6 ++.endm + -+@ add_residual8x8( -+@ uint8_t *_dst, [r0] -+@ int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] ++.macro qpel_filter_1 out=q7 ++ vmov.u8 d24, #58 ++ vmov.u8 d25, #10 ++ vshll.u8 q13, d20, #4 // 16*e ++ vshll.u8 q14, d21, #2 // 4*f ++ vmull.u8 \out, d19, d24 // 58*d ++ vaddw.u8 q13, q13, d20 // 17*e ++ vmull.u8 q15, d18, d25 // 10*c ++ vaddw.u8 q14, q14, d21 // 5*f ++ vsubl.u8 q12, d22, d16 // g - a ++ vadd.u16 \out, q13 // 58d + 17e ++ vshll.u8 q13, d17, #2 // 4*b ++ vadd.u16 q15, q14 // 10*c + 5*f ++ vadd.s16 q13, q12 // - a + 4*b + g ++ vsub.s16 \out, q15 // -10*c + 58*d + 17*e -5*f ++ vadd.s16 \out, q13 // -a + 4*b -10*c + 58*d + 17*e -5*f ++.endm + -+function JOIN(ff_hevc_add_residual_8x8_neon_, BIT_DEPTH), export=1 -+ movw r3, #(1 << BIT_DEPTH) - 1 -+ vmov.i64 q8, #0 -+ vdup.i16 q9, r3 -+ mov r12, #2 -+1: -+ vldm r1!, {q10-q13} -+ vld1.16 {q0}, [r0, :128], r2 -+ subs r12, #1 -+ vld1.16 {q1}, 
[r0, :128], r2 -+ vqadd.s16 q0, q10 -+ vld1.16 {q2}, [r0, :128], r2 -+ vqadd.s16 q1, q11 -+ vld1.16 {q3}, [r0, :128], r2 -+ vqadd.s16 q2, q12 -+ vqadd.s16 q3, q13 -+ sub r0, r0, r2, lsl #2 -+ vmax.s16 q0, q0, q8 -+ vmax.s16 q1, q1, q8 -+ vmax.s16 q2, q2, q8 -+ vmax.s16 q3, q3, q8 -+ vmin.s16 q0, q0, q9 -+ vmin.s16 q1, q1, q9 -+ vst1.16 {q0}, [r0, :128], r2 -+ vmin.s16 q2, q2, q9 -+ vst1.16 {q1}, [r0, :128], r2 -+ vmin.s16 q3, q3, q9 -+ vst1.16 {q2}, [r0, :128], r2 -+ vst1.16 {q3}, [r0, :128], r2 -+ bne 1b -+ bx lr ++.macro qpel_filter_2 out=q7 ++ vmov.i16 q12, #10 ++ vmov.i16 q14, #11 ++ vaddl.u8 q13, d19, d20 // d + e ++ vaddl.u8 q15, d18, d21 // c + f ++ vmul.u16 q13, q12 // 10 * (d+e) ++ vmul.u16 q15, q14 // 11 * ( c + f) ++ vaddl.u8 \out, d17, d22 // b + g ++ vaddl.u8 q12, d16, d23 // a + h ++ vadd.u16 \out, q13 // b + 10 * (d + e) + g ++ vadd.s16 q12, q15 ++ vshl.u16 \out, #2 // 4 * (b + 10 * (d + e) + g) ++ vsub.s16 \out, q12 ++.endm + -+endfunc ++.macro qpel_filter_3 out=q7 ++ vmov.u8 d24, #58 ++ vmov.u8 d25, #10 ++ vshll.u8 q13, d19, #4 // 16*e ++ vshll.u8 q14, d18, #2 // 4*f ++ vmull.u8 \out, d20, d24 // 58*d ++ vaddw.u8 q13, q13, d19 // 17*e ++ vmull.u8 q15, d21, d25 // 10*c ++ vaddw.u8 q14, q14, d18 // 5*f ++ vsubl.u8 q12, d17, d23 // g - a ++ vadd.u16 \out, q13 // 58d + 17e ++ vshll.u8 q13, d22, #2 // 4*b ++ vadd.u16 q15, q14 // 10*c + 5*f ++ vadd.s16 q13, q12 // - a + 4*b + g ++ vsub.s16 \out, q15 // -10*c + 58*d + 17*e -5*f ++ vadd.s16 \out, q13 // -a + 4*b -10*c + 58*d + 17*e -5*f ++.endm + -+@ add_residual4x4_dc_c( -+@ uint8_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc_uv) [r2] ++.macro hevc_put_qpel_vX_neon_8 filter ++ push {r4, r5, r6, r7} ++ ldr r4, [sp, #16] // height ++ ldr r5, [sp, #20] // width ++ vpush {d8-d15} ++ sub r2, r2, r3, lsl #1 ++ sub r2, r3 ++ mov r12, r4 ++ mov r6, r0 ++ mov r7, r2 ++ lsl r1, #1 ++0: loadin8 ++ cmp r5, #4 ++ beq 4f ++8: subs r4, #1 ++ \filter ++ vst1.16 {q7}, [r0], r1 ++ regshuffle_d8 ++ vld1.8 {d23}, [r2], r3 ++ bne 8b ++ subs r5, #8 ++ beq 99f ++ mov r4, r12 ++ add r6, #16 ++ mov r0, r6 ++ add r7, #8 ++ mov r2, r7 ++ b 0b ++4: subs r4, #1 ++ \filter ++ vst1.16 d14, [r0], r1 ++ regshuffle_d8 ++ vld1.32 {d23[0]}, [r2], r3 ++ bne 4b ++99: vpop {d8-d15} ++ pop {r4, r5, r6, r7} ++ bx lr ++.endm + -+function JOIN(ff_hevc_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1 -+ mov r12, #1 -+ vdup.32 q15, r2 -+ b 9f ++.macro hevc_put_qpel_uw_vX_neon_8 filter ++ push {r4-r10} ++ ldr r5, [sp, #28] // width ++ ldr r4, [sp, #32] // height ++ ldr r8, [sp, #36] // src2 ++ ldr r9, [sp, #40] // src2stride ++ vpush {d8-d15} ++ sub r2, r2, r3, lsl #1 ++ sub r2, r3 ++ mov r12, r4 ++ mov r6, r0 ++ mov r7, r2 ++ cmp r8, #0 ++ bne .Lbi\@ ++0: loadin8 ++ cmp r5, #4 ++ beq 4f ++8: subs r4, #1 ++ \filter ++ vqrshrun.s16 d0, q7, #6 ++ vst1.8 d0, [r0], r1 ++ regshuffle_d8 ++ vld1.8 {d23}, [r2], r3 ++ bne 8b ++ subs r5, #8 ++ beq 99f ++ mov r4, r12 ++ add r6, #8 ++ mov r0, r6 ++ add r7, #8 ++ mov r2, r7 ++ b 0b ++4: subs r4, #1 ++ \filter ++ vqrshrun.s16 d0, q7, #6 ++ vst1.32 d0[0], [r0], r1 ++ regshuffle_d8 ++ vld1.32 {d23[0]}, [r2], r3 ++ bne 4b ++ b 99f ++.Lbi\@: lsl r9, #1 ++ mov r10, r8 ++0: loadin8 ++ cmp r5, #4 ++ beq 4f ++8: subs r4, #1 ++ \filter ++ vld1.16 {q0}, [r8], r9 ++ vqadd.s16 q0, q7 ++ vqrshrun.s16 d0, q0, #7 ++ vst1.8 d0, [r0], r1 ++ regshuffle_d8 ++ vld1.8 {d23}, [r2], r3 ++ bne 8b ++ subs r5, #8 ++ beq 99f ++ mov r4, r12 ++ add r6, #8 ++ mov r0, r6 ++ add r10, #16 ++ mov r8, r10 ++ add r7, #8 ++ mov r2, r7 ++ b 0b ++4: subs r4, #1 ++ 
\filter ++ vld1.16 d0, [r8], r9 ++ vqadd.s16 d0, d14 ++ vqrshrun.s16 d0, q0, #7 ++ vst1.32 d0[0], [r0], r1 ++ regshuffle_d8 ++ vld1.32 {d23[0]}, [r2], r3 ++ bne 4b ++99: vpop {d8-d15} ++ pop {r4-r10} ++ bx lr ++.endm ++ ++function ff_hevc_rpi_put_qpel_v1_neon_8, export=1 ++ hevc_put_qpel_vX_neon_8 qpel_filter_1 +endfunc + -+@ add_residual8x8_dc( -+@ uint8_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc) [r2] ++function ff_hevc_rpi_put_qpel_v2_neon_8, export=1 ++ hevc_put_qpel_vX_neon_8 qpel_filter_2 ++endfunc ++ ++function ff_hevc_rpi_put_qpel_v3_neon_8, export=1 ++ hevc_put_qpel_vX_neon_8 qpel_filter_3 ++endfunc + -+function JOIN(ff_hevc_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1 -+ mov r12, #2 -+ vdup.16 q15, r2 -+9: -+ movw r3, #(1 << BIT_DEPTH) - 1 -+ vmov.i64 q8, #0 -+ vdup.i16 q9, r3 -+1: -+ vld1.16 {q0}, [r0, :128], r1 -+ subs r12, #1 -+ vld1.16 {q1}, [r0, :128], r1 -+ vqadd.s16 q0, q15 -+ vld1.16 {q2}, [r0, :128], r1 -+ vqadd.s16 q1, q15 -+ vld1.16 {q3}, [r0, :128], r1 -+ vqadd.s16 q2, q15 -+ vqadd.s16 q3, q15 -+ sub r0, r0, r1, lsl #2 -+ vmax.s16 q0, q8 -+ vmax.s16 q1, q8 -+ vmax.s16 q2, q8 -+ vmax.s16 q3, q8 -+ vmin.s16 q0, q9 -+ vmin.s16 q1, q9 -+ vst1.16 {q0}, [r0, :128], r1 -+ vmin.s16 q2, q9 -+ vst1.16 {q1}, [r0, :128], r1 -+ vmin.s16 q3, q9 -+ vst1.16 {q2}, [r0, :128], r1 -+ vst1.16 {q3}, [r0, :128], r1 -+ bne 1b -+ bx lr + ++function ff_hevc_rpi_put_qpel_uw_v1_neon_8, export=1 ++ hevc_put_qpel_uw_vX_neon_8 qpel_filter_1 +endfunc + -+@ add_residual16x16( -+@ uint8_t *_dst, [r0] -+@ int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] ++function ff_hevc_rpi_put_qpel_uw_v2_neon_8, export=1 ++ hevc_put_qpel_uw_vX_neon_8 qpel_filter_2 ++endfunc + -+function JOIN(ff_hevc_add_residual_16x16_neon_, BIT_DEPTH), export=1 -+ movw r3, #(1 << BIT_DEPTH) - 1 -+ vmov.i64 q8, #0 -+ vdup.i16 q9, r3 -+ mov r12, #8 -+1: -+ vldm r1!, {q10-q13} -+ @ For RPI Sand we could guarantee :256 but not for general -+ @ non-RPI allocation. 
:128 is as good as we can claim -+ vld1.16 {q0, q1}, [r0, :128], r2 -+ subs r12, #1 -+ vld1.16 {q2, q3}, [r0, :128] -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q11 -+ vqadd.s16 q2, q12 -+ vqadd.s16 q3, q13 -+ sub r0, r2 -+ vmax.s16 q0, q0, q8 -+ vmax.s16 q1, q1, q8 -+ vmax.s16 q2, q2, q8 -+ vmax.s16 q3, q3, q8 -+ vmin.s16 q0, q0, q9 -+ vmin.s16 q1, q1, q9 -+ vmin.s16 q2, q2, q9 -+ vmin.s16 q3, q3, q9 -+ vst1.16 {q0, q1}, [r0, :128], r2 -+ vst1.16 {q2, q3}, [r0, :128], r2 -+ bne 1b -+ bx lr ++function ff_hevc_rpi_put_qpel_uw_v3_neon_8, export=1 ++ hevc_put_qpel_uw_vX_neon_8 qpel_filter_3 +endfunc + -+@ add_residual8x8_dc_c( -+@ uint8_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc_uv) [r2] ++.macro hevc_put_qpel_hX_neon_8 filter ++ push {r4, r5, r6, r7} ++ ldr r4, [sp, #16] // height ++ ldr r5, [sp, #20] // width + -+function JOIN(ff_hevc_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1 -+ mov r12, #4 -+ vdup.32 q15, r2 -+ b 9f ++ vpush {d8-d15} ++ sub r2, #4 ++ lsl r1, #1 ++ mov r12, r4 ++ mov r6, r0 ++ mov r7, r2 ++ cmp r5, #4 ++ beq 4f ++8: subs r4, #1 ++ vextin8 ++ \filter ++ vst1.16 {q7}, [r0], r1 ++ bne 8b ++ subs r5, #8 ++ beq 99f ++ mov r4, r12 ++ add r6, #16 ++ mov r0, r6 ++ add r7, #8 ++ mov r2, r7 ++ cmp r5, #4 ++ bne 8b ++4: subs r4, #1 ++ vextin8 ++ \filter ++ vst1.16 d14, [r0], r1 ++ bne 4b ++99: vpop {d8-d15} ++ pop {r4, r5, r6, r7} ++ bx lr ++.endm ++ ++.macro hevc_put_qpel_uw_hX_neon_8 filter ++ push {r4-r10} ++ ldr r5, [sp, #28] // width ++ ldr r4, [sp, #32] // height ++ ldr r8, [sp, #36] // src2 ++ ldr r9, [sp, #40] // src2stride ++ vpush {d8-d15} ++ sub r2, #4 ++ mov r12, r4 ++ mov r6, r0 ++ mov r7, r2 ++ cmp r8, #0 ++ bne .Lbi\@ ++ cmp r5, #4 ++ beq 4f ++8: subs r4, #1 ++ vextin8 ++ \filter ++ vqrshrun.s16 d0, q7, #6 ++ vst1.8 d0, [r0], r1 ++ bne 8b ++ subs r5, #8 ++ beq 99f ++ mov r4, r12 ++ add r6, #8 ++ mov r0, r6 ++ add r7, #8 ++ mov r2, r7 ++ cmp r5, #4 ++ bne 8b ++4: subs r4, #1 ++ vextin8 ++ \filter ++ vqrshrun.s16 d0, q7, #6 ++ vst1.32 d0[0], [r0], r1 ++ bne 4b ++ b 99f ++.Lbi\@: ++ lsl r9, #1 ++ cmp r5, #4 ++ beq 4f ++ mov r10, r8 ++8: subs r4, #1 ++ vextin8 ++ \filter ++ vld1.16 {q0}, [r8], r9 ++ vqadd.s16 q0, q7 ++ vqrshrun.s16 d0, q0, #7 ++ vst1.8 d0, [r0], r1 ++ bne 8b ++ subs r5, #8 ++ beq 99f ++ mov r4, r12 ++ add r6, #8 ++ add r10, #16 ++ mov r8, r10 ++ mov r0, r6 ++ add r7, #8 ++ mov r2, r7 ++ cmp r5, #4 ++ bne 8b ++4: subs r4, #1 ++ vextin8 ++ \filter ++ vld1.16 d0, [r8], r9 ++ vqadd.s16 d0, d14 ++ vqrshrun.s16 d0, q0, #7 ++ vst1.32 d0[0], [r0], r1 ++ bne 4b ++99: vpop {d8-d15} ++ pop {r4-r10} ++ bx lr ++.endm ++ ++function ff_hevc_rpi_put_qpel_h1_neon_8, export=1 ++ hevc_put_qpel_hX_neon_8 qpel_filter_1 +endfunc + -+@ add_residual16x16_dc( -+@ uint8_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc) [r2] ++function ff_hevc_rpi_put_qpel_h2_neon_8, export=1 ++ hevc_put_qpel_hX_neon_8 qpel_filter_2 ++endfunc ++ ++function ff_hevc_rpi_put_qpel_h3_neon_8, export=1 ++ hevc_put_qpel_hX_neon_8 qpel_filter_3 ++endfunc + -+function JOIN(ff_hevc_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1 -+ vdup.i16 q15, r2 -+ mov r12, #8 -+9: -+ movw r3, #(1 << BIT_DEPTH) - 1 -+ vmov.i64 q8, #0 -+ vdup.i16 q9, r3 -+1: -+ @ For RPI Sand we could guarantee :256 but not for general -+ @ non-RPI allocation. 
:128 is as good as we can claim -+ vld1.16 {q0, q1}, [r0, :128], r1 -+ subs r12, #1 -+ vld1.16 {q2, q3}, [r0, :128] -+ vqadd.s16 q0, q15 -+ vqadd.s16 q1, q15 -+ vqadd.s16 q2, q15 -+ vqadd.s16 q3, q15 -+ sub r0, r1 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst1.16 {q0, q1}, [r0, :128], r1 -+ vst1.16 {q2, q3}, [r0, :128], r1 -+ bne 1b -+ bx lr + ++function ff_hevc_rpi_put_qpel_uw_h1_neon_8, export=1 ++ hevc_put_qpel_uw_hX_neon_8 qpel_filter_1 +endfunc + ++function ff_hevc_rpi_put_qpel_uw_h2_neon_8, export=1 ++ hevc_put_qpel_uw_hX_neon_8 qpel_filter_2 ++endfunc + -+@ add_residual32x32( -+@ uint8_t *_dst, [r0] -+@ int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] ++function ff_hevc_rpi_put_qpel_uw_h3_neon_8, export=1 ++ hevc_put_qpel_uw_hX_neon_8 qpel_filter_3 ++endfunc + -+function JOIN(ff_hevc_add_residual_32x32_neon_, BIT_DEPTH), export=1 -+ movw r3, #(1 << BIT_DEPTH) - 1 -+ vmov.i64 q8, #0 -+ vdup.i16 q9, r3 -+ mov r12, #32 -+1: -+ vldm r1!, {q10-q13} -+ vldm r0, {q0-q3} -+ subs r12, #1 -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q11 -+ vqadd.s16 q2, q12 -+ vqadd.s16 q3, q13 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vstm r0, {q0-q3} -+ add r0, r2 -+ bne 1b -+ bx lr ++.macro hevc_put_qpel_hXvY_neon_8 filterh filterv ++ push {r4, r5, r6, r7} ++ ldr r4, [sp, #16] // height ++ ldr r5, [sp, #20] // width ++ ++ vpush {d8-d15} ++ sub r2, #4 ++ sub r2, r2, r3, lsl #1 ++ sub r2, r3 // extra_before 3 ++ lsl r1, #1 ++ mov r12, r4 ++ mov r6, r0 ++ mov r7, r2 ++0: vextin8 ++ \filterh q0 ++ vextin8 ++ \filterh q1 ++ vextin8 ++ \filterh q2 ++ vextin8 ++ \filterh q3 ++ vextin8 ++ \filterh q4 ++ vextin8 ++ \filterh q5 ++ vextin8 ++ \filterh q6 ++ vextin8 ++ \filterh q7 ++ cmp r5, #4 ++ beq 4f ++8: subs r4, #1 ++ \filterv ++ vst1.16 {q8}, [r0], r1 ++ regshuffle_q8 ++ vextin8 ++ \filterh q7 ++ bne 8b ++ subs r5, #8 ++ beq 99f ++ mov r4, r12 ++ add r6, #16 ++ mov r0, r6 ++ add r7, #8 ++ mov r2, r7 ++ b 0b ++4: subs r4, #1 ++ \filterv ++ vst1.16 d16, [r0], r1 ++ regshuffle_q8 ++ vextin8 ++ \filterh q7 ++ bne 4b ++99: vpop {d8-d15} ++ pop {r4, r5, r6, r7} ++ bx lr ++.endm + ++.macro hevc_put_qpel_uw_hXvY_neon_8 filterh filterv ++ push {r4-r10} ++ ldr r5, [sp, #28] // width ++ ldr r4, [sp, #32] // height ++ ldr r8, [sp, #36] // src2 ++ ldr r9, [sp, #40] // src2stride ++ vpush {d8-d15} ++ sub r2, #4 ++ sub r2, r2, r3, lsl #1 ++ sub r2, r3 // extra_before 3 ++ mov r12, r4 ++ mov r6, r0 ++ mov r7, r2 ++ cmp r8, #0 ++ bne .Lbi\@ ++0: vextin8 ++ \filterh q0 ++ vextin8 ++ \filterh q1 ++ vextin8 ++ \filterh q2 ++ vextin8 ++ \filterh q3 ++ vextin8 ++ \filterh q4 ++ vextin8 ++ \filterh q5 ++ vextin8 ++ \filterh q6 ++ vextin8 ++ \filterh q7 ++ cmp r5, #4 ++ beq 4f ++8: subs r4, #1 ++ \filterv ++ vqrshrun.s16 d0, q8, #6 ++ vst1.8 d0, [r0], r1 ++ regshuffle_q8 ++ vextin8 ++ \filterh q7 ++ bne 8b ++ subs r5, #8 ++ beq 99f ++ mov r4, r12 ++ add r6, #8 ++ mov r0, r6 ++ add r7, #8 ++ mov r2, r7 ++ b 0b ++4: subs r4, #1 ++ \filterv ++ vqrshrun.s16 d0, q8, #6 ++ vst1.32 d0[0], [r0], r1 ++ regshuffle_q8 ++ vextin8 ++ \filterh q7 ++ bne 4b ++ b 99f ++.Lbi\@: lsl r9, #1 ++ mov r10, r8 ++0: vextin8 ++ \filterh q0 ++ vextin8 ++ \filterh q1 ++ vextin8 ++ \filterh q2 ++ vextin8 ++ \filterh q3 ++ vextin8 ++ \filterh q4 ++ vextin8 ++ \filterh q5 ++ vextin8 ++ \filterh q6 ++ vextin8 ++ \filterh q7 ++ cmp r5, #4 ++ beq 4f ++8: subs r4, #1 ++ \filterv ++ vld1.16 {q0}, [r8], r9 ++ vqadd.s16 q0, q8 ++ vqrshrun.s16 d0, q0, #7 ++ vst1.8 d0, [r0], r1 ++ regshuffle_q8 ++ vextin8 ++ \filterh q7 ++ bne 8b ++ subs r5, #8 ++ beq 99f ++ mov r4, r12 ++ add r6, #8 ++ mov 
r0, r6 ++ add r10, #16 ++ mov r8, r10 ++ add r7, #8 ++ mov r2, r7 ++ b 0b ++4: subs r4, #1 ++ \filterv ++ vld1.16 d0, [r8], r9 ++ vqadd.s16 d0, d16 ++ vqrshrun.s16 d0, q0, #7 ++ vst1.32 d0[0], [r0], r1 ++ regshuffle_q8 ++ vextin8 ++ \filterh q7 ++ bne 4b ++99: vpop {d8-d15} ++ pop {r4-r10} ++ bx lr ++.endm ++ ++ ++function ff_hevc_rpi_put_qpel_h1v1_neon_8, export=1 ++ hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b +endfunc + -+@ add_residual8x8_dc_c( -+@ uint8_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc_uv) [r2] ++function ff_hevc_rpi_put_qpel_h2v1_neon_8, export=1 ++ hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b ++endfunc + -+function JOIN(ff_hevc_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1 -+ mov r12, #16 -+ vdup.32 q15, r2 -+ b 9f ++function ff_hevc_rpi_put_qpel_h3v1_neon_8, export=1 ++ hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b +endfunc + -+@ add_residual32x32_dc( -+@ uint8_t *_dst, [r0] -+@ ptrdiff_t stride, [r1] -+@ int dc) [r2] ++function ff_hevc_rpi_put_qpel_h1v2_neon_8, export=1 ++ hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b ++endfunc + -+function JOIN(ff_hevc_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1 -+ vdup.i16 q15, r2 -+ mov r12, #32 -+9: -+ movw r3, #(1 << BIT_DEPTH) - 1 -+ vmov.i64 q8, #0 -+ vdup.i16 q9, r3 -+1: -+ vldm r0, {q0-q3} -+ subs r12, #1 -+ vqadd.s16 q0, q15 -+ vqadd.s16 q1, q15 -+ vqadd.s16 q2, q15 -+ vqadd.s16 q3, q15 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vstm r0, {q0-q3} -+ add r0, r1 -+ bne 1b -+ bx lr ++function ff_hevc_rpi_put_qpel_h2v2_neon_8, export=1 ++ hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b ++endfunc + ++function ff_hevc_rpi_put_qpel_h3v2_neon_8, export=1 ++ hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b +endfunc + -+@ ============================================================================ -+@ U add ++function ff_hevc_rpi_put_qpel_h1v3_neon_8, export=1 ++ hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b ++endfunc + -+@ add_residual4x4_u( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] ++function ff_hevc_rpi_put_qpel_h2v3_neon_8, export=1 ++ hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b ++endfunc + -+function JOIN(ff_hevc_add_residual_4x4_u_neon_, BIT_DEPTH), export=1 -+ vld1.16 {q10, q11}, [r1, :256] -+ vdup.16 q15, r3 -+ movw r3, #(1 << BIT_DEPTH) - 1 -+ vmov.i64 q8, #0 -+ vdup.i16 q9, r3 ++function ff_hevc_rpi_put_qpel_h3v3_neon_8, export=1 ++ hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b ++endfunc + -+ vld2.16 {d0, d2}, [r0, :128], r2 -+ vld2.16 {d1, d3}, [r0, :128], r2 -+ vld2.16 {d4, d6}, [r0, :128], r2 -+ vld2.16 {d5, d7}, [r0, :128], r2 + -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q15 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q3, q15 -+ sub r0, r0, r2, lsl #2 -+ clip16_4 q0, q1, q2, q3, q8, q9 ++function ff_hevc_rpi_put_qpel_uw_h1v1_neon_8, export=1 ++ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b ++endfunc + -+ vst2.16 {d0, d2}, [r0, :128], r2 -+ vst2.16 {d1, d3}, [r0, :128], r2 -+ vst2.16 {d4, d6}, [r0, :128], r2 -+ vst2.16 {d5, d7}, [r0, :128] -+ bx lr ++function ff_hevc_rpi_put_qpel_uw_h2v1_neon_8, export=1 ++ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b +endfunc + -+@ add_residual8x8_u( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] ++function ff_hevc_rpi_put_qpel_uw_h3v1_neon_8, export=1 ++ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b ++endfunc + -+function 
JOIN(ff_hevc_add_residual_8x8_u_neon_, BIT_DEPTH), export=1 -+ vdup.16 q15, r3 -+ movw r3, #(1 << BIT_DEPTH) - 1 -+ vmov.i64 q8, #0 -+ mov r12, #4 -+ vdup.i16 q9, r3 -+1: -+ vld2.16 {q0, q1}, [r0, :256], r2 -+ vld2.16 {q2, q3}, [r0, :256] -+ vld1.16 {q10, q11}, [r1, :256]! -+ subs r12, #1 -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q15 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q3, q15 -+ sub r0, r2 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst2.16 {q0, q1}, [r0, :256], r2 -+ vst2.16 {q2, q3}, [r0, :256], r2 -+ bne 1b -+ bx lr ++function ff_hevc_rpi_put_qpel_uw_h1v2_neon_8, export=1 ++ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b +endfunc + -+@ add_residual16x16_u( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] ++function ff_hevc_rpi_put_qpel_uw_h2v2_neon_8, export=1 ++ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b ++endfunc + -+function JOIN(ff_hevc_add_residual_16x16_u_neon_, BIT_DEPTH), export=1 -+ vdup.16 q15, r3 -+ movw r3, #(1 << BIT_DEPTH) - 1 -+ vmov.i64 q8, #0 -+ mov r12, #16 -+ vdup.i16 q9, r3 -+ sub r2, #32 -+1: -+ vld2.16 {q0, q1}, [r0, :256]! -+ vld2.16 {q2, q3}, [r0, :256] -+ vld1.16 {q10, q11}, [r1, :256]! -+ subs r12, #1 -+ vqadd.s16 q0, q10 -+ vqadd.s16 q1, q15 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q3, q15 -+ sub r0, #32 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst2.16 {q0, q1}, [r0, :256]! -+ vst2.16 {q2, q3}, [r0, :256], r2 -+ bne 1b -+ bx lr ++function ff_hevc_rpi_put_qpel_uw_h3v2_neon_8, export=1 ++ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b +endfunc + -+@ ============================================================================ -+@ V add ++function ff_hevc_rpi_put_qpel_uw_h1v3_neon_8, export=1 ++ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b ++endfunc + -+@ add_residual4x4_v( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] ++function ff_hevc_rpi_put_qpel_uw_h2v3_neon_8, export=1 ++ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b ++endfunc + -+function JOIN(ff_hevc_add_residual_4x4_v_neon_, BIT_DEPTH), export=1 -+ vld1.16 {q10, q11}, [r1, :256] -+ vdup.16 q15, r3 -+ movw r3, #(1 << BIT_DEPTH) - 1 -+ vmov.i64 q8, #0 -+ vdup.i16 q9, r3 ++function ff_hevc_rpi_put_qpel_uw_h3v3_neon_8, export=1 ++ hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b ++endfunc + -+ vld2.16 {d0, d2}, [r0, :128], r2 -+ vld2.16 {d1, d3}, [r0, :128], r2 -+ vld2.16 {d4, d6}, [r0, :128], r2 -+ vld2.16 {d5, d7}, [r0, :128], r2 ++.macro init_put_pixels ++ pld [r1] ++ pld [r1, r2] ++ mov r12, MAX_PB_SIZE ++ lsl r12, #1 ++.endm + -+ vqadd.s16 q0, q15 -+ vqadd.s16 q1, q10 -+ vqadd.s16 q2, q15 -+ vqadd.s16 q3, q11 -+ sub r0, r0, r2, lsl #2 -+ clip16_4 q0, q1, q2, q3, q8, q9 ++function ff_hevc_rpi_put_pixels_w2_neon_8, export=1 ++ init_put_pixels ++ vmov.u8 d5, #255 ++ vshr.u64 d5, #32 ++0: subs r3, #1 ++ vld1.32 {d0[0]}, [r1], r2 ++ pld [r1] ++ vld1.32 d6, [r0] ++ vshll.u8 q0, d0, #6 ++ vbit d6, d0, d5 ++ vst1.32 d6, [r0], r12 ++ bne 0b ++ bx lr ++endfunc + -+ vst2.16 {d0, d2}, [r0, :128], r2 -+ vst2.16 {d1, d3}, [r0, :128], r2 -+ vst2.16 {d4, d6}, [r0, :128], r2 -+ vst2.16 {d5, d7}, [r0, :128] -+ bx lr ++function ff_hevc_rpi_put_pixels_w4_neon_8, export=1 ++ init_put_pixels ++0: subs r3, #2 ++ vld1.32 {d0[0]}, [r1], r2 ++ vld1.32 {d0[1]}, [r1], r2 ++ pld [r1] ++ pld [r1, r2] ++ vshll.u8 q0, d0, #6 ++ vst1.64 {d0}, [r0], r12 ++ vst1.64 {d1}, [r0], r12 ++ bne 0b ++ bx lr +endfunc + -+@ add_residual8x8_v( -+@ uint8_t *_dst, [r0] -+@ const int16_t 
*res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] ++function ff_hevc_rpi_put_pixels_w6_neon_8, export=1 ++ init_put_pixels ++ vmov.u8 q10, #255 ++ vshr.u64 d21, #32 ++0: subs r3, #1 ++ vld1.16 {d0}, [r1], r2 ++ pld [r1] ++ vshll.u8 q0, d0, #6 ++ vld1.8 {q12}, [r0] ++ vbit q12, q0, q10 ++ vst1.8 {q12}, [r0], r12 ++ bne 0b ++ bx lr ++endfunc + -+function JOIN(ff_hevc_add_residual_8x8_v_neon_, BIT_DEPTH), export=1 -+ vdup.16 q15, r3 -+ movw r3, #(1 << BIT_DEPTH) - 1 -+ vmov.i64 q8, #0 -+ mov r12, #4 -+ vdup.i16 q9, r3 -+1: -+ vld2.16 {q0, q1}, [r0, :256], r2 -+ vld2.16 {q2, q3}, [r0, :256] -+ vld1.16 {q10, q11}, [r1, :256]! -+ subs r12, #1 -+ vqadd.s16 q0, q15 -+ vqadd.s16 q1, q10 -+ vqadd.s16 q2, q15 -+ vqadd.s16 q3, q11 -+ sub r0, r2 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst2.16 {q0, q1}, [r0, :256], r2 -+ vst2.16 {q2, q3}, [r0, :256], r2 -+ bne 1b -+ bx lr ++function ff_hevc_rpi_put_pixels_w8_neon_8, export=1 ++ init_put_pixels ++0: subs r3, #2 ++ vld1.8 {d0}, [r1], r2 ++ vld1.8 {d2}, [r1], r2 ++ pld [r1] ++ pld [r1, r2] ++ vshll.u8 q0, d0, #6 ++ vshll.u8 q1, d2, #6 ++ vst1.16 {q0}, [r0], r12 ++ vst1.16 {q1}, [r0], r12 ++ bne 0b ++ bx lr +endfunc + -+@ add_residual16x16_v( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride, [r2] -+@ int dc) [r3] ++function ff_hevc_rpi_put_pixels_w12_neon_8, export=1 ++ init_put_pixels ++0: subs r3, #2 ++ vld1.64 {d0}, [r1] ++ add r1, #8 ++ vld1.32 {d1[0]}, [r1], r2 ++ sub r1, #8 ++ vld1.64 {d2}, [r1] ++ add r1, #8 ++ vld1.32 {d1[1]}, [r1], r2 ++ sub r1, #8 ++ pld [r1] ++ pld [r1, r2] ++ vshll.u8 q8, d0, #6 ++ vshll.u8 q9, d1, #6 ++ vshll.u8 q10, d2, #6 ++ vmov d22, d19 ++ vst1.64 {d16, d17, d18}, [r0], r12 ++ vst1.64 {d20, d21, d22}, [r0], r12 ++ bne 0b ++ bx lr ++endfunc + -+function JOIN(ff_hevc_add_residual_16x16_v_neon_, BIT_DEPTH), export=1 -+ vdup.16 q15, r3 -+ movw r3, #(1 << BIT_DEPTH) - 1 -+ vmov.i64 q8, #0 -+ mov r12, #16 -+ vdup.i16 q9, r3 -+ sub r2, #32 -+1: -+ vld2.16 {q0, q1}, [r0, :256]! -+ vld2.16 {q2, q3}, [r0, :256] -+ vld1.16 {q10, q11}, [r1, :256]! -+ subs r12, #1 -+ vqadd.s16 q0, q15 -+ vqadd.s16 q1, q10 -+ vqadd.s16 q2, q15 -+ vqadd.s16 q3, q11 -+ sub r0, #32 -+ clip16_4 q0, q1, q2, q3, q8, q9 -+ vst2.16 {q0, q1}, [r0, :256]! 
-+ vst2.16 {q2, q3}, [r0, :256], r2 -+ bne 1b -+ bx lr ++function ff_hevc_rpi_put_pixels_w16_neon_8, export=1 ++ init_put_pixels ++0: subs r3, #2 ++ vld1.8 {q0}, [r1], r2 ++ vld1.8 {q1}, [r1], r2 ++ pld [r1] ++ pld [r1, r2] ++ vshll.u8 q8, d0, #6 ++ vshll.u8 q9, d1, #6 ++ vshll.u8 q10, d2, #6 ++ vshll.u8 q11, d3, #6 ++ vst1.8 {q8, q9}, [r0], r12 ++ vst1.8 {q10, q11}, [r0], r12 ++ bne 0b ++ bx lr +endfunc + -+@ ============================================================================ -+@ U & V add ++function ff_hevc_rpi_put_pixels_w24_neon_8, export=1 ++ init_put_pixels ++0: subs r3, #1 ++ vld1.8 {d0, d1, d2}, [r1], r2 ++ pld [r1] ++ vshll.u8 q10, d0, #6 ++ vshll.u8 q11, d1, #6 ++ vshll.u8 q12, d2, #6 ++ vstm r0, {q10, q11, q12} ++ add r0, r12 ++ bne 0b ++ bx lr ++endfunc + -+@ add_residual4x4_c( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] ++function ff_hevc_rpi_put_pixels_w32_neon_8, export=1 ++ init_put_pixels ++0: subs r3, #1 ++ vld1.8 {q0, q1}, [r1], r2 ++ pld [r1] ++ vshll.u8 q8, d0, #6 ++ vshll.u8 q9, d1, #6 ++ vshll.u8 q10, d2, #6 ++ vshll.u8 q11, d3, #6 ++ vstm r0, {q8, q9, q10, q11} ++ add r0, r12 ++ bne 0b ++ bx lr ++endfunc + -+function JOIN(ff_hevc_add_residual_4x4_c_neon_, BIT_DEPTH), export=1 -+ vldm r1, {q10-q13} ++function ff_hevc_rpi_put_pixels_w48_neon_8, export=1 ++ init_put_pixels ++0: subs r3, #1 ++ vld1.8 {q0, q1}, [r1] ++ add r1, #32 ++ vld1.8 {q2}, [r1], r2 ++ sub r1, #32 ++ pld [r1] ++ vshll.u8 q8, d0, #6 ++ vshll.u8 q9, d1, #6 ++ vshll.u8 q10, d2, #6 ++ vshll.u8 q11, d3, #6 ++ vshll.u8 q12, d4, #6 ++ vshll.u8 q13, d5, #6 ++ vstm r0, {q8, q9, q10, q11, q12, q13} ++ add r0, r12 ++ bne 0b ++ bx lr ++endfunc ++ ++function ff_hevc_rpi_put_pixels_w64_neon_8, export=1 ++ init_put_pixels ++0: subs r3, #1 ++ vld1.8 {q0, q1}, [r1] ++ add r1, #32 ++ vld1.8 {q2, q3}, [r1], r2 ++ sub r1, #32 ++ pld [r1] ++ vshll.u8 q8, d0, #6 ++ vshll.u8 q9, d1, #6 ++ vshll.u8 q10, d2, #6 ++ vshll.u8 q11, d3, #6 ++ vshll.u8 q12, d4, #6 ++ vshll.u8 q13, d5, #6 ++ vshll.u8 q14, d6, #6 ++ vshll.u8 q15, d7, #6 ++ vstm r0, {q8, q9, q10, q11, q12, q13, q14, q15} ++ add r0, r12 ++ bne 0b ++ bx lr ++endfunc ++ ++function ff_hevc_rpi_put_qpel_uw_pixels_neon_8, export=1 ++ push {r4-r9} ++ ldr r5, [sp, #24] // width ++ ldr r4, [sp, #28] // height ++ ldr r8, [sp, #32] // src2 ++ ldr r9, [sp, #36] // src2stride ++ vpush {d8-d15} ++ cmp r8, #0 ++ bne 2f ++1: subs r4, #1 ++ vld1.8 {d0}, [r2], r3 ++ vst1.8 d0, [r0], r1 ++ bne 1b ++ vpop {d8-d15} ++ pop {r4-r9} ++ bx lr ++2: subs r4, #1 ++ vld1.8 {d0}, [r2], r3 ++ vld1.16 {q1}, [r8], r9 ++ vshll.u8 q0, d0, #6 ++ vqadd.s16 q0, q1 ++ vqrshrun.s16 d0, q0, #7 ++ vst1.8 d0, [r0], r1 ++ bne 2b ++ vpop {d8-d15} ++ pop {r4-r9} ++ bx lr ++endfunc ++ ++.macro put_qpel_uw_pixels width, regs, regs2, regs3, regs4 ++function ff_hevc_rpi_put_qpel_uw_pixels_w\width\()_neon_8, export=1 ++ ldr r12, [sp] // height ++1: subs r12, #4 ++ vld1.32 {\regs} , [r2], r3 ++ vld1.32 {\regs2} , [r2], r3 ++ vld1.32 {\regs3} , [r2], r3 ++ vld1.32 {\regs4} , [r2], r3 ++ vst1.32 {\regs} , [r0], r1 ++ vst1.32 {\regs2} , [r0], r1 ++ vst1.32 {\regs3} , [r0], r1 ++ vst1.32 {\regs4} , [r0], r1 ++ bne 1b ++ bx lr ++endfunc ++.endm ++ ++.macro put_qpel_uw_pixels_m width, regs, regs2, regs3, regs4 ++function ff_hevc_rpi_put_qpel_uw_pixels_w\width\()_neon_8, export=1 ++ push {r4-r5} ++ ldr r12, [sp, #8] // height ++1: subs r12, #2 ++ mov r4, r2 ++ vld1.32 {\regs} , [r2]! ++ vld1.32 {\regs2} , [r2] ++ add r2, r4, r3 ++ mov r4, r2 ++ vld1.32 {\regs3} , [r2]! 
++ vld1.32 {\regs4} , [r2] ++ add r2, r4, r3 ++ mov r5, r0 ++ vst1.32 {\regs} , [r0]! ++ vst1.32 {\regs2} , [r0] ++ add r0, r5, r1 ++ mov r5, r0 ++ vst1.32 {\regs3} , [r0]! ++ vst1.32 {\regs4} , [r0] ++ add r0, r5, r1 ++ bne 1b ++ pop {r4-r5} ++ bx lr ++endfunc ++.endm ++ ++put_qpel_uw_pixels 4, d0[0], d0[1], d1[0], d1[1] ++put_qpel_uw_pixels 8, d0, d1, d2, d3 ++put_qpel_uw_pixels_m 12, d0, d1[0], d2, d3[0] ++put_qpel_uw_pixels 16, q0, q1, q2, q3 ++put_qpel_uw_pixels 24, d0-d2, d3-d5, d16-d18, d19-d21 ++put_qpel_uw_pixels 32, q0-q1, q2-q3, q8-q9, q10-q11 ++put_qpel_uw_pixels_m 48, q0-q1, q2, q8-q9, q10 ++put_qpel_uw_pixels_m 64, q0-q1, q2-q3, q8-q9, q10-q11 +diff --git a/libavcodec/arm/rpi_hevcdsp_res16_neon.S b/libavcodec/arm/rpi_hevcdsp_res16_neon.S +new file mode 100644 +index 0000000000..7dfcc2751a +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_res16_neon.S +@@ -0,0 +1,610 @@ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++#define BIT_DEPTH 10 ++ ++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX ++ vmax.s16 \Q0, \Q_MIN ++ vmax.s16 \Q1, \Q_MIN ++ vmax.s16 \Q2, \Q_MIN ++ vmax.s16 \Q3, \Q_MIN ++ vmin.s16 \Q0, \Q_MAX ++ vmin.s16 \Q1, \Q_MAX ++ vmin.s16 \Q2, \Q_MAX ++ vmin.s16 \Q3, \Q_MAX ++.endm ++ ++@ add_residual4x4( ++@ uint8_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_4x4_neon_, BIT_DEPTH), export=1 ++ vld1.16 {q10, q11}, [r1] + movw r3, #(1 << BIT_DEPTH) - 1 ++ vld1.16 {d0}, [r0, :64], r2 ++ vld1.16 {d1}, [r0, :64], r2 ++ vld1.16 {d2}, [r0, :64], r2 ++ vld1.16 {d3}, [r0, :64], r2 + vmov.i64 q8, #0 + vdup.i16 q9, r3 -+ -+ vld2.16 {d0, d2}, [r0, :128], r2 -+ vld2.16 {d1, d3}, [r0, :128], r2 -+ vld2.16 {d4, d6}, [r0, :128], r2 -+ vld2.16 {d5, d7}, [r0, :128], r2 -+ + vqadd.s16 q0, q10 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q1, q12 -+ vqadd.s16 q3, q13 ++ vqadd.s16 q1, q11 + sub r0, r0, r2, lsl #2 + vmax.s16 q0, q0, q8 + vmax.s16 q1, q1, q8 -+ vmax.s16 q2, q2, q8 -+ vmax.s16 q3, q3, q8 + vmin.s16 q0, q0, q9 + vmin.s16 q1, q1, q9 -+ vmin.s16 q2, q2, q9 -+ vmin.s16 q3, q3, q9 -+ -+ vst2.16 {d0, d2}, [r0, :128], r2 -+ vst2.16 {d1, d3}, [r0, :128], r2 -+ vst2.16 {d4, d6}, [r0, :128], r2 -+ vst2.16 {d5, d7}, [r0, :128] ++ vst1.16 {d0}, [r0, :64], r2 ++ vst1.16 {d1}, [r0, :64], r2 ++ vst1.16 {d2}, [r0, :64], r2 ++ vst1.16 {d3}, [r0, :64], r2 + bx lr ++ +endfunc + -+@ add_residual8x8_c( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] ++@ add_residual4x4( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] + -+function JOIN(ff_hevc_add_residual_8x8_c_neon_, BIT_DEPTH), export=1 ++function JOIN(ff_hevc_rpi_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1 + movw r3, #(1 << BIT_DEPTH) - 1 ++ vdup.i16 q9, r3 ++ vld1.16 {d0}, [r0, :64], r1 ++ vld1.16 {d1}, [r0, :64], r1 ++ vdup.16 q15, r2 ++ vld1.16 {d2}, [r0, :64], r1 ++ vld1.16 {d3}, [r0, :64], r1 + vmov.i64 q8, #0 -+ mov r12, #4 + vdup.i16 q9, r3 -+ add r3, r1, #(8*8*2) @ Offset to V -+1: -+ vld2.16 {q0, q1}, [r0, :256], r2 -+ vld2.16 {q2, q3}, [r0, :256] -+ vld1.16 {q10, q11}, [r1, :256]! -+ vld1.16 {q12, q13}, [r3, :256]! 
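++        @ tail of the same row, loaded without writeback: r4 still holds the
++        @ row start, so the stride add below lands on the next row exactly once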
-+ subs r12, #1 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q15 ++ sub r0, r0, r1, lsl #2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vst1.16 {d0}, [r0, :64], r1 ++ vst1.16 {d1}, [r0, :64], r1 ++ vst1.16 {d2}, [r0, :64], r1 ++ vst1.16 {d3}, [r0, :64], r1 ++ bx lr ++ ++endfunc ++ ++ ++@ add_residual8x8( ++@ uint8_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_8x8_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ mov r12, #2 ++1: ++ vldm r1!, {q10-q13} ++ vld1.16 {q0}, [r0, :128], r2 ++ subs r12, #1 ++ vld1.16 {q1}, [r0, :128], r2 + vqadd.s16 q0, q10 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q1, q12 ++ vld1.16 {q2}, [r0, :128], r2 ++ vqadd.s16 q1, q11 ++ vld1.16 {q3}, [r0, :128], r2 ++ vqadd.s16 q2, q12 + vqadd.s16 q3, q13 -+ sub r0, r2 ++ sub r0, r0, r2, lsl #2 + vmax.s16 q0, q0, q8 + vmax.s16 q1, q1, q8 + vmax.s16 q2, q2, q8 + vmax.s16 q3, q3, q8 + vmin.s16 q0, q0, q9 + vmin.s16 q1, q1, q9 ++ vst1.16 {q0}, [r0, :128], r2 + vmin.s16 q2, q2, q9 ++ vst1.16 {q1}, [r0, :128], r2 + vmin.s16 q3, q3, q9 -+ vst2.16 {q0, q1}, [r0, :256], r2 -+ vst2.16 {q2, q3}, [r0, :256], r2 ++ vst1.16 {q2}, [r0, :128], r2 ++ vst1.16 {q3}, [r0, :128], r2 + bne 1b + bx lr ++ +endfunc + -+@ add_residual16x16_c( -+@ uint8_t *_dst, [r0] -+@ const int16_t *res, [r1] -+@ ptrdiff_t stride) [r2] ++@ add_residual4x4_dc_c( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc_uv) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1 ++ mov r12, #1 ++ vdup.32 q15, r2 ++ b 9f ++endfunc ++ ++@ add_residual8x8_dc( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1 ++ mov r12, #2 ++ vdup.16 q15, r2 ++9: ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++1: ++ vld1.16 {q0}, [r0, :128], r1 ++ subs r12, #1 ++ vld1.16 {q1}, [r0, :128], r1 ++ vqadd.s16 q0, q15 ++ vld1.16 {q2}, [r0, :128], r1 ++ vqadd.s16 q1, q15 ++ vld1.16 {q3}, [r0, :128], r1 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q15 ++ sub r0, r0, r1, lsl #2 ++ vmax.s16 q0, q8 ++ vmax.s16 q1, q8 ++ vmax.s16 q2, q8 ++ vmax.s16 q3, q8 ++ vmin.s16 q0, q9 ++ vmin.s16 q1, q9 ++ vst1.16 {q0}, [r0, :128], r1 ++ vmin.s16 q2, q9 ++ vst1.16 {q1}, [r0, :128], r1 ++ vmin.s16 q3, q9 ++ vst1.16 {q2}, [r0, :128], r1 ++ vst1.16 {q3}, [r0, :128], r1 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++@ add_residual16x16( ++@ uint8_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] + -+function JOIN(ff_hevc_add_residual_16x16_c_neon_, BIT_DEPTH), export=1 ++function JOIN(ff_hevc_rpi_add_residual_16x16_neon_, BIT_DEPTH), export=1 + movw r3, #(1 << BIT_DEPTH) - 1 + vmov.i64 q8, #0 -+ mov r12, #16 + vdup.i16 q9, r3 -+ add r3, r1, #(16*16*2) @ Offset to V -+ sub r2, #32 ++ mov r12, #8 +1: -+ vld2.16 {q0, q1}, [r0, :256]! -+ vld2.16 {q2, q3}, [r0, :256] -+ vld1.16 {q10, q11}, [r1, :256]! -+ vld1.16 {q12, q13}, [r3, :256]! ++ vldm r1!, {q10-q13} ++ @ For RPI Sand we could guarantee :256 but not for general ++ @ non-RPI allocation. 
:128 is as good as we can claim ++ vld1.16 {q0, q1}, [r0, :128], r2 + subs r12, #1 ++ vld1.16 {q2, q3}, [r0, :128] + vqadd.s16 q0, q10 -+ vqadd.s16 q2, q11 -+ vqadd.s16 q1, q12 ++ vqadd.s16 q1, q11 ++ vqadd.s16 q2, q12 + vqadd.s16 q3, q13 -+ sub r0, #32 ++ sub r0, r2 + vmax.s16 q0, q0, q8 + vmax.s16 q1, q1, q8 + vmax.s16 q2, q2, q8 @@ -5054,477 +5990,633 @@ index 0000000000..7cc5cd5e5c + vmin.s16 q1, q1, q9 + vmin.s16 q2, q2, q9 + vmin.s16 q3, q3, q9 -+ vst2.16 {q0, q1}, [r0, :256]! -+ vst2.16 {q2, q3}, [r0, :256], r2 ++ vst1.16 {q0, q1}, [r0, :128], r2 ++ vst1.16 {q2, q3}, [r0, :128], r2 + bne 1b + bx lr +endfunc + -diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S -new file mode 100644 -index 0000000000..30113d9c93 ---- /dev/null -+++ b/libavcodec/arm/hevcdsp_sao_neon.S -@@ -0,0 +1,1882 @@ -+/* -+ * Copyright (c) 2014 - 2015 Seppo Tomperi -+ * -+ * This file is part of FFmpeg. -+ * -+ * FFmpeg is free software; you can redistribute it and/or -+ * modify it under the terms of the GNU Lesser General Public -+ * License as published by the Free Software Foundation; either -+ * version 2.1 of the License, or (at your option) any later version. -+ * -+ * FFmpeg is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * Lesser General Public License for more details. -+ * -+ * You should have received a copy of the GNU Lesser General Public -+ * License along with FFmpeg; if not, write to the Free Software -+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -+ */ ++@ add_residual8x8_dc_c( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc_uv) [r2] + -+#include "libavutil/arm/asm.S" -+#include "neon.S" ++function JOIN(ff_hevc_rpi_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1 ++ mov r12, #4 ++ vdup.32 q15, r2 ++ b 9f ++endfunc + -+.set EDGE_SRC_STRIDE, 160 ++@ add_residual16x16_dc( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] + -+.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128 -+ vshr.u8 q12, q8, #3 -+ vadd.s8 q8, \Q_K128 -+ vshr.u8 q13, q9, #3 -+ vadd.s8 q9, \Q_K128 ++function JOIN(ff_hevc_rpi_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1 ++ vdup.i16 q15, r2 ++ mov r12, #8 ++9: ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++1: ++ @ For RPI Sand we could guarantee :256 but not for general ++ @ non-RPI allocation. 
:128 is as good as we can claim ++ vld1.16 {q0, q1}, [r0, :128], r1 ++ subs r12, #1 ++ vld1.16 {q2, q3}, [r0, :128] ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q15 ++ sub r0, r1 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst1.16 {q0, q1}, [r0, :128], r1 ++ vst1.16 {q2, q3}, [r0, :128], r1 ++ bne 1b ++ bx lr + -+ vtbl.8 d24, \XLAT0, d24 -+ vtbl.8 d25, \XLAT0, d25 -+ vtbl.8 d26, \XLAT1, d26 -+ vtbl.8 d27, \XLAT1, d27 ++endfunc + -+ vqadd.s8 q8, q12 -+ vshr.u8 q12, q10, #3 -+ vadd.s8 q10, \Q_K128 -+ vqadd.s8 q9, q13 -+ vshr.u8 q13, q11, #3 -+ vadd.s8 q11, \Q_K128 + -+ vsub.s8 q8, \Q_K128 -+ vtbl.8 d24, \XLAT0, d24 -+ vtbl.8 d25, \XLAT0, d25 -+ vsub.s8 q9, \Q_K128 -+ vtbl.8 d26, \XLAT1, d26 -+ vtbl.8 d27, \XLAT1, d27 -+ vqadd.s8 q10, q12 -+ vqadd.s8 q11, q13 -+ vsub.s8 q10, \Q_K128 -+ vsub.s8 q11, \Q_K128 -+.endm ++@ add_residual32x32( ++@ uint8_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] + -+.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128 -+ vshr.u8 q12, q8, #3 -+ vadd.s8 q8, \Q_K128 ++function JOIN(ff_hevc_rpi_add_residual_32x32_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ mov r12, #32 ++1: ++ vldm r1!, {q10-q13} ++ vldm r0, {q0-q3} ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqadd.s16 q2, q12 ++ vqadd.s16 q3, q13 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vstm r0, {q0-q3} ++ add r0, r2 ++ bne 1b ++ bx lr + -+ vtbl.8 d24, \XLAT0, d24 -+ vtbl.8 d25, \XLAT1, d25 ++endfunc + -+ vqadd.s8 q8, q12 -+ vsub.s8 q8, \Q_K128 -+.endm ++@ add_residual8x8_dc_c( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc_uv) [r2] + ++function JOIN(ff_hevc_rpi_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1 ++ mov r12, #16 ++ vdup.32 q15, r2 ++ b 9f ++endfunc + -+.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX -+ vmax.s16 \Q0, \Q_MIN -+ vmax.s16 \Q1, \Q_MIN -+ vmax.s16 \Q2, \Q_MIN -+ vmax.s16 \Q3, \Q_MIN -+ vmin.s16 \Q0, \Q_MAX -+ vmin.s16 \Q1, \Q_MAX -+ vmin.s16 \Q2, \Q_MAX -+ vmin.s16 \Q3, \Q_MAX -+.endm -+ -+@ Clobbers q12, q13 -+.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth -+ vshrn.i16 d24, \Q0, #(\bit_depth - 5) -+ vshrn.i16 d25, \Q1, #(\bit_depth - 5) -+ vshrn.i16 d26, \Q2, #(\bit_depth - 5) -+ vshrn.i16 d27, \Q3, #(\bit_depth - 5) -+ vtbl.8 d24, \XLAT0, d24 -+ vtbl.8 d25, \XLAT1, d25 -+ vtbl.8 d26, \XLAT0, d26 -+ vtbl.8 d27, \XLAT1, d27 -+ vaddw.s8 \Q0, d24 -+ vaddw.s8 \Q1, d25 -+ vaddw.s8 \Q2, d26 -+ vaddw.s8 \Q3, d27 -+ clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX -+.endm ++@ add_residual32x32_dc( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] + -+@ Clobbers q12 -+.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth -+ vshrn.i16 d24, \Q0, #(\bit_depth - 5) -+ vshrn.i16 d25, \Q1, #(\bit_depth - 5) -+ vtbl.8 d24, \XLAT0, d24 -+ vtbl.8 d25, \XLAT1, d25 -+ vaddw.s8 \Q0, d24 -+ vaddw.s8 \Q1, d25 -+ vmax.s16 \Q0, \Q_MIN -+ vmax.s16 \Q1, \Q_MIN -+ vmin.s16 \Q0, \Q_MAX -+ vmin.s16 \Q1, \Q_MAX -+.endm ++function JOIN(ff_hevc_rpi_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1 ++ vdup.i16 q15, r2 ++ mov r12, #32 ++9: ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++1: ++ vldm r0, {q0-q3} ++ subs r12, #1 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q15 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vstm r0, {q0-q3} ++ add r0, r1 ++ bne 1b ++ bx lr + ++endfunc + -+@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38) -+@ so we are quite safe stuffing it 
into a byte array -+@ There may be a subsequent shl by log2_sao_offset_scale_luma/chroma -+@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of -+@ precision ++@ ============================================================================ ++@ U add + -+@ This, somewhat nasty, bit of code builds the {d0-d3} translation -+@ array via the stack -+@ Given that sao_left_class > 28 can cause wrap we can't just poke -+@ all 4 bytes in at once -+@ -+@ It also loads other common regs ++@ add_residual4x4_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] + -+function band_load_y -+ vmov.i64 q0, #0 -+ ldr r12, [sp, #8] @ &sao_offset_val[0] -+ add r12, #2 @ 1st interesting val is [1] -+ vld1.16 {d16}, [r12] @ Unaligned -+ vmov.i64 q1, #0 -+ ldr r12, [sp, #12] @ sao_left_class ++function JOIN(ff_hevc_rpi_add_residual_4x4_u_neon_, BIT_DEPTH), export=1 ++ vld1.16 {q10, q11}, [r1, :256] ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 + -+ mov r4, sp -+ sub sp, #32 -+ and sp, #~63 @ Align stack so we can wrap with a simple AND -+ vst1.8 {q0, q1}, [sp, :256] @ Put zero array on stack -+ add r12, sp -+ vst1.8 {d16[0]}, [r12]! -+ and r12, #~32 -+ vst1.8 {d16[2]}, [r12]! -+ and r12, #~32 -+ vst1.8 {d16[4]}, [r12]! -+ and r12, #~32 -+ vst1.8 {d16[6]}, [r12] -+ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array -+ mov sp, r4 ++ vld2.16 {d0, d2}, [r0, :128], r2 ++ vld2.16 {d1, d3}, [r0, :128], r2 ++ vld2.16 {d4, d6}, [r0, :128], r2 ++ vld2.16 {d5, d7}, [r0, :128], r2 + -+ ldr r12, [sp, #20] @ height -+ pld [r1] ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ sub r0, r0, r2, lsl #2 ++ clip16_4 q0, q1, q2, q3, q8, q9 + -+ sub r12, #1 -+ add r4, r1, r3 -+ bx lr ++ vst2.16 {d0, d2}, [r0, :128], r2 ++ vst2.16 {d1, d3}, [r0, :128], r2 ++ vst2.16 {d4, d6}, [r0, :128], r2 ++ vst2.16 {d5, d7}, [r0, :128] ++ bx lr +endfunc + ++@ add_residual8x8_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] + -+function band_load_c -+ vmov.i64 q2, #0 -+ ldr r12, [sp, #8] @ &sao_offset_val1[0] -+ add r12, #2 @ 1st interesting val is [1] -+ vld1.16 {d16}, [r12] @ Unaligned -+ vmov.i64 q3, #0 -+ ldr r12, [sp, #12] @ sao_left_class ++function JOIN(ff_hevc_rpi_add_residual_8x8_u_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #4 ++ vdup.i16 q9, r3 ++1: ++ vld2.16 {q0, q1}, [r0, :256], r2 ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ sub r0, r2 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256], r2 ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc + -+ mov r4, sp @ Remember SP -+ sub sp, #32 -+ and sp, #~63 @ Align stack so we can wrap with a simple AND ++@ add_residual16x16_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] + -+ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack -+ add r12, sp -+ vst1.8 {d16[0]}, [r12]! -+ and r12, #~32 -+ vst1.8 {d16[2]}, [r12]! -+ and r12, #~32 -+ vst1.8 {d16[4]}, [r12]! 
-+ and r12, #~32 -+ vst1.8 {d16[6]}, [r12] -+ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array ++function JOIN(ff_hevc_rpi_add_residual_16x16_u_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #16 ++ vdup.i16 q9, r3 ++ sub r2, #32 ++1: ++ vld2.16 {q0, q1}, [r0, :256]! ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ sub r0, #32 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256]! ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc + -+ @ And again for the 2nd set -+ ldr r12, [r4, #16] @ &sao_offset_val2[0] -+ add r12, #2 @ 1st interesting val is [1] -+ vld1.16 {d16}, [r12] @ Unaligned -+ ldr r12, [r4, #20] @ sao_left_class2 ++@ ============================================================================ ++@ V add + -+ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack (again) -+ add r12, sp -+ vst1.8 {d16[0]}, [r12]! -+ and r12, #~32 -+ vst1.8 {d16[2]}, [r12]! -+ and r12, #~32 -+ vst1.8 {d16[4]}, [r12]! -+ and r12, #~32 -+ vst1.8 {d16[6]}, [r12] -+ vld1.8 {q2, q3}, [sp, :256] @ Pop modified array ++@ add_residual4x4_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] + -+ mov sp, r4 ++function JOIN(ff_hevc_rpi_add_residual_4x4_v_neon_, BIT_DEPTH), export=1 ++ vld1.16 {q10, q11}, [r1, :256] ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 + -+ ldr r12, [sp, #28] @ height -+ pld [r1] ++ vld2.16 {d0, d2}, [r0, :128], r2 ++ vld2.16 {d1, d3}, [r0, :128], r2 ++ vld2.16 {d4, d6}, [r0, :128], r2 ++ vld2.16 {d5, d7}, [r0, :128], r2 + -+ subs r12, #1 -+ add r4, r1, r3 -+ bx lr -+endfunc ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ sub r0, r0, r2, lsl #2 ++ clip16_4 q0, q1, q2, q3, q8, q9 + ++ vst2.16 {d0, d2}, [r0, :128], r2 ++ vst2.16 {d1, d3}, [r0, :128], r2 ++ vst2.16 {d4, d6}, [r0, :128], r2 ++ vst2.16 {d5, d7}, [r0, :128] ++ bx lr ++endfunc + -+@ ff_hevc_sao_band_64_neon_8 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] ++@ add_residual8x8_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] + -+function ff_hevc_sao_band_64_neon_8, export=1 -+ push {r4, lr} -+ bl band_load_y -+ vmov.u8 q15, #128 -+ -+1: subs r12, #1 -+ vldm r1, {q8-q11} -+ pld [r4] -+ add r1, r3 -+ -+ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++function JOIN(ff_hevc_rpi_add_residual_8x8_v_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #4 ++ vdup.i16 q9, r3 ++1: ++ vld2.16 {q0, q1}, [r0, :256], r2 ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! 
++ subs r12, #1 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ sub r0, r2 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256], r2 ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc + -+ it ne -+ addne r4, r3 -+ vstm r0, {q8-q11} -+ add r0, r2 -+ bpl 1b ++@ add_residual16x16_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] + -+ pop {r4, pc} ++function JOIN(ff_hevc_rpi_add_residual_16x16_v_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #16 ++ vdup.i16 q9, r3 ++ sub r2, #32 ++1: ++ vld2.16 {q0, q1}, [r0, :256]! ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ sub r0, #32 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256]! ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr +endfunc + -+@ ff_hevc_sao_band_32_neon_8 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] ++@ ============================================================================ ++@ U & V add + -+function ff_hevc_sao_band_32_neon_8, export=1 -+ push {r4, lr} -+ bl band_load_y -+ vmov.u8 q15, #128 ++@ add_residual4x4_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] + -+1: subs r12, #2 -+ vld1.8 { q8, q9 }, [r1, :128], r3 -+ vld1.8 {q10, q11}, [r1, :128], r3 ++function JOIN(ff_hevc_rpi_add_residual_4x4_c_neon_, BIT_DEPTH), export=1 ++ vldm r1, {q10-q13} ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 + -+ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ vld2.16 {d0, d2}, [r0, :128], r2 ++ vld2.16 {d1, d3}, [r0, :128], r2 ++ vld2.16 {d4, d6}, [r0, :128], r2 ++ vld2.16 {d5, d7}, [r0, :128], r2 + -+ vst1.8 { q8, q9 }, [r0, :128], r2 -+ vst1.8 {q10, q11}, [r0, :128], r2 -+ bpl 1b ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ sub r0, r0, r2, lsl #2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vmin.s16 q2, q2, q9 ++ vmin.s16 q3, q3, q9 + -+ pop {r4, pc} ++ vst2.16 {d0, d2}, [r0, :128], r2 ++ vst2.16 {d1, d3}, [r0, :128], r2 ++ vst2.16 {d4, d6}, [r0, :128], r2 ++ vst2.16 {d5, d7}, [r0, :128] ++ bx lr +endfunc + -+@ ff_hevc_sao_band_16_neon_8 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] -+ -+function ff_hevc_sao_band_16_neon_8, export=1 -+ push {r4, lr} -+ bl band_load_y -+ vmov.u8 q15, #128 -+ -+1: subs r12, #4 -+ vld1.8 { q8}, [r1, :128], r3 -+ vld1.8 { q9}, [r1, :128], r3 -+ vld1.8 {q10}, [r1, :128], r3 -+ vld1.8 {q11}, [r1, :128], r3 ++@ add_residual8x8_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] + -+ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++function JOIN(ff_hevc_rpi_add_residual_8x8_c_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #4 ++ vdup.i16 q9, r3 ++ add r3, r1, #(8*8*2) @ Offset to V ++1: ++ vld2.16 {q0, q1}, [r0, :256], r2 ++ vld2.16 {q2, q3}, [r0, 
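++        @ vld2 above deinterleaved the CUV pairs: q0/q2 hold U, q1/q3 hold V.
++        @ In the _v function only V receives the residual (q10/q11); U just
++        @ gets the replicated dc bias held in q15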
:256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ vld1.16 {q12, q13}, [r3, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ sub r0, r2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vmin.s16 q2, q2, q9 ++ vmin.s16 q3, q3, q9 ++ vst2.16 {q0, q1}, [r0, :256], r2 ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc + -+ vst1.8 { q8}, [r0, :128], r2 -+ vst1.8 { q9}, [r0, :128], r2 -+ vst1.8 {q10}, [r0, :128], r2 -+ vst1.8 {q11}, [r0, :128], r2 -+ bpl 1b ++@ add_residual16x16_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] + -+ pop {r4, pc} ++function JOIN(ff_hevc_rpi_add_residual_16x16_c_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #16 ++ vdup.i16 q9, r3 ++ add r3, r1, #(16*16*2) @ Offset to V ++ sub r2, #32 ++1: ++ vld2.16 {q0, q1}, [r0, :256]! ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ vld1.16 {q12, q13}, [r3, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ sub r0, #32 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vmin.s16 q2, q2, q9 ++ vmin.s16 q3, q3, q9 ++ vst2.16 {q0, q1}, [r0, :256]! ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr +endfunc + -+@ ff_hevc_sao_band_8_neon_8 ( -+@ uint8_t *_dst, [r0] -+@ uint8_t *_src, [r1] -+@ ptrdiff_t stride_dst, [r2] -+@ ptrdiff_t stride_src, [r3] -+@ int16_t *sao_offset_val, [sp, #0] -+@ int sao_left_class, [sp, #4] -+@ int width, [sp, #8] -+@ int height) [sp, #12] +diff --git a/libavcodec/arm/rpi_hevcdsp_sao_neon.S b/libavcodec/arm/rpi_hevcdsp_sao_neon.S +new file mode 100644 +index 0000000000..8c32cb23e7 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_sao_neon.S +@@ -0,0 +1,1882 @@ ++/* ++ * Copyright (c) 2014 - 2015 Seppo Tomperi ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + -+function ff_hevc_sao_band_8_neon_8, export=1 -+ push {r4, lr} -+ bl band_load_y -+ ldr lr, [sp, #16] @ width -+ vmov.u8 q15, #128 -+ cmp lr, #8 -+ blt 4f ++#include "libavutil/arm/asm.S" ++#include "neon.S" + -+1: subs r12, #2 -+ vld1.8 {d16}, [r1, :64], r3 -+ vld1.8 {d17}, [r1, :64], r3 ++.set EDGE_SRC_STRIDE, 160 + -+ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128 ++ vshr.u8 q12, q8, #3 ++ vadd.s8 q8, \Q_K128 ++ vshr.u8 q13, q9, #3 ++ vadd.s8 q9, \Q_K128 + -+ vst1.8 {d16}, [r0, :64], r2 -+ vst1.8 {d17}, [r0, :64], r2 -+ bpl 1b -+ pop {r4, pc} -+ -+4: -+1: subs r12, #4 -+ vld1.32 {d16[0]}, [r1, :32], r3 -+ vld1.32 {d16[1]}, [r1, :32], r3 -+ vld1.32 {d17[0]}, [r1, :32], r3 -+ vld1.32 {d17[1]}, [r1, :32], r3 -+ -+ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 -+ -+ vst1.32 {d16[0]}, [r0, :32], r2 -+ vst1.32 {d16[1]}, [r0, :32], r2 -+ vst1.32 {d17[0]}, [r0, :32], r2 -+ vst1.32 {d17[1]}, [r0, :32], r2 -+ bpl 1b -+ pop {r4, pc} -+endfunc ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT0, d25 ++ vtbl.8 d26, \XLAT1, d26 ++ vtbl.8 d27, \XLAT1, d27 + -+@ ff_hevc_sao_band_c_32_neon_8( -+@ uint8_t * dst [r0] -+@ uint8_t * src [r1] -+@ uint32_t dst_stride [r2] -+@ uint32_t src_stride [r3] -+@ const int16_t * table1 sp[0] -+@ uint32_t offset1 sp[4] -+@ const int16_t * table2 sp[8] -+@ uint32_t offset2 sp[12] -+@ int width sp[16] -+@ int height sp[20] ++ vqadd.s8 q8, q12 ++ vshr.u8 q12, q10, #3 ++ vadd.s8 q10, \Q_K128 ++ vqadd.s8 q9, q13 ++ vshr.u8 q13, q11, #3 ++ vadd.s8 q11, \Q_K128 + -+function ff_hevc_sao_band_c_32_neon_8, export=1 -+ push {r4, lr} -+ bl band_load_c ++ vsub.s8 q8, \Q_K128 ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT0, d25 ++ vsub.s8 q9, \Q_K128 ++ vtbl.8 d26, \XLAT1, d26 ++ vtbl.8 d27, \XLAT1, d27 ++ vqadd.s8 q10, q12 ++ vqadd.s8 q11, q13 ++ vsub.s8 q10, \Q_K128 ++ vsub.s8 q11, \Q_K128 ++.endm + -+ vmov.i8 q15, #128 -+ sub r3, #32 -+ sub r2, #32 ++.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128 ++ vshr.u8 q12, q8, #3 ++ vadd.s8 q8, \Q_K128 + -+1: subs r12, #1 -+ vld2.8 { q8, q9 }, [r1, :128]! -+ vld2.8 {q10, q11}, [r1, :128], r3 ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 + -+ pld [r4] ++ vqadd.s8 q8, q12 ++ vsub.s8 q8, \Q_K128 ++.endm + -+ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 + -+ vst2.8 { q8, q9 }, [r0, :128]! 
-+ vst2.8 {q10, q11}, [r0, :128], r2 ++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX ++ vmax.s16 \Q0, \Q_MIN ++ vmax.s16 \Q1, \Q_MIN ++ vmax.s16 \Q2, \Q_MIN ++ vmax.s16 \Q3, \Q_MIN ++ vmin.s16 \Q0, \Q_MAX ++ vmin.s16 \Q1, \Q_MAX ++ vmin.s16 \Q2, \Q_MAX ++ vmin.s16 \Q3, \Q_MAX ++.endm + -+ itt ne -+ addne r4, r3 -+ addne r4, #32 ++@ Clobbers q12, q13 ++.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth ++ vshrn.i16 d24, \Q0, #(\bit_depth - 5) ++ vshrn.i16 d25, \Q1, #(\bit_depth - 5) ++ vshrn.i16 d26, \Q2, #(\bit_depth - 5) ++ vshrn.i16 d27, \Q3, #(\bit_depth - 5) ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ vtbl.8 d26, \XLAT0, d26 ++ vtbl.8 d27, \XLAT1, d27 ++ vaddw.s8 \Q0, d24 ++ vaddw.s8 \Q1, d25 ++ vaddw.s8 \Q2, d26 ++ vaddw.s8 \Q3, d27 ++ clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX ++.endm + -+ bpl 1b ++@ Clobbers q12 ++.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth ++ vshrn.i16 d24, \Q0, #(\bit_depth - 5) ++ vshrn.i16 d25, \Q1, #(\bit_depth - 5) ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ vaddw.s8 \Q0, d24 ++ vaddw.s8 \Q1, d25 ++ vmax.s16 \Q0, \Q_MIN ++ vmax.s16 \Q1, \Q_MIN ++ vmin.s16 \Q0, \Q_MAX ++ vmin.s16 \Q1, \Q_MAX ++.endm + -+ pop {r4, pc} -+endfunc + -+@ ff_hevc_sao_band_c_16_neon_8( -+@ uint8_t * dst [r0] -+@ uint8_t * src [r1] -+@ uint32_t dst_stride [r2] -+@ uint32_t src_stride [r3] -+@ const int16_t * table1 sp[0] -+@ uint32_t offset1 sp[4] -+@ const int16_t * table2 sp[8] -+@ uint32_t offset2 sp[12] -+@ int width sp[16] -+@ int height sp[20] ++@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38) ++@ so we are quite safe stuffing it into a byte array ++@ There may be a subsequent shl by log2_sao_offset_scale_luma/chroma ++@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of ++@ precision + -+function ff_hevc_sao_band_c_16_neon_8, export=1 -+ push {r4, lr} -+ bl band_load_c -+ vmov.i8 q15, #128 ++@ This, somewhat nasty, bit of code builds the {d0-d3} translation ++@ array via the stack ++@ Given that sao_left_class > 28 can cause wrap we can't just poke ++@ all 4 bytes in at once ++@ ++@ It also loads other common regs + -+1: subs r12, #2 -+ vld2.8 { q8, q9 }, [r1, :128], r3 -+ vld2.8 {q10, q11}, [r1, :128], r3 ++function band_load_y ++ vmov.i64 q0, #0 ++ ldr r12, [sp, #8] @ &sao_offset_val[0] ++ add r12, #2 @ 1st interesting val is [1] ++ vld1.16 {d16}, [r12] @ Unaligned ++ vmov.i64 q1, #0 ++ ldr r12, [sp, #12] @ sao_left_class + -+ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ mov r4, sp ++ sub sp, #32 ++ and sp, #~63 @ Align stack so we can wrap with a simple AND ++ vst1.8 {q0, q1}, [sp, :256] @ Put zero array on stack ++ add r12, sp ++ vst1.8 {d16[0]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[2]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[4]}, [r12]! 
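++        @ each "and r12, #~32" wraps the write pointer back to the start of the
++        @ 32-byte table: bands sao_left_class+1..3 can straddle band 31 and the
++        @ band index is taken mod 32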
++ and r12, #~32 ++ vst1.8 {d16[6]}, [r12] ++ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array ++ mov sp, r4 + -+ vst2.8 { q8, q9 }, [r0, :128], r2 -+ vst2.8 {q10, q11}, [r0, :128], r2 ++ ldr r12, [sp, #20] @ height ++ pld [r1] + -+ bpl 1b -+ pop {r4, pc} ++ sub r12, #1 ++ add r4, r1, r3 ++ bx lr +endfunc + -+@ ff_hevc_sao_band_c_8_neon_8( -+@ uint8_t * dst [r0] -+@ uint8_t * src [r1] -+@ uint32_t dst_stride [r2] -+@ uint32_t src_stride [r3] -+@ const int16_t * table1 sp[0] -+@ uint32_t offset1 sp[4] -+@ const int16_t * table2 sp[8] -+@ uint32_t offset2 sp[12] -+@ int width sp[16] -+@ int height sp[20] + -+function ff_hevc_sao_band_c_8_neon_8, export=1 -+ push {r4, lr} -+ bl band_load_c -+ ldr lr, [sp, #16] @ width -+ vmov.u8 q15, #128 -+ cmp lr, #8 -+ blt 4f ++function band_load_c ++ vmov.i64 q2, #0 ++ ldr r12, [sp, #8] @ &sao_offset_val1[0] ++ add r12, #2 @ 1st interesting val is [1] ++ vld1.16 {d16}, [r12] @ Unaligned ++ vmov.i64 q3, #0 ++ ldr r12, [sp, #12] @ sao_left_class + -+1: subs r12, #1 -+ vld2.8 {d16, d17}, [r1, :128], r3 ++ mov r4, sp @ Remember SP ++ sub sp, #32 ++ and sp, #~63 @ Align stack so we can wrap with a simple AND + -+ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack ++ add r12, sp ++ vst1.8 {d16[0]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[2]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[4]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[6]}, [r12] ++ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array + -+ vst2.8 {d16, d17}, [r0, :128], r2 -+ bpl 1b -+ pop {r4, pc} ++ @ And again for the 2nd set ++ ldr r12, [r4, #16] @ &sao_offset_val2[0] ++ add r12, #2 @ 1st interesting val is [1] ++ vld1.16 {d16}, [r12] @ Unaligned ++ ldr r12, [r4, #20] @ sao_left_class2 + -+4: -+1: subs r12, #1 -+ vld1.8 {d16}, [r1, :64], r3 -+ vld1.8 {d17}, [r1, :64], r3 -+ vuzp.8 d16, d17 ++ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack (again) ++ add r12, sp ++ vst1.8 {d16[0]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[2]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[4]}, [r12]! 
++ and r12, #~32 ++ vst1.8 {d16[6]}, [r12] ++ vld1.8 {q2, q3}, [sp, :256] @ Pop modified array + -+ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ mov sp, r4 + -+ vzip.8 d16, d17 -+ vst1.8 {d16}, [r0, :64], r2 -+ vst1.8 {d17}, [r0, :64], r2 -+ bpl 1b -+ pop {r4, pc} ++ ldr r12, [sp, #28] @ height ++ pld [r1] ++ ++ subs r12, #1 ++ add r4, r1, r3 ++ bx lr +endfunc + + -+@ ff_hevc_sao_band_64_neon_10 ( ++@ ff_hevc_rpi_sao_band_64_neon_8 ( +@ uint8_t *_dst, [r0] +@ uint8_t *_src, [r1] +@ ptrdiff_t stride_dst, [r2] @@ -5534,16 +6626,268 @@ index 0000000000..30113d9c93 +@ int width, [sp, #8] +@ int height) [sp, #12] + -+.macro band_64_16 bit_depth ++function ff_hevc_rpi_sao_band_64_neon_8, export=1 + push {r4, lr} -+ movw lr, #(1 << \bit_depth) - 1 -+ vmov.i64 q2, #0 -+ vdup.i16 q3, lr + bl band_load_y -+ vpush {q4-q7} ++ vmov.u8 q15, #128 + +1: subs r12, #1 -+ vldm r1, {q4-q11} ++ vldm r1, {q8-q11} ++ pld [r4] ++ add r1, r3 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ it ne ++ addne r4, r3 ++ vstm r0, {q8-q11} ++ add r0, r2 ++ bpl 1b ++ ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_32_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_band_32_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ vmov.u8 q15, #128 ++ ++1: subs r12, #2 ++ vld1.8 { q8, q9 }, [r1, :128], r3 ++ vld1.8 {q10, q11}, [r1, :128], r3 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.8 { q8, q9 }, [r0, :128], r2 ++ vst1.8 {q10, q11}, [r0, :128], r2 ++ bpl 1b ++ ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_16_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_band_16_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ vmov.u8 q15, #128 ++ ++1: subs r12, #4 ++ vld1.8 { q8}, [r1, :128], r3 ++ vld1.8 { q9}, [r1, :128], r3 ++ vld1.8 {q10}, [r1, :128], r3 ++ vld1.8 {q11}, [r1, :128], r3 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.8 { q8}, [r0, :128], r2 ++ vst1.8 { q9}, [r0, :128], r2 ++ vst1.8 {q10}, [r0, :128], r2 ++ vst1.8 {q11}, [r0, :128], r2 ++ bpl 1b ++ ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_8_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_band_8_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ ldr lr, [sp, #16] @ width ++ vmov.u8 q15, #128 ++ cmp lr, #8 ++ blt 4f ++ ++1: subs r12, #2 ++ vld1.8 {d16}, [r1, :64], r3 ++ vld1.8 {d17}, [r1, :64], r3 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.8 {d16}, [r0, :64], r2 ++ vst1.8 {d17}, [r0, :64], r2 ++ bpl 1b ++ pop {r4, pc} ++ ++4: ++1: subs r12, #4 ++ vld1.32 {d16[0]}, [r1, :32], r3 ++ vld1.32 {d16[1]}, [r1, :32], r3 ++ vld1.32 {d17[0]}, [r1, :32], r3 ++ vld1.32 {d17[1]}, [r1, :32], r3 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.32 {d16[0]}, [r0, :32], r2 ++ vst1.32 {d16[1]}, [r0, :32], r2 ++ vst1.32 {d17[0]}, [r0, :32], r2 ++ vst1.32 {d17[1]}, [r0, :32], r2 ++ 
bpl 1b ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_c_32_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++function ff_hevc_rpi_sao_band_c_32_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_c ++ ++ vmov.i8 q15, #128 ++ sub r3, #32 ++ sub r2, #32 ++ ++1: subs r12, #1 ++ vld2.8 { q8, q9 }, [r1, :128]! ++ vld2.8 {q10, q11}, [r1, :128], r3 ++ ++ pld [r4] ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ ++ vst2.8 { q8, q9 }, [r0, :128]! ++ vst2.8 {q10, q11}, [r0, :128], r2 ++ ++ itt ne ++ addne r4, r3 ++ addne r4, #32 ++ ++ bpl 1b ++ ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_c_16_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++function ff_hevc_rpi_sao_band_c_16_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_c ++ vmov.i8 q15, #128 ++ ++1: subs r12, #2 ++ vld2.8 { q8, q9 }, [r1, :128], r3 ++ vld2.8 {q10, q11}, [r1, :128], r3 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ ++ vst2.8 { q8, q9 }, [r0, :128], r2 ++ vst2.8 {q10, q11}, [r0, :128], r2 ++ ++ bpl 1b ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_c_8_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++function ff_hevc_rpi_sao_band_c_8_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_c ++ ldr lr, [sp, #16] @ width ++ vmov.u8 q15, #128 ++ cmp lr, #8 ++ blt 4f ++ ++1: subs r12, #1 ++ vld2.8 {d16, d17}, [r1, :128], r3 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ ++ vst2.8 {d16, d17}, [r0, :128], r2 ++ bpl 1b ++ pop {r4, pc} ++ ++4: ++1: subs r12, #1 ++ vld1.8 {d16}, [r1, :64], r3 ++ vld1.8 {d17}, [r1, :64], r3 ++ vuzp.8 d16, d17 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ ++ vzip.8 d16, d17 ++ vst1.8 {d16}, [r0, :64], r2 ++ vst1.8 {d17}, [r0, :64], r2 ++ bpl 1b ++ pop {r4, pc} ++endfunc ++ ++ ++@ ff_hevc_rpi_sao_band_64_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_64_16 bit_depth ++ push {r4, lr} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q2, #0 ++ vdup.i16 q3, lr ++ bl band_load_y ++ vpush {q4-q7} ++ ++1: subs r12, #1 ++ vldm r1, {q4-q11} + add r1, r3 + sao_band_64b_16 q4, q5, q6, q7, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth + sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth @@ -5555,11 +6899,11 @@ index 0000000000..30113d9c93 + pop {r4, pc} +.endm + -+function ff_hevc_sao_band_64_neon_10, export=1 ++function ff_hevc_rpi_sao_band_64_neon_10, export=1 + band_64_16 10 +endfunc + -+@ ff_hevc_sao_band_32_neon_10 ( ++@ ff_hevc_rpi_sao_band_32_neon_10 ( +@ uint8_t *_dst, [r0] +@ uint8_t *_src, [r1] +@ ptrdiff_t stride_dst, [r2] @@ -5587,11 +6931,11 @@ index 0000000000..30113d9c93 + pop {r4, pc} +.endm + 
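Before the 10-bit entry points are stamped out below, it is worth noting what the band_NN_16 macros actually compute: each sample is shifted right by (bit_depth - 5) to select one of 32 bands, the signed offset for that band is looked up in the 32-byte table built by band_load_y, added to the sample, and the sum clipped to the legal range (q2 holds 0, q3 holds (1 << bit_depth) - 1). A scalar C model of the same arithmetic — a sketch for reference only, with illustrative names that are not part of the patch:

    #include <stddef.h>    /* ptrdiff_t */
    #include <stdint.h>    /* uint16_t, int8_t */

    /* Reference model of the sao_band_*_16 paths: shift, lookup, add, clip.
     * Strides are in samples; table[] is the stack-built 32-entry offset
     * array, zero except at the four bands selected by the bitstream. */
    static void sao_band_model_16(uint16_t *dst, ptrdiff_t stride_dst,
                                  const uint16_t *src, ptrdiff_t stride_src,
                                  const int8_t table[32],
                                  int width, int height, int bit_depth)
    {
        const int max = (1 << bit_depth) - 1;
        for (int y = 0; y != height; ++y) {
            for (int x = 0; x != width; ++x) {
                const int v = src[x] + table[src[x] >> (bit_depth - 5)]; /* vshrn + vtbl + vaddw */
                dst[x] = v < 0 ? 0 : v > max ? max : v;                  /* vmax.s16 / vmin.s16 */
            }
            dst += stride_dst;
            src += stride_src;
        }
    }

The 8-bit variants earlier in the file follow the same band-index/lookup pattern, just sixteen samples to a q register.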
-+function ff_hevc_sao_band_32_neon_10, export=1 ++function ff_hevc_rpi_sao_band_32_neon_10, export=1 + band_32_16 10 +endfunc + -+@ ff_hevc_sao_band_16_neon_10 ( ++@ ff_hevc_rpi_sao_band_16_neon_10 ( +@ uint8_t *_dst, [r0] +@ uint8_t *_src, [r1] +@ ptrdiff_t stride_dst, [r2] @@ -5619,11 +6963,11 @@ index 0000000000..30113d9c93 + pop {r4, pc} +.endm + -+function ff_hevc_sao_band_16_neon_10, export=1 ++function ff_hevc_rpi_sao_band_16_neon_10, export=1 + band_16_16 10 +endfunc + -+@ ff_hevc_sao_band_8_neon_10 ( ++@ ff_hevc_rpi_sao_band_8_neon_10 ( +@ uint8_t *_dst, [r0] +@ uint8_t *_src, [r1] +@ ptrdiff_t stride_dst, [r2] @@ -5667,12 +7011,12 @@ index 0000000000..30113d9c93 + pop {r4, pc} +.endm + -+function ff_hevc_sao_band_8_neon_10, export=1 ++function ff_hevc_rpi_sao_band_8_neon_10, export=1 + band_8_16 10 +endfunc + + -+@ ff_hevc_sao_band_c_32_neon_10( ++@ ff_hevc_rpi_sao_band_c_32_neon_10( +@ uint8_t * dst [r0] +@ uint8_t * src [r1] +@ uint32_t dst_stride [r2] @@ -5720,12 +7064,12 @@ index 0000000000..30113d9c93 + pop {r4, pc} +.endm + -+function ff_hevc_sao_band_c_32_neon_10, export=1 ++function ff_hevc_rpi_sao_band_c_32_neon_10, export=1 + band_c_32_16 10 +endfunc + + -+@ ff_hevc_sao_band_c_16_neon_10( ++@ ff_hevc_rpi_sao_band_c_16_neon_10( +@ uint8_t * dst [r0] +@ uint8_t * src [r1] +@ uint32_t dst_stride [r2] @@ -5760,12 +7104,12 @@ index 0000000000..30113d9c93 + pop {r4, pc} +.endm + -+function ff_hevc_sao_band_c_16_neon_10, export=1 ++function ff_hevc_rpi_sao_band_c_16_neon_10, export=1 + band_c_16_16 10 +endfunc + + -+@ ff_hevc_sao_band_c_8_neon_10( ++@ ff_hevc_rpi_sao_band_c_8_neon_10( +@ uint8_t * dst [r0] +@ uint8_t * src [r1] +@ uint32_t dst_stride [r2] @@ -5811,7 +7155,7 @@ index 0000000000..30113d9c93 + pop {r4, pc} +.endm + -+function ff_hevc_sao_band_c_8_neon_10, export=1 ++function ff_hevc_rpi_sao_band_c_8_neon_10, export=1 + band_c_8_16 10 +endfunc + @@ -6079,7 +7423,7 @@ index 0000000000..30113d9c93 +endfunc + + -+@ ff_hevc_sao_edge_[c_]xx_neon( ++@ ff_hevc_rpi_sao_edge_[c_]xx_neon( +@ uint8_t *_dst, [r0] +@ const uint8_t *_src, [r1] +@ ptrdiff_t stride_dst, [r2] @@ -6729,7 +8073,7 @@ index 0000000000..30113d9c93 +35: edge_4bx4_e3 \body_fn, \pb +.endm + -+@ void ff_hevc_sao_edge_8_neon_8( ++@ void ff_hevc_rpi_sao_edge_8_neon_8( +@ uint8_t *_dst, [r0] +@ uint8_t *_src, [r1] +@ int stride_dst, [r2] @@ -6738,13 +8082,13 @@ index 0000000000..30113d9c93 +@ int width, [sp, #4] +@ int height) [sp, #8] + -+function ff_hevc_sao_edge_8_neon_8, export=1 ++function ff_hevc_rpi_sao_edge_8_neon_8, export=1 + edge_16b_init 8, 0, 1, 99f +99: + edge_8bx2_4bx4_bodies edge_16b_body_8, 1 +endfunc + -+@ void ff_hevc_sao_edge_16_neon_8( ++@ void ff_hevc_rpi_sao_edge_16_neon_8( +@ uint8_t *_dst, [r0] +@ uint8_t *_src, [r1] +@ int stride_dst, [r2] @@ -6753,13 +8097,13 @@ index 0000000000..30113d9c93 +@ int width, [sp, #4] +@ int height) [sp, #8] + -+function ff_hevc_sao_edge_16_neon_8, export=1 ++function ff_hevc_rpi_sao_edge_16_neon_8, export=1 + edge_16b_init 8, 0, 0, 99f +99: + edge_16b_bodies edge_16b_body_8, 1 +endfunc + -+@ void ff_hevc_sao_edge_32_neon_8( ++@ void ff_hevc_rpi_sao_edge_32_neon_8( +@ uint8_t *_dst, [r0] +@ uint8_t *_src, [r1] +@ int stride_dst, [r2] @@ -6768,13 +8112,13 @@ index 0000000000..30113d9c93 +@ int width, [sp, #4] +@ int height) [sp, #8] + -+function ff_hevc_sao_edge_32_neon_8, export=1 ++function ff_hevc_rpi_sao_edge_32_neon_8, export=1 + edge_64b_init 8, 0, 0, 99f +99: + edge_32bx2_bodies edge_64b_body_8, 1 +endfunc + -+@ void ff_hevc_sao_edge_64_neon_8( 
++@ void ff_hevc_rpi_sao_edge_64_neon_8( +@ uint8_t *_dst, [r0] +@ uint8_t *_src, [r1] +@ int stride_dst, [r2] @@ -6783,13 +8127,13 @@ index 0000000000..30113d9c93 +@ int width, [sp, #4] +@ int height) [sp, #8] + -+function ff_hevc_sao_edge_64_neon_8, export=1 ++function ff_hevc_rpi_sao_edge_64_neon_8, export=1 + edge_64b_init 8, 0, 0, 99f +99: + edge_64b_bodies edge_64b_body_8, 1 +endfunc + -+@ ff_hevc_sao_edge_c_8_neon_8( ++@ ff_hevc_rpi_sao_edge_c_8_neon_8( +@ uint8_t *_dst, [r0] +@ const uint8_t *_src, [r1] +@ ptrdiff_t stride_dst, [r2] @@ -6799,13 +8143,13 @@ index 0000000000..30113d9c93 +@ int width, [sp, #8] +@ int height) [sp, #12] + -+function ff_hevc_sao_edge_c_8_neon_8, export=1 ++function ff_hevc_rpi_sao_edge_c_8_neon_8, export=1 + edge_16b_init 8, 1, 1, 99f +99: + edge_16b_8bx2_bodies edge_16b_body_8, 2 +endfunc + -+@ ff_hevc_sao_edge_c_16_neon_8( ++@ ff_hevc_rpi_sao_edge_c_16_neon_8( +@ uint8_t *_dst, [r0] +@ const uint8_t *_src, [r1] +@ ptrdiff_t stride_dst, [r2] @@ -6815,13 +8159,13 @@ index 0000000000..30113d9c93 +@ int width, [sp, #8] +@ int height) [sp, #12] + -+function ff_hevc_sao_edge_c_16_neon_8, export=1 ++function ff_hevc_rpi_sao_edge_c_16_neon_8, export=1 + edge_64b_init 8, 1, 0, 99f +99: + edge_32bx2_bodies edge_64b_body_8, 2 +endfunc + -+@ ff_hevc_sao_edge_c_32_neon_8( ++@ ff_hevc_rpi_sao_edge_c_32_neon_8( +@ uint8_t *_dst, [r0] +@ const uint8_t *_src, [r1] +@ ptrdiff_t stride_dst, [r2] @@ -6831,13 +8175,13 @@ index 0000000000..30113d9c93 +@ int width, [sp, #8] +@ int height) [sp, #12] + -+function ff_hevc_sao_edge_c_32_neon_8, export=1 ++function ff_hevc_rpi_sao_edge_c_32_neon_8, export=1 + edge_64b_init 8, 1, 0, 99f +99: + edge_64b_bodies edge_64b_body_8, 2 +endfunc + -+@ void ff_hevc_sao_edge_8_neon_10( ++@ void ff_hevc_rpi_sao_edge_8_neon_10( +@ uint8_t *_dst, [r0] +@ uint8_t *_src, [r1] +@ int stride_dst, [r2] @@ -6846,13 +8190,13 @@ index 0000000000..30113d9c93 +@ int width, [sp, #4] +@ int height) [sp, #8] + -+function ff_hevc_sao_edge_8_neon_10, export=1 ++function ff_hevc_rpi_sao_edge_8_neon_10, export=1 + edge_16b_init 10, 0, 1, 99f +99: + edge_16b_8bx2_bodies edge_16b_body_16, 2 +endfunc + -+@ void ff_hevc_sao_edge_16_neon_10( ++@ void ff_hevc_rpi_sao_edge_16_neon_10( +@ uint8_t *_dst, [r0] +@ uint8_t *_src, [r1] +@ int stride_dst, [r2] @@ -6861,13 +8205,13 @@ index 0000000000..30113d9c93 +@ int width, [sp, #4] +@ int height) [sp, #8] + -+function ff_hevc_sao_edge_16_neon_10, export=1 ++function ff_hevc_rpi_sao_edge_16_neon_10, export=1 + edge_64b_init 10, 0, 0, 99f +99: + edge_32bx2_bodies edge_64b_body_16, 2 +endfunc + -+@ void ff_hevc_sao_edge_64_neon_10( ++@ void ff_hevc_rpi_sao_edge_64_neon_10( +@ uint8_t *_dst, [r0] +@ uint8_t *_src, [r1] +@ int stride_dst, [r2] @@ -6882,11 +8226,11 @@ index 0000000000..30113d9c93 +@ Calling code will always have src != dst so we don't have to worry +@ about edge effects + -+function ff_hevc_sao_edge_64_neon_10, export=1 ++function ff_hevc_rpi_sao_edge_64_neon_10, export=1 + edge_64b_init 10, 0, 1, 99f +endfunc + -+@ void ff_hevc_sao_edge_32_neon_10( ++@ void ff_hevc_rpi_sao_edge_32_neon_10( +@ uint8_t *_dst, [r0] +@ uint8_t *_src, [r1] +@ int stride_dst, [r2] @@ -6895,13 +8239,13 @@ index 0000000000..30113d9c93 +@ int width, [sp, #4] +@ int height) [sp, #8] + -+function ff_hevc_sao_edge_32_neon_10, export=1 ++function ff_hevc_rpi_sao_edge_32_neon_10, export=1 + edge_64b_init 10, 0, 0, 99f +99: + edge_64b_bodies edge_64b_body_16, 2 +endfunc + -+@ ff_hevc_sao_edge_c_8_neon_10( ++@ 
ff_hevc_rpi_sao_edge_c_8_neon_10( +@ uint8_t *_dst, [r0] +@ const uint8_t *_src, [r1] +@ ptrdiff_t stride_dst, [r2] @@ -6911,13 +8255,13 @@ index 0000000000..30113d9c93 +@ int width, [sp, #8] +@ int height) [sp, #12] + -+function ff_hevc_sao_edge_c_8_neon_10, export=1 ++function ff_hevc_rpi_sao_edge_c_8_neon_10, export=1 + edge_xxb_init 10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1 +99: + edge_32bx2_16b_bodies edge_64b_body_16, edge_16b_body_16, 4 +endfunc + -+@ ff_hevc_sao_edge_c_32_neon_10( ++@ ff_hevc_rpi_sao_edge_c_32_neon_10( +@ uint8_t *_dst, [r0] +@ const uint8_t *_src, [r1] +@ ptrdiff_t stride_dst, [r2] @@ -6927,12 +8271,12 @@ index 0000000000..30113d9c93 +@ int width, [sp, #8] +@ int height) [sp, #12] + -+function ff_hevc_sao_edge_c_32_neon_10, export=1 ++function ff_hevc_rpi_sao_edge_c_32_neon_10, export=1 + edge_64b_init 10, 1, 1, 99f +endfunc + + -+@ ff_hevc_sao_edge_c_16_neon_10( ++@ ff_hevc_rpi_sao_edge_c_16_neon_10( +@ uint8_t *_dst, [r0] +@ const uint8_t *_src, [r1] +@ ptrdiff_t stride_dst, [r2] @@ -6942,44 +8286,17 @@ index 0000000000..30113d9c93 +@ int width, [sp, #8] +@ int height) [sp, #12] + -+function ff_hevc_sao_edge_c_16_neon_10, export=1 ++function ff_hevc_rpi_sao_edge_c_16_neon_10, export=1 + edge_64b_init 10, 1, 0, 99f +99: + edge_64b_bodies edge_64b_body_16, 4 +endfunc + diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h -index 18c3e3ea1e..6b380bbdf2 100644 +index 18c3e3ea1e..c26b6d607c 100644 --- a/libavcodec/avcodec.h +++ b/libavcodec/avcodec.h -@@ -449,6 +449,8 @@ enum AVCodecID { - AV_CODEC_ID_GDV, - AV_CODEC_ID_FITS, - -+ AV_CODEC_ID_H264_MVC, -+ - /* various PCM "codecs" */ - AV_CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs - AV_CODEC_ID_PCM_S16LE = 0x10000, -@@ -2965,6 +2967,7 @@ typedef struct AVCodecContext { - #define FF_BUG_MS 8192 ///< Work around various bugs in Microsoft's broken decoders. - #define FF_BUG_TRUNCATED 16384 - #define FF_BUG_IEDGE 32768 -+#define FF_BUG_GMC_UNSUPPORTED (1 << 16) - - /** - * strictly follow the standard (MPEG-4, ...). -@@ -3317,6 +3320,9 @@ typedef struct AVCodecContext { - #define FF_PROFILE_H264_HIGH_444_PREDICTIVE 244 - #define FF_PROFILE_H264_HIGH_444_INTRA (244|FF_PROFILE_H264_INTRA) - #define FF_PROFILE_H264_CAVLC_444 44 -+#define FF_PROFILE_H264_MULTIVIEW_HIGH 118 -+#define FF_PROFILE_H264_STEREO_HIGH 128 -+#define FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH 138 - - #define FF_PROFILE_VC1_SIMPLE 0 - #define FF_PROFILE_VC1_MAIN 1 -@@ -3627,7 +3633,13 @@ typedef struct AVCodecContext { +@@ -3627,7 +3627,13 @@ typedef struct AVCodecContext { #endif /** @@ -6994,6 +8311,24 @@ index 18c3e3ea1e..6b380bbdf2 100644 * the end of the audio. I.e. this number of decoded samples must be * discarded by the caller from the end of the stream to get the original * audio without any trailing padding. +@@ -4816,6 +4822,17 @@ void av_packet_rescale_ts(AVPacket *pkt, AVRational tb_src, AVRational tb_dst); + */ + AVCodec *avcodec_find_decoder(enum AVCodecID id); + ++/** ++ * Find a registered decoder with a matching codec ID and pix_fmt. ++ * A decoder will pix_fmt set to NULL will match any fmt. ++ * A fmt of AV_PIX_FMT_NONE will only match a decoder will px_fmt NULL. ++ * ++ * @param id AVCodecID of the requested decoder ++ * @param fmt AVPixelForma that msut be supported by decoder ++ * @return A decoder if one was found, NULL otherwise. 
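++ * (That is: a decoder whose pix_fmt is left NULL matches any requested
++ * fmt, and asking for AV_PIX_FMT_NONE matches only those decoders.)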
++ */ ++AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt); ++ + /** + * Find a registered decoder with the specified name. + * diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h index 1bf1c620d6..ccfa991f60 100644 --- a/libavcodec/cabac.h @@ -7030,10 +8365,10 @@ index af0f6da2e9..bd491c0c55 100644 AVCodecContext *avctx; BswapDSPContext bdsp; diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c -index 6a13bbbf0e..ff10f3b2bc 100644 +index 6a13bbbf0e..478b7c0ffc 100644 --- a/libavcodec/codec_desc.c +++ b/libavcodec/codec_desc.c -@@ -1665,6 +1665,48 @@ static const AVCodecDescriptor codec_descriptors[] = { +@@ -1665,6 +1665,41 @@ static const AVCodecDescriptor codec_descriptors[] = { .props = AV_CODEC_PROP_LOSSLESS, .mime_types= MT("image/png"), }, @@ -7071,202 +8406,194 @@ index 6a13bbbf0e..ff10f3b2bc 100644 + .name = "ylc", + .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, -+ }, -+ { -+ .id = AV_CODEC_ID_H264_MVC, -+ .type = AVMEDIA_TYPE_VIDEO, -+ .name = "h264_mvc", -+ .long_name = NULL_IF_CONFIG_SMALL("H264 MVC"), -+ .props = AV_CODEC_PROP_LOSSY, + }, /* various PCM "codecs" */ { -diff --git a/libavcodec/h264.h b/libavcodec/h264.h -index 86df5eb9b3..22c4f1d82a 100644 ---- a/libavcodec/h264.h -+++ b/libavcodec/h264.h -@@ -41,7 +41,9 @@ enum { - H264_NAL_END_STREAM = 11, - H264_NAL_FILLER_DATA = 12, - H264_NAL_SPS_EXT = 13, -+ H264_NAL_SPS_SUBSET = 15, - H264_NAL_AUXILIARY_SLICE = 19, -+ H264_NAL_SLICE_EXT = 20, - }; - - #endif /* AVCODEC_H264_H */ diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c -index dd0a965af0..5e43def0e9 100644 +index dd0a965af0..053325c26b 100644 --- a/libavcodec/h264_parser.c +++ b/libavcodec/h264_parser.c -@@ -60,6 +60,8 @@ typedef struct H264ParseContext { - uint8_t parse_history[6]; - int parse_history_count; - int parse_last_mb; -+ int is_mvc; -+ int slice_ext; - int64_t reference_dts; - int last_frame_num, last_picture_structure; - } H264ParseContext; -@@ -109,24 +111,27 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf, - } else if (state <= 5) { - int nalu_type = buf[i] & 0x1F; - if (nalu_type == H264_NAL_SEI || nalu_type == H264_NAL_SPS || -- nalu_type == H264_NAL_PPS || nalu_type == H264_NAL_AUD) { -+ nalu_type == H264_NAL_PPS || nalu_type == H264_NAL_AUD || -+ nalu_type == H264_NAL_SPS_SUBSET) { - if (pc->frame_start_found) { - i++; +@@ -115,7 +115,7 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf, goto found; } } else if (nalu_type == H264_NAL_SLICE || nalu_type == H264_NAL_DPA || - nalu_type == H264_NAL_IDR_SLICE) { -+ nalu_type == H264_NAL_IDR_SLICE || (p->is_mvc && nalu_type == H264_NAL_SLICE_EXT)) { ++ nalu_type == H264_NAL_IDR_SLICE)) { state += 8; -+ -+ p->slice_ext = (nalu_type == H264_NAL_SLICE_EXT); continue; } - state = 7; - } else { - p->parse_history[p->parse_history_count++] = buf[i]; -- if (p->parse_history_count > 5) { -+ if (p->parse_history_count > 8) { - unsigned int mb, last_mb = p->parse_last_mb; - GetBitContext gb; - -- init_get_bits(&gb, p->parse_history, 8*p->parse_history_count); -+ init_get_bits8(&gb, p->parse_history + 3*p->slice_ext, p->parse_history_count - 3*p->slice_ext); - p->parse_history_count = 0; - mb= get_ue_golomb_long(&gb); - p->parse_last_mb = mb; -@@ -149,7 +154,7 @@ found: - pc->frame_start_found = 0; - if (p->is_avc) - return next_avc; -- return i - (state & 5) - 5 * (state > 7); -+ return i - (state & 5) - 8 * (state > 7); - } 
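The avcodec_find_decoder_by_id_and_fmt() helper declared in the avcodec.h hunk above lets a caller prefer a decoder by the pixel format it emits. A hypothetical caller-side fragment — the HEVC/SAND pairing here is an assumption based on the SAND formats this patch set registers in raw.c below:

    /* Sketch: prefer a decoder that outputs the Pi's SAND layout, falling
     * back to the stock lookup if no such decoder is registered. */
    AVCodec *dec = avcodec_find_decoder_by_id_and_fmt(AV_CODEC_ID_HEVC,
                                                      AV_PIX_FMT_SAND128);
    if (!dec)
        dec = avcodec_find_decoder(AV_CODEC_ID_HEVC); /* any pix_fmt */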
+diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c +index 0b1195dc3e..5ef81fa739 100644 +--- a/libavcodec/mmaldec.c ++++ b/libavcodec/mmaldec.c +@@ -24,6 +24,9 @@ + * MMAL Video Decoder + */ - static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb, -@@ -594,7 +599,8 @@ static int h264_parse(AVCodecParserContext *s, - } - } ++#pragma GCC diagnostic push ++// Many many redundant decls in the header files ++#pragma GCC diagnostic ignored "-Wredundant-decls" + #include + #include + #include +@@ -31,6 +34,7 @@ + #include + #include + #include ++#pragma GCC diagnostic pop + #include -- parse_nal_units(s, avctx, buf, buf_size); -+ if (!p->is_mvc) -+ parse_nal_units(s, avctx, buf, buf_size); + #include "avcodec.h" +diff --git a/libavcodec/raw.c b/libavcodec/raw.c +index 8da2a9735e..9089f9b4ea 100644 +--- a/libavcodec/raw.c ++++ b/libavcodec/raw.c +@@ -283,6 +283,10 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { + { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') }, + { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') }, - if (avctx->framerate.num) - avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1})); -@@ -651,7 +657,7 @@ static int h264_split(AVCodecContext *avctx, - if ((state & 0xFFFFFF00) != 0x100) - break; - nalu_type = state & 0x1F; -- if (nalu_type == H264_NAL_SPS) { -+ if (nalu_type == H264_NAL_SPS || nalu_type == H264_NAL_SPS_SUBSET) { - has_sps = 1; - } else if (nalu_type == H264_NAL_PPS) - has_pps = 1; -@@ -703,3 +709,23 @@ AVCodecParser ff_h264_parser = { - .parser_close = h264_close, - .split = h264_split, - }; ++ /* RPI (Might as well define for everything) */ ++ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, ++ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') }, + -+static av_cold int init_mvc(AVCodecParserContext *s) + /* special */ + { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ + { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ +diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c +index d181b74570..76e844caa8 100644 +--- a/libavcodec/rawenc.c ++++ b/libavcodec/rawenc.c +@@ -24,6 +24,7 @@ + * Raw Video Encoder + */ + ++#include "config.h" + #include "avcodec.h" + #include "raw.h" + #include "internal.h" +@@ -31,6 +32,10 @@ + #include "libavutil/intreadwrite.h" + #include "libavutil/imgutils.h" + #include "libavutil/internal.h" ++#include "libavutil/avassert.h" ++#if CONFIG_RPI ++#include "libavutil/rpi_sand_fns.h" ++#endif + + static av_cold int raw_encode_init(AVCodecContext *avctx) + { +@@ -49,6 +54,55 @@ FF_ENABLE_DEPRECATION_WARNINGS + return 0; + } + ++#if CONFIG_RPI ++static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) +{ -+ H264ParseContext *p = s->priv_data; -+ int ret = init(s); -+ if (ret < 0) ++ const int width = av_frame_cropped_width(frame); ++ const int height = av_frame_cropped_height(frame); ++ const int x0 = frame->crop_left; ++ const int y0 = frame->crop_top; ++ const int size = width * height * 3 / 2; ++ uint8_t * dst; ++ int ret; ++ ++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) + return ret; + -+ p->is_mvc = 1; ++ dst = pkt->data; ++ ++ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); ++ dst += width * height; ++ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2); + 
return 0; +} + -+AVCodecParser ff_h264_mvc_parser = { -+ .codec_ids = { AV_CODEC_ID_H264_MVC }, -+ .priv_data_size = sizeof(H264ParseContext), -+ .parser_init = init_mvc, -+ .parser_parse = h264_parse, -+ .parser_close = h264_close, -+ .split = h264_split, -+}; -diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h -index f0fb919a7f..f3e3732ce1 100644 ---- a/libavcodec/hevc.h -+++ b/libavcodec/hevc.h -@@ -21,6 +21,49 @@ - #ifndef AVCODEC_HEVC_H - #define AVCODEC_HEVC_H - -+// define RPI to split the CABAC/prediction/transform into separate stages -+#ifndef RPI -+ -+ #define RPI_INTER 0 -+ #define RPI_TSTATS 0 -+ #define RPI_HEVC_SAND 0 -+ -+#else ++static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) ++{ ++ const int width = av_frame_cropped_width(frame); ++ const int height = av_frame_cropped_height(frame); ++ const int x0 = frame->crop_left; ++ const int y0 = frame->crop_top; ++ const int size = width * height * 3; ++ uint8_t * dst; ++ int ret; + -+ #include "rpi_qpu.h" -+ #define RPI_INTER 1 // 0 use ARM for UV inter-pred, 1 use QPU ++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) ++ return ret; + -+ // By passing jobs to a worker thread we hope to be able to catch up during slow frames -+ // This has no effect unless RPI_WORKER is defined -+ // N.B. The extra thread count is effectively RPI_MAX_JOBS - 1 as -+ // RPI_MAX_JOBS defines the number of worker parameter sets and we must have one -+ // free for the foreground to fill in. -+ #define RPI_MAX_JOBS 8 ++ dst = pkt->data; + -+ // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs -+ // As it stands there is something mildy broken in VPU deblock - looks mostly OK -+ // but reliably fails some conformance tests (e.g. DBLK_A/B/C_) -+ // With VPU luma & chroma pred it is much the same speed to deblock on the ARM -+ // -+ // * Whilst most of the code still exists it will have rotted by now -+// #define RPI_DEBLOCK_VPU ++ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height); ++ dst += width * height * 2; ++ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2); ++ return 0; ++} ++#endif + -+ #define RPI_VPU_DEBLOCK_CACHED 1 + -+ #if HAVE_NEON -+ #define RPI_HEVC_SAND 1 -+ #else -+ // Sand bust on Pi1 currently - reasons unknown -+ #define RPI_HEVC_SAND 0 -+ #endif + static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *frame, int *got_packet) + { +@@ -58,6 +112,14 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, + if (ret < 0) + return ret; + ++#if CONFIG_RPI ++ if (av_rpi_is_sand_frame(frame)) { ++ ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : raw_sand16_as_yuv420(avctx, pkt, frame); ++ *got_packet = (ret == 0); ++ return ret; ++ } ++#endif + + if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0) + return ret; + if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, +diff --git a/libavcodec/rpi_hevc_cabac.c b/libavcodec/rpi_hevc_cabac.c +new file mode 100644 +index 0000000000..e498c1a3eb +--- /dev/null ++++ b/libavcodec/rpi_hevc_cabac.c +@@ -0,0 +1,2381 @@ ++/* ++ * HEVC CABAC decoding ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * ++ * This file is part of FFmpeg. 
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + -+ #define RPI_QPU_EMU_Y 0 -+ #define RPI_QPU_EMU_C 0 ++#define UNCHECKED_BITSTREAM_READER 1 + -+ #define RPI_TSTATS 0 -+#endif ++#include "libavutil/attributes.h" ++#include "libavutil/common.h" + - /** - * Table 7-3: NAL unit type codes - */ -diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c -index 853fd3f722..e8e6ad3b1a 100644 ---- a/libavcodec/hevc_cabac.c -+++ b/libavcodec/hevc_cabac.c -@@ -21,6 +21,8 @@ - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -+#define UNCHECKED_BITSTREAM_READER 1 ++#include "cabac_functions.h" ++#include "rpi_hevc_data.h" ++#include "hevc.h" ++#include "rpi_hevcdec.h" + - #include "libavutil/attributes.h" - #include "libavutil/common.h" - -@@ -29,8 +31,68 @@ - #include "hevc.h" - #include "hevcdec.h" - -+#ifdef RPI +#include "libavutil/rpi_sand_fns.h" -+#endif + +// BY22 is probably faster than simple bypass if the processor has +// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction @@ -7285,11 +8612,11 @@ index 853fd3f722..e8e6ad3b1a 100644 +#define USE_N_END_1 1 + +#if ARCH_ARM -+#include "arm/hevc_cabac.h" ++#include "arm/rpi_hevc_cabac.h" +#endif + - #define CABAC_MAX_BIN 31 - ++#define CABAC_MAX_BIN 31 ++ + +#if USE_BY22 && !USE_BY22_DIV +#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL) @@ -7326,14 +8653,422 @@ index 853fd3f722..e8e6ad3b1a 100644 +#undef I +#endif // USE_BY22 + - /** - * number of bin by SyntaxElement. - */ -@@ -447,19 +509,227 @@ static const uint8_t diag_scan8x8_inv[8][8] = { - { 28, 36, 43, 49, 54, 58, 61, 63, }, - }; - --void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts) ++/** ++ * number of bin by SyntaxElement. 
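++ * A count of 0 means the element owns no CABAC context states (its
++ * bins are bypass- or terminate-coded).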
++ */ ++static const int8_t num_bins_in_se[] = { ++ 1, // sao_merge_flag ++ 1, // sao_type_idx ++ 0, // sao_eo_class ++ 0, // sao_band_position ++ 0, // sao_offset_abs ++ 0, // sao_offset_sign ++ 0, // end_of_slice_flag ++ 3, // split_coding_unit_flag ++ 1, // cu_transquant_bypass_flag ++ 3, // skip_flag ++ 3, // cu_qp_delta ++ 1, // pred_mode ++ 4, // part_mode ++ 0, // pcm_flag ++ 1, // prev_intra_luma_pred_mode ++ 0, // mpm_idx ++ 0, // rem_intra_luma_pred_mode ++ 2, // intra_chroma_pred_mode ++ 1, // merge_flag ++ 1, // merge_idx ++ 5, // inter_pred_idc ++ 2, // ref_idx_l0 ++ 2, // ref_idx_l1 ++ 2, // abs_mvd_greater0_flag ++ 2, // abs_mvd_greater1_flag ++ 0, // abs_mvd_minus2 ++ 0, // mvd_sign_flag ++ 1, // mvp_lx_flag ++ 1, // no_residual_data_flag ++ 3, // split_transform_flag ++ 2, // cbf_luma ++ 4, // cbf_cb, cbf_cr ++ 2, // transform_skip_flag[][] ++ 2, // explicit_rdpcm_flag[][] ++ 2, // explicit_rdpcm_dir_flag[][] ++ 18, // last_significant_coeff_x_prefix ++ 18, // last_significant_coeff_y_prefix ++ 0, // last_significant_coeff_x_suffix ++ 0, // last_significant_coeff_y_suffix ++ 4, // significant_coeff_group_flag ++ 44, // significant_coeff_flag ++ 24, // coeff_abs_level_greater1_flag ++ 6, // coeff_abs_level_greater2_flag ++ 0, // coeff_abs_level_remaining ++ 0, // coeff_sign_flag ++ 8, // log2_res_scale_abs ++ 2, // res_scale_sign_flag ++ 1, // cu_chroma_qp_offset_flag ++ 1, // cu_chroma_qp_offset_idx ++}; ++ ++/** ++ * Offset to ctxIdx 0 in init_values and states, indexed by SyntaxElement. ++ */ ++static const int elem_offset[sizeof(num_bins_in_se)] = { ++ 0, // sao_merge_flag ++ 1, // sao_type_idx ++ 2, // sao_eo_class ++ 2, // sao_band_position ++ 2, // sao_offset_abs ++ 2, // sao_offset_sign ++ 2, // end_of_slice_flag ++ 2, // split_coding_unit_flag ++ 5, // cu_transquant_bypass_flag ++ 6, // skip_flag ++ 9, // cu_qp_delta ++ 12, // pred_mode ++ 13, // part_mode ++ 17, // pcm_flag ++ 17, // prev_intra_luma_pred_mode ++ 18, // mpm_idx ++ 18, // rem_intra_luma_pred_mode ++ 18, // intra_chroma_pred_mode ++ 20, // merge_flag ++ 21, // merge_idx ++ 22, // inter_pred_idc ++ 27, // ref_idx_l0 ++ 29, // ref_idx_l1 ++ 31, // abs_mvd_greater0_flag ++ 33, // abs_mvd_greater1_flag ++ 35, // abs_mvd_minus2 ++ 35, // mvd_sign_flag ++ 35, // mvp_lx_flag ++ 36, // no_residual_data_flag ++ 37, // split_transform_flag ++ 40, // cbf_luma ++ 42, // cbf_cb, cbf_cr ++ 46, // transform_skip_flag[][] ++ 48, // explicit_rdpcm_flag[][] ++ 50, // explicit_rdpcm_dir_flag[][] ++ 52, // last_significant_coeff_x_prefix ++ 70, // last_significant_coeff_y_prefix ++ 88, // last_significant_coeff_x_suffix ++ 88, // last_significant_coeff_y_suffix ++ 88, // significant_coeff_group_flag ++ 92, // significant_coeff_flag ++ 136, // coeff_abs_level_greater1_flag ++ 160, // coeff_abs_level_greater2_flag ++ 166, // coeff_abs_level_remaining ++ 166, // coeff_sign_flag ++ 166, // log2_res_scale_abs ++ 174, // res_scale_sign_flag ++ 176, // cu_chroma_qp_offset_flag ++ 177, // cu_chroma_qp_offset_idx ++}; ++ ++#define CNU 154 ++/** ++ * Indexed by init_type ++ */ ++static const uint8_t init_values[3][HEVC_CONTEXTS] = { ++ { // sao_merge_flag ++ 153, ++ // sao_type_idx ++ 200, ++ // split_coding_unit_flag ++ 139, 141, 157, ++ // cu_transquant_bypass_flag ++ 154, ++ // skip_flag ++ CNU, CNU, CNU, ++ // cu_qp_delta ++ 154, 154, 154, ++ // pred_mode ++ CNU, ++ // part_mode ++ 184, CNU, CNU, CNU, ++ // prev_intra_luma_pred_mode ++ 184, ++ // intra_chroma_pred_mode ++ 63, 139, ++ // merge_flag ++ CNU, ++ // merge_idx ++ 
CNU, ++ // inter_pred_idc ++ CNU, CNU, CNU, CNU, CNU, ++ // ref_idx_l0 ++ CNU, CNU, ++ // ref_idx_l1 ++ CNU, CNU, ++ // abs_mvd_greater1_flag ++ CNU, CNU, ++ // abs_mvd_greater1_flag ++ CNU, CNU, ++ // mvp_lx_flag ++ CNU, ++ // no_residual_data_flag ++ CNU, ++ // split_transform_flag ++ 153, 138, 138, ++ // cbf_luma ++ 111, 141, ++ // cbf_cb, cbf_cr ++ 94, 138, 182, 154, ++ // transform_skip_flag ++ 139, 139, ++ // explicit_rdpcm_flag ++ 139, 139, ++ // explicit_rdpcm_dir_flag ++ 139, 139, ++ // last_significant_coeff_x_prefix ++ 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111, ++ 79, 108, 123, 63, ++ // last_significant_coeff_y_prefix ++ 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111, ++ 79, 108, 123, 63, ++ // significant_coeff_group_flag ++ 91, 171, 134, 141, ++ // significant_coeff_flag ++ 111, 111, 125, 110, 110, 94, 124, 108, 124, 107, 125, 141, 179, 153, ++ 125, 107, 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 140, ++ 139, 182, 182, 152, 136, 152, 136, 153, 136, 139, 111, 136, 139, 111, ++ 141, 111, ++ // coeff_abs_level_greater1_flag ++ 140, 92, 137, 138, 140, 152, 138, 139, 153, 74, 149, 92, 139, 107, ++ 122, 152, 140, 179, 166, 182, 140, 227, 122, 197, ++ // coeff_abs_level_greater2_flag ++ 138, 153, 136, 167, 152, 152, ++ // log2_res_scale_abs ++ 154, 154, 154, 154, 154, 154, 154, 154, ++ // res_scale_sign_flag ++ 154, 154, ++ // cu_chroma_qp_offset_flag ++ 154, ++ // cu_chroma_qp_offset_idx ++ 154, ++ }, ++ { // sao_merge_flag ++ 153, ++ // sao_type_idx ++ 185, ++ // split_coding_unit_flag ++ 107, 139, 126, ++ // cu_transquant_bypass_flag ++ 154, ++ // skip_flag ++ 197, 185, 201, ++ // cu_qp_delta ++ 154, 154, 154, ++ // pred_mode ++ 149, ++ // part_mode ++ 154, 139, 154, 154, ++ // prev_intra_luma_pred_mode ++ 154, ++ // intra_chroma_pred_mode ++ 152, 139, ++ // merge_flag ++ 110, ++ // merge_idx ++ 122, ++ // inter_pred_idc ++ 95, 79, 63, 31, 31, ++ // ref_idx_l0 ++ 153, 153, ++ // ref_idx_l1 ++ 153, 153, ++ // abs_mvd_greater1_flag ++ 140, 198, ++ // abs_mvd_greater1_flag ++ 140, 198, ++ // mvp_lx_flag ++ 168, ++ // no_residual_data_flag ++ 79, ++ // split_transform_flag ++ 124, 138, 94, ++ // cbf_luma ++ 153, 111, ++ // cbf_cb, cbf_cr ++ 149, 107, 167, 154, ++ // transform_skip_flag ++ 139, 139, ++ // explicit_rdpcm_flag ++ 139, 139, ++ // explicit_rdpcm_dir_flag ++ 139, 139, ++ // last_significant_coeff_x_prefix ++ 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95, ++ 94, 108, 123, 108, ++ // last_significant_coeff_y_prefix ++ 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95, ++ 94, 108, 123, 108, ++ // significant_coeff_group_flag ++ 121, 140, 61, 154, ++ // significant_coeff_flag ++ 155, 154, 139, 153, 139, 123, 123, 63, 153, 166, 183, 140, 136, 153, ++ 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, ++ 153, 123, 123, 107, 121, 107, 121, 167, 151, 183, 140, 151, 183, 140, ++ 140, 140, ++ // coeff_abs_level_greater1_flag ++ 154, 196, 196, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121, ++ 136, 137, 169, 194, 166, 167, 154, 167, 137, 182, ++ // coeff_abs_level_greater2_flag ++ 107, 167, 91, 122, 107, 167, ++ // log2_res_scale_abs ++ 154, 154, 154, 154, 154, 154, 154, 154, ++ // res_scale_sign_flag ++ 154, 154, ++ // cu_chroma_qp_offset_flag ++ 154, ++ // cu_chroma_qp_offset_idx ++ 154, ++ }, ++ { // sao_merge_flag ++ 153, ++ // sao_type_idx ++ 160, ++ // split_coding_unit_flag ++ 107, 139, 126, ++ // cu_transquant_bypass_flag ++ 154, ++ // skip_flag ++ 197, 185, 
201, ++ // cu_qp_delta ++ 154, 154, 154, ++ // pred_mode ++ 134, ++ // part_mode ++ 154, 139, 154, 154, ++ // prev_intra_luma_pred_mode ++ 183, ++ // intra_chroma_pred_mode ++ 152, 139, ++ // merge_flag ++ 154, ++ // merge_idx ++ 137, ++ // inter_pred_idc ++ 95, 79, 63, 31, 31, ++ // ref_idx_l0 ++ 153, 153, ++ // ref_idx_l1 ++ 153, 153, ++ // abs_mvd_greater1_flag ++ 169, 198, ++ // abs_mvd_greater1_flag ++ 169, 198, ++ // mvp_lx_flag ++ 168, ++ // no_residual_data_flag ++ 79, ++ // split_transform_flag ++ 224, 167, 122, ++ // cbf_luma ++ 153, 111, ++ // cbf_cb, cbf_cr ++ 149, 92, 167, 154, ++ // transform_skip_flag ++ 139, 139, ++ // explicit_rdpcm_flag ++ 139, 139, ++ // explicit_rdpcm_dir_flag ++ 139, 139, ++ // last_significant_coeff_x_prefix ++ 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111, ++ 79, 108, 123, 93, ++ // last_significant_coeff_y_prefix ++ 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111, ++ 79, 108, 123, 93, ++ // significant_coeff_group_flag ++ 121, 140, 61, 154, ++ // significant_coeff_flag ++ 170, 154, 139, 153, 139, 123, 123, 63, 124, 166, 183, 140, 136, 153, ++ 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, ++ 153, 138, 138, 122, 121, 122, 121, 167, 151, 183, 140, 151, 183, 140, ++ 140, 140, ++ // coeff_abs_level_greater1_flag ++ 154, 196, 167, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121, ++ 136, 122, 169, 208, 166, 167, 154, 152, 167, 182, ++ // coeff_abs_level_greater2_flag ++ 107, 167, 91, 107, 107, 167, ++ // log2_res_scale_abs ++ 154, 154, 154, 154, 154, 154, 154, 154, ++ // res_scale_sign_flag ++ 154, 154, ++ // cu_chroma_qp_offset_flag ++ 154, ++ // cu_chroma_qp_offset_idx ++ 154, ++ }, ++}; ++ ++static const uint8_t scan_1x1[1] = { ++ 0, ++}; ++ ++static const uint8_t horiz_scan2x2_x[4] = { ++ 0, 1, 0, 1, ++}; ++ ++static const uint8_t horiz_scan2x2_y[4] = { ++ 0, 0, 1, 1 ++}; ++ ++static const uint8_t horiz_scan4x4_x[16] = { ++ 0, 1, 2, 3, ++ 0, 1, 2, 3, ++ 0, 1, 2, 3, ++ 0, 1, 2, 3, ++}; ++ ++static const uint8_t horiz_scan4x4_y[16] = { ++ 0, 0, 0, 0, ++ 1, 1, 1, 1, ++ 2, 2, 2, 2, ++ 3, 3, 3, 3, ++}; ++ ++static const uint8_t horiz_scan8x8_inv[8][8] = { ++ { 0, 1, 2, 3, 16, 17, 18, 19, }, ++ { 4, 5, 6, 7, 20, 21, 22, 23, }, ++ { 8, 9, 10, 11, 24, 25, 26, 27, }, ++ { 12, 13, 14, 15, 28, 29, 30, 31, }, ++ { 32, 33, 34, 35, 48, 49, 50, 51, }, ++ { 36, 37, 38, 39, 52, 53, 54, 55, }, ++ { 40, 41, 42, 43, 56, 57, 58, 59, }, ++ { 44, 45, 46, 47, 60, 61, 62, 63, }, ++}; ++ ++static const uint8_t diag_scan2x2_x[4] = { ++ 0, 0, 1, 1, ++}; ++ ++static const uint8_t diag_scan2x2_y[4] = { ++ 0, 1, 0, 1, ++}; ++ ++static const uint8_t diag_scan2x2_inv[2][2] = { ++ { 0, 2, }, ++ { 1, 3, }, ++}; ++ ++static const uint8_t diag_scan4x4_inv[4][4] = { ++ { 0, 2, 5, 9, }, ++ { 1, 4, 8, 12, }, ++ { 3, 7, 11, 14, }, ++ { 6, 10, 13, 15, }, ++}; ++ ++static const uint8_t diag_scan8x8_inv[8][8] = { ++ { 0, 2, 5, 9, 14, 20, 27, 35, }, ++ { 1, 4, 8, 13, 19, 26, 34, 42, }, ++ { 3, 7, 12, 18, 25, 33, 41, 48, }, ++ { 6, 11, 17, 24, 32, 40, 47, 53, }, ++ { 10, 16, 23, 31, 39, 46, 52, 57, }, ++ { 15, 22, 30, 38, 45, 51, 56, 60, }, ++ { 21, 29, 37, 44, 50, 55, 59, 62, }, ++ { 28, 36, 43, 49, 54, 58, 61, 63, }, ++}; ++ + +typedef struct +{ @@ -7460,7 +9195,7 @@ index 853fd3f722..e8e6ad3b1a 100644 +// into the correct state. _by22_finish must be called to return to 'normal' +// (i.e. 
non-bypass) cabac decoding +static inline void get_cabac_by22_start(CABACContext * const c) - { ++{ + const unsigned int bits = __builtin_ctz(c->low); + const uint32_t m = hevc_mem_bits32(c->bytestream, 0); + uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits)); @@ -7539,116 +9274,94 @@ index 853fd3f722..e8e6ad3b1a 100644 +#endif // USE_BY22 + + -+void ff_hevc_save_states(HEVCContext *s, const HEVCLocalContext * const lc, int ctb_addr_ts) ++void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc, int ctb_addr_ts) +{ + // ???? Does this work with tiles + WPP? (No) + // **** Need to save rice state too + // pred_qpy is handled by get_qPy_pred and lc->first_qp_group - if (s->ps.pps->entropy_coding_sync_enabled_flag && - (ctb_addr_ts % s->ps.sps->ctb_width == 2 || - (s->ps.sps->ctb_width == 2 && - ctb_addr_ts % s->ps.sps->ctb_width == 0))) { -- memcpy(s->cabac_state, s->HEVClc->cabac_state, HEVC_CONTEXTS); ++ if (s->ps.pps->entropy_coding_sync_enabled_flag && ++ (ctb_addr_ts % s->ps.sps->ctb_width == 2 || ++ (s->ps.sps->ctb_width == 2 && ++ ctb_addr_ts % s->ps.sps->ctb_width == 0))) { + memcpy(s->cabac_state, lc->cabac_state, HEVC_CONTEXTS); - } - } - --static void load_states(HEVCContext *s) -+static void load_states(const HEVCContext * const s, HEVCLocalContext * const lc) - { -- memcpy(s->HEVClc->cabac_state, s->cabac_state, HEVC_CONTEXTS); ++ } ++} ++ ++static void load_states(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ + memcpy(lc->cabac_state, s->cabac_state, HEVC_CONTEXTS); - } - - static int cabac_reinit(HEVCLocalContext *lc) -@@ -467,17 +737,17 @@ static int cabac_reinit(HEVCLocalContext *lc) - return skip_bytes(&lc->cc, 0) == NULL ? AVERROR_INVALIDDATA : 0; - } - --static int cabac_init_decoder(HEVCContext *s) -+static int cabac_init_decoder(HEVCLocalContext * const lc) - { -- GetBitContext *gb = &s->HEVClc->gb; ++} ++ ++static int cabac_reinit(HEVCRpiLocalContext *lc) ++{ ++ return skip_bytes(&lc->cc, 0) == NULL ? 
AVERROR_INVALIDDATA : 0; ++} ++ ++static int cabac_init_decoder(HEVCRpiLocalContext * const lc) ++{ + GetBitContext * const gb = &lc->gb; - skip_bits(gb, 1); - align_get_bits(gb); -- return ff_init_cabac_decoder(&s->HEVClc->cc, ++ skip_bits(gb, 1); ++ align_get_bits(gb); + return ff_init_cabac_decoder(&lc->cc, - gb->buffer + get_bits_count(gb) / 8, - (get_bits_left(gb) + 7) / 8); - } - --static void cabac_init_state(HEVCContext *s) -+static void cabac_init_state(const HEVCContext * const s, HEVCLocalContext * const lc) - { - int init_type = 2 - s->sh.slice_type; - int i; -@@ -494,194 +764,204 @@ static void cabac_init_state(HEVCContext *s) - pre ^= pre >> 31; - if (pre > 124) - pre = 124 + (pre & 1); -- s->HEVClc->cabac_state[i] = pre; ++ gb->buffer + get_bits_count(gb) / 8, ++ (get_bits_left(gb) + 7) / 8); ++} ++ ++static void cabac_init_state(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ int init_type = 2 - s->sh.slice_type; ++ int i; ++ ++ if (s->sh.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I) ++ init_type ^= 3; ++ ++ for (i = 0; i < HEVC_CONTEXTS; i++) { ++ int init_value = init_values[init_type][i]; ++ int m = (init_value >> 4) * 5 - 45; ++ int n = ((init_value & 15) << 3) - 16; ++ int pre = 2 * (((m * av_clip(s->sh.slice_qp, 0, 51)) >> 4) + n) - 127; ++ ++ pre ^= pre >> 31; ++ if (pre > 124) ++ pre = 124 + (pre & 1); + lc->cabac_state[i] = pre; - } - - for (i = 0; i < 4; i++) -- s->HEVClc->stat_coeff[i] = 0; ++ } ++ ++ for (i = 0; i < 4; i++) + lc->stat_coeff[i] = 0; - } - --int ff_hevc_cabac_init(HEVCContext *s, int ctb_addr_ts) -+int ff_hevc_cabac_init(const HEVCContext * const s, HEVCLocalContext *const lc, int ctb_addr_ts) - { - if (ctb_addr_ts == s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]) { -- int ret = cabac_init_decoder(s); ++} ++ ++int ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, int ctb_addr_ts) ++{ ++ if (ctb_addr_ts == s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]) { + int ret = cabac_init_decoder(lc); - if (ret < 0) - return ret; - if (s->sh.dependent_slice_segment_flag == 0 || - (s->ps.pps->tiles_enabled_flag && - s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[ctb_addr_ts - 1])) -- cabac_init_state(s); ++ if (ret < 0) ++ return ret; ++ if (s->sh.dependent_slice_segment_flag == 0 || ++ (s->ps.pps->tiles_enabled_flag && ++ s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[ctb_addr_ts - 1])) + cabac_init_state(s, lc); - - if (!s->sh.first_slice_in_pic_flag && - s->ps.pps->entropy_coding_sync_enabled_flag) { - if (ctb_addr_ts % s->ps.sps->ctb_width == 0) { - if (s->ps.sps->ctb_width == 1) -- cabac_init_state(s); ++ ++ if (!s->sh.first_slice_in_pic_flag && ++ s->ps.pps->entropy_coding_sync_enabled_flag) { ++ if (ctb_addr_ts % s->ps.sps->ctb_width == 0) { ++ if (s->ps.sps->ctb_width == 1) + cabac_init_state(s, lc); - else if (s->sh.dependent_slice_segment_flag == 1) -- load_states(s); ++ else if (s->sh.dependent_slice_segment_flag == 1) + load_states(s, lc); - } - } - } else { - if (s->ps.pps->tiles_enabled_flag && - s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[ctb_addr_ts - 1]) { -- int ret; -- if (s->threads_number == 1) -- ret = cabac_reinit(s->HEVClc); -- else { -- ret = cabac_init_decoder(s); -- } -- if (ret < 0) -- return ret; -- cabac_init_state(s); -- } -- if (s->ps.pps->entropy_coding_sync_enabled_flag) { -- if (ctb_addr_ts % s->ps.sps->ctb_width == 0) { ++ } ++ } ++ } else { ++ if (s->ps.pps->tiles_enabled_flag && ++ s->ps.pps->tile_id[ctb_addr_ts] != 
s->ps.pps->tile_id[ctb_addr_ts - 1]) { + if (!lc->wpp_init) { - int ret; -- get_cabac_terminate(&s->HEVClc->cc); - if (s->threads_number == 1) -- ret = cabac_reinit(s->HEVClc); -- else { -- ret = cabac_init_decoder(s); -- } ++ int ret; ++ if (s->threads_number == 1) // **** Ummm... can only be 1 in our world but this is a wpp test + ret = cabac_reinit(lc); + else + ret = cabac_init_decoder(lc); - if (ret < 0) - return ret; ++ if (ret < 0) ++ return ret; + } + lc->wpp_init = 0; + @@ -7671,551 +9384,421 @@ index 853fd3f722..e8e6ad3b1a 100644 + return ret; + } + lc->wpp_init = 0; - - if (s->ps.sps->ctb_width == 1) -- cabac_init_state(s); ++ ++ if (s->ps.sps->ctb_width == 1) + cabac_init_state(s, lc); - else -- load_states(s); ++ else + load_states(s, lc); - } - } - } - return 0; - } - --#define GET_CABAC(ctx) get_cabac(&s->HEVClc->cc, &s->HEVClc->cabac_state[ctx]) -+#define GET_CABAC_LC(ctx) get_cabac(&lc->cc, lc->cabac_state + (ctx)) - --int ff_hevc_sao_merge_flag_decode(HEVCContext *s) -+int ff_hevc_sao_merge_flag_decode(HEVCLocalContext * const lc) - { -- return GET_CABAC(elem_offset[SAO_MERGE_FLAG]); -+ return get_cabac(&lc->cc, lc->cabac_state + elem_offset[SAO_MERGE_FLAG]); - } - --int ff_hevc_sao_type_idx_decode(HEVCContext *s) -+int ff_hevc_sao_type_idx_decode(HEVCLocalContext * const lc) - { -- if (!GET_CABAC(elem_offset[SAO_TYPE_IDX])) ++ } ++ } ++ } ++ return 0; ++} ++ ++#define GET_CABAC_LC(ctx) get_cabac(&lc->cc, lc->cabac_state + (ctx)) ++ ++int ff_hevc_rpi_sao_merge_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return get_cabac(&lc->cc, lc->cabac_state + elem_offset[SAO_MERGE_FLAG]); ++} ++ ++int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc) ++{ + if (!GET_CABAC_LC(elem_offset[SAO_TYPE_IDX])) - return 0; - -- if (!get_cabac_bypass(&s->HEVClc->cc)) ++ return 0; ++ + if (!get_cabac_bypass(&lc->cc)) - return SAO_BAND; - return SAO_EDGE; - } - --int ff_hevc_sao_band_position_decode(HEVCContext *s) -+int ff_hevc_sao_band_position_decode(HEVCLocalContext * const lc) - { - int i; -- int value = get_cabac_bypass(&s->HEVClc->cc); ++ return SAO_BAND; ++ return SAO_EDGE; ++} ++ ++int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc) ++{ ++ int i; + int value = get_cabac_bypass(&lc->cc); - - for (i = 0; i < 4; i++) -- value = (value << 1) | get_cabac_bypass(&s->HEVClc->cc); ++ ++ for (i = 0; i < 4; i++) + value = (value << 1) | get_cabac_bypass(&lc->cc); - return value; - } - --int ff_hevc_sao_offset_abs_decode(HEVCContext *s) -+int ff_hevc_sao_offset_abs_decode(const HEVCContext * const s, HEVCLocalContext * const lc) - { - int i = 0; - int length = (1 << (FFMIN(s->ps.sps->bit_depth, 10) - 5)) - 1; - -- while (i < length && get_cabac_bypass(&s->HEVClc->cc)) ++ return value; ++} ++ ++int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ int i = 0; ++ int length = (1 << (FFMIN(s->ps.sps->bit_depth, 10) - 5)) - 1; ++ + while (i < length && get_cabac_bypass(&lc->cc)) - i++; - return i; - } - --int ff_hevc_sao_offset_sign_decode(HEVCContext *s) -+int ff_hevc_sao_offset_sign_decode(HEVCLocalContext * const lc) - { -- return get_cabac_bypass(&s->HEVClc->cc); ++ i++; ++ return i; ++} ++ ++int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc) ++{ + return get_cabac_bypass(&lc->cc); - } - --int ff_hevc_sao_eo_class_decode(HEVCContext *s) -+int ff_hevc_sao_eo_class_decode(HEVCLocalContext * const lc) - { -- int ret = get_cabac_bypass(&s->HEVClc->cc) << 1; -- ret |= 
get_cabac_bypass(&s->HEVClc->cc); ++} ++ ++int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc) ++{ + int ret = get_cabac_bypass(&lc->cc) << 1; + ret |= get_cabac_bypass(&lc->cc); - return ret; - } - --int ff_hevc_end_of_slice_flag_decode(HEVCContext *s) -+int ff_hevc_end_of_slice_flag_decode(HEVCLocalContext * const lc) - { -- return get_cabac_terminate(&s->HEVClc->cc); ++ return ret; ++} ++ ++int ff_hevc_rpi_end_of_slice_flag_decode(HEVCRpiLocalContext * const lc) ++{ + return get_cabac_terminate(&lc->cc); - } - --int ff_hevc_cu_transquant_bypass_flag_decode(HEVCContext *s) -+int ff_hevc_cu_transquant_bypass_flag_decode(HEVCLocalContext * const lc) - { -- return GET_CABAC(elem_offset[CU_TRANSQUANT_BYPASS_FLAG]); ++} ++ ++int ff_hevc_rpi_cu_transquant_bypass_flag_decode(HEVCRpiLocalContext * const lc) ++{ + return GET_CABAC_LC(elem_offset[CU_TRANSQUANT_BYPASS_FLAG]); - } - --int ff_hevc_skip_flag_decode(HEVCContext *s, int x0, int y0, int x_cb, int y_cb) -+int ff_hevc_skip_flag_decode(const HEVCContext * const s, HEVCLocalContext * const lc, ++} ++ ++int ff_hevc_rpi_skip_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, + const int x0, const int y0, const int x_cb, const int y_cb) - { - int min_cb_width = s->ps.sps->min_cb_width; - int inc = 0; - int x0b = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size); - int y0b = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size); - -- if (s->HEVClc->ctb_left_flag || x0b) ++{ ++ int min_cb_width = s->ps.sps->min_cb_width; ++ int inc = 0; ++ int x0b = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size); ++ int y0b = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size); ++ + if (lc->ctb_left_flag || x0b) - inc = !!SAMPLE_CTB(s->skip_flag, x_cb - 1, y_cb); -- if (s->HEVClc->ctb_up_flag || y0b) ++ inc = !!SAMPLE_CTB(s->skip_flag, x_cb - 1, y_cb); + if (lc->ctb_up_flag || y0b) - inc += !!SAMPLE_CTB(s->skip_flag, x_cb, y_cb - 1); - -- return GET_CABAC(elem_offset[SKIP_FLAG] + inc); ++ inc += !!SAMPLE_CTB(s->skip_flag, x_cb, y_cb - 1); ++ + return GET_CABAC_LC(elem_offset[SKIP_FLAG] + inc); - } - --int ff_hevc_cu_qp_delta_abs(HEVCContext *s) -+int ff_hevc_cu_qp_delta_abs(HEVCLocalContext * const lc) - { - int prefix_val = 0; - int suffix_val = 0; - int inc = 0; - -- while (prefix_val < 5 && GET_CABAC(elem_offset[CU_QP_DELTA] + inc)) { ++} ++ ++int ff_hevc_rpi_cu_qp_delta_abs(HEVCRpiLocalContext * const lc) ++{ ++ int prefix_val = 0; ++ int suffix_val = 0; ++ int inc = 0; ++ + while (prefix_val < 5 && GET_CABAC_LC(elem_offset[CU_QP_DELTA] + inc)) { - prefix_val++; - inc = 1; - } - if (prefix_val >= 5) { - int k = 0; -- while (k < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc)) { ++ prefix_val++; ++ inc = 1; ++ } ++ if (prefix_val >= 5) { ++ int k = 0; + while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) { - suffix_val += 1 << k; - k++; - } -- if (k == CABAC_MAX_BIN) -- av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k); ++ suffix_val += 1 << k; ++ k++; ++ } +// if (k == CABAC_MAX_BIN) +// av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k); - - while (k--) -- suffix_val += get_cabac_bypass(&s->HEVClc->cc) << k; ++ ++ while (k--) + suffix_val += get_cabac_bypass(&lc->cc) << k; - } - return prefix_val + suffix_val; - } - --int ff_hevc_cu_qp_delta_sign_flag(HEVCContext *s) -+int ff_hevc_cu_qp_delta_sign_flag(HEVCLocalContext * const lc) - { -- return get_cabac_bypass(&s->HEVClc->cc); ++ } ++ return prefix_val + suffix_val; ++} ++ ++int ff_hevc_rpi_cu_qp_delta_sign_flag(HEVCRpiLocalContext * const lc) ++{ + return 
get_cabac_bypass(&lc->cc); - } - --int ff_hevc_cu_chroma_qp_offset_flag(HEVCContext *s) -+int ff_hevc_cu_chroma_qp_offset_flag(HEVCLocalContext * const lc) - { -- return GET_CABAC(elem_offset[CU_CHROMA_QP_OFFSET_FLAG]); ++} ++ ++int ff_hevc_rpi_cu_chroma_qp_offset_flag(HEVCRpiLocalContext * const lc) ++{ + return GET_CABAC_LC(elem_offset[CU_CHROMA_QP_OFFSET_FLAG]); - } - --int ff_hevc_cu_chroma_qp_offset_idx(HEVCContext *s) -+int ff_hevc_cu_chroma_qp_offset_idx(const HEVCContext * const s, HEVCLocalContext * const lc) - { - int c_max= FFMAX(5, s->ps.pps->chroma_qp_offset_list_len_minus1); - int i = 0; - -- while (i < c_max && GET_CABAC(elem_offset[CU_CHROMA_QP_OFFSET_IDX])) ++} ++ ++int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ int c_max= FFMAX(5, s->ps.pps->chroma_qp_offset_list_len_minus1); ++ int i = 0; ++ + while (i < c_max && GET_CABAC_LC(elem_offset[CU_CHROMA_QP_OFFSET_IDX])) - i++; - - return i; - } - --int ff_hevc_pred_mode_decode(HEVCContext *s) -+int ff_hevc_pred_mode_decode(HEVCLocalContext * const lc) - { -- return GET_CABAC(elem_offset[PRED_MODE_FLAG]); ++ i++; ++ ++ return i; ++} ++ ++int ff_hevc_rpi_pred_mode_decode(HEVCRpiLocalContext * const lc) ++{ + return GET_CABAC_LC(elem_offset[PRED_MODE_FLAG]); - } - --int ff_hevc_split_coding_unit_flag_decode(HEVCContext *s, int ct_depth, int x0, int y0) -+int ff_hevc_split_coding_unit_flag_decode(const HEVCContext * const s, HEVCLocalContext * const lc, int ct_depth, int x0, int y0) - { - int inc = 0, depth_left = 0, depth_top = 0; - int x0b = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size); -@@ -689,229 +969,232 @@ int ff_hevc_split_coding_unit_flag_decode(HEVCContext *s, int ct_depth, int x0, - int x_cb = x0 >> s->ps.sps->log2_min_cb_size; - int y_cb = y0 >> s->ps.sps->log2_min_cb_size; - -- if (s->HEVClc->ctb_left_flag || x0b) ++} ++ ++int ff_hevc_rpi_split_coding_unit_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int ct_depth, int x0, int y0) ++{ ++ int inc = 0, depth_left = 0, depth_top = 0; ++ int x0b = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size); ++ int y0b = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size); ++ int x_cb = x0 >> s->ps.sps->log2_min_cb_size; ++ int y_cb = y0 >> s->ps.sps->log2_min_cb_size; ++ + if (lc->ctb_left_flag || x0b) - depth_left = s->tab_ct_depth[(y_cb) * s->ps.sps->min_cb_width + x_cb - 1]; -- if (s->HEVClc->ctb_up_flag || y0b) ++ depth_left = s->tab_ct_depth[(y_cb) * s->ps.sps->min_cb_width + x_cb - 1]; + if (lc->ctb_up_flag || y0b) - depth_top = s->tab_ct_depth[(y_cb - 1) * s->ps.sps->min_cb_width + x_cb]; - - inc += (depth_left > ct_depth); - inc += (depth_top > ct_depth); - -- return GET_CABAC(elem_offset[SPLIT_CODING_UNIT_FLAG] + inc); ++ depth_top = s->tab_ct_depth[(y_cb - 1) * s->ps.sps->min_cb_width + x_cb]; ++ ++ inc += (depth_left > ct_depth); ++ inc += (depth_top > ct_depth); ++ + return GET_CABAC_LC(elem_offset[SPLIT_CODING_UNIT_FLAG] + inc); - } - --int ff_hevc_part_mode_decode(HEVCContext *s, int log2_cb_size) -+int ff_hevc_part_mode_decode(const HEVCContext * const s, HEVCLocalContext * const lc, const int log2_cb_size) - { -- if (GET_CABAC(elem_offset[PART_MODE])) // 1 ++} ++ ++int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size) ++{ + if (GET_CABAC_LC(elem_offset[PART_MODE])) // 1 - return PART_2Nx2N; - if (log2_cb_size == s->ps.sps->log2_min_cb_size) { -- if (s->HEVClc->cu.pred_mode == MODE_INTRA) // 0 ++ return PART_2Nx2N; ++ if 
(log2_cb_size == s->ps.sps->log2_min_cb_size) { + if (lc->cu.pred_mode == MODE_INTRA) // 0 - return PART_NxN; -- if (GET_CABAC(elem_offset[PART_MODE] + 1)) // 01 ++ return PART_NxN; + if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01 - return PART_2NxN; - if (log2_cb_size == 3) // 00 - return PART_Nx2N; -- if (GET_CABAC(elem_offset[PART_MODE] + 2)) // 001 ++ return PART_2NxN; ++ if (log2_cb_size == 3) // 00 ++ return PART_Nx2N; + if (GET_CABAC_LC(elem_offset[PART_MODE] + 2)) // 001 - return PART_Nx2N; - return PART_NxN; // 000 - } - - if (!s->ps.sps->amp_enabled_flag) { -- if (GET_CABAC(elem_offset[PART_MODE] + 1)) // 01 ++ return PART_Nx2N; ++ return PART_NxN; // 000 ++ } ++ ++ if (!s->ps.sps->amp_enabled_flag) { + if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01 - return PART_2NxN; - return PART_Nx2N; - } - -- if (GET_CABAC(elem_offset[PART_MODE] + 1)) { // 01X, 01XX -- if (GET_CABAC(elem_offset[PART_MODE] + 3)) // 011 ++ return PART_2NxN; ++ return PART_Nx2N; ++ } ++ + if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) { // 01X, 01XX + if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 011 - return PART_2NxN; -- if (get_cabac_bypass(&s->HEVClc->cc)) // 0101 ++ return PART_2NxN; + if (get_cabac_bypass(&lc->cc)) // 0101 - return PART_2NxnD; - return PART_2NxnU; // 0100 - } - -- if (GET_CABAC(elem_offset[PART_MODE] + 3)) // 001 ++ return PART_2NxnD; ++ return PART_2NxnU; // 0100 ++ } ++ + if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 001 - return PART_Nx2N; -- if (get_cabac_bypass(&s->HEVClc->cc)) // 0001 ++ return PART_Nx2N; + if (get_cabac_bypass(&lc->cc)) // 0001 - return PART_nRx2N; - return PART_nLx2N; // 0000 - } - --int ff_hevc_pcm_flag_decode(HEVCContext *s) -+int ff_hevc_pcm_flag_decode(HEVCLocalContext * const lc) - { -- return get_cabac_terminate(&s->HEVClc->cc); ++ return PART_nRx2N; ++ return PART_nLx2N; // 0000 ++} ++ ++int ff_hevc_rpi_pcm_flag_decode(HEVCRpiLocalContext * const lc) ++{ + return get_cabac_terminate(&lc->cc); - } - --int ff_hevc_prev_intra_luma_pred_flag_decode(HEVCContext *s) -+int ff_hevc_prev_intra_luma_pred_flag_decode(HEVCLocalContext * const lc) - { -- return GET_CABAC(elem_offset[PREV_INTRA_LUMA_PRED_FLAG]); ++} ++ ++int ff_hevc_rpi_prev_intra_luma_pred_flag_decode(HEVCRpiLocalContext * const lc) ++{ + return GET_CABAC_LC(elem_offset[PREV_INTRA_LUMA_PRED_FLAG]); - } - --int ff_hevc_mpm_idx_decode(HEVCContext *s) -+int ff_hevc_mpm_idx_decode(HEVCLocalContext * const lc) - { - int i = 0; -- while (i < 2 && get_cabac_bypass(&s->HEVClc->cc)) ++} ++ ++int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc) ++{ ++ int i = 0; + while (i < 2 && get_cabac_bypass(&lc->cc)) - i++; - return i; - } - --int ff_hevc_rem_intra_luma_pred_mode_decode(HEVCContext *s) -+int ff_hevc_rem_intra_luma_pred_mode_decode(HEVCLocalContext * const lc) - { - int i; -- int value = get_cabac_bypass(&s->HEVClc->cc); ++ i++; ++ return i; ++} ++ ++int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc) ++{ ++ int i; + int value = get_cabac_bypass(&lc->cc); - - for (i = 0; i < 4; i++) -- value = (value << 1) | get_cabac_bypass(&s->HEVClc->cc); ++ ++ for (i = 0; i < 4; i++) + value = (value << 1) | get_cabac_bypass(&lc->cc); - return value; - } - --int ff_hevc_intra_chroma_pred_mode_decode(HEVCContext *s) -+int ff_hevc_intra_chroma_pred_mode_decode(HEVCLocalContext * const lc) - { - int ret; -- if (!GET_CABAC(elem_offset[INTRA_CHROMA_PRED_MODE])) ++ return value; ++} ++ ++int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc) ++{ ++ int 
ret; + if (!GET_CABAC_LC(elem_offset[INTRA_CHROMA_PRED_MODE])) - return 4; - -- ret = get_cabac_bypass(&s->HEVClc->cc) << 1; -- ret |= get_cabac_bypass(&s->HEVClc->cc); ++ return 4; ++ + ret = get_cabac_bypass(&lc->cc) << 1; + ret |= get_cabac_bypass(&lc->cc); - return ret; - } - --int ff_hevc_merge_idx_decode(HEVCContext *s) -+int ff_hevc_merge_idx_decode(const HEVCContext * const s, HEVCLocalContext * const lc) - { -- int i = GET_CABAC(elem_offset[MERGE_IDX]); ++ return ret; ++} ++ ++int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ + int i = GET_CABAC_LC(elem_offset[MERGE_IDX]); - - if (i != 0) { -- while (i < s->sh.max_num_merge_cand-1 && get_cabac_bypass(&s->HEVClc->cc)) ++ ++ if (i != 0) { + while (i < s->sh.max_num_merge_cand-1 && get_cabac_bypass(&lc->cc)) - i++; - } - return i; - } - --int ff_hevc_merge_flag_decode(HEVCContext *s) -+int ff_hevc_merge_flag_decode(HEVCLocalContext * const lc) - { -- return GET_CABAC(elem_offset[MERGE_FLAG]); ++ i++; ++ } ++ return i; ++} ++ ++int ff_hevc_rpi_merge_flag_decode(HEVCRpiLocalContext * const lc) ++{ + return GET_CABAC_LC(elem_offset[MERGE_FLAG]); - } - --int ff_hevc_inter_pred_idc_decode(HEVCContext *s, int nPbW, int nPbH) -+int ff_hevc_inter_pred_idc_decode(HEVCLocalContext * const lc, int nPbW, int nPbH) - { - if (nPbW + nPbH == 12) -- return GET_CABAC(elem_offset[INTER_PRED_IDC] + 4); -- if (GET_CABAC(elem_offset[INTER_PRED_IDC] + s->HEVClc->ct_depth)) ++} ++ ++int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH) ++{ ++ if (nPbW + nPbH == 12) + return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4); + if (GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + lc->ct_depth)) - return PRED_BI; - -- return GET_CABAC(elem_offset[INTER_PRED_IDC] + 4); ++ return PRED_BI; ++ + return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4); - } - --int ff_hevc_ref_idx_lx_decode(HEVCContext *s, int num_ref_idx_lx) -+int ff_hevc_ref_idx_lx_decode(HEVCLocalContext * const lc, const int num_ref_idx_lx) - { - int i = 0; - int max = num_ref_idx_lx - 1; - int max_ctx = FFMIN(max, 2); - -- while (i < max_ctx && GET_CABAC(elem_offset[REF_IDX_L0] + i)) ++} ++ ++int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx) ++{ ++ int i = 0; ++ int max = num_ref_idx_lx - 1; ++ int max_ctx = FFMIN(max, 2); ++ + while (i < max_ctx && GET_CABAC_LC(elem_offset[REF_IDX_L0] + i)) - i++; - if (i == 2) { -- while (i < max && get_cabac_bypass(&s->HEVClc->cc)) ++ i++; ++ if (i == 2) { + while (i < max && get_cabac_bypass(&lc->cc)) - i++; - } - - return i; - } - --int ff_hevc_mvp_lx_flag_decode(HEVCContext *s) -+int ff_hevc_mvp_lx_flag_decode(HEVCLocalContext * const lc) - { -- return GET_CABAC(elem_offset[MVP_LX_FLAG]); ++ i++; ++ } ++ ++ return i; ++} ++ ++int ff_hevc_rpi_mvp_lx_flag_decode(HEVCRpiLocalContext * const lc) ++{ + return GET_CABAC_LC(elem_offset[MVP_LX_FLAG]); - } - --int ff_hevc_no_residual_syntax_flag_decode(HEVCContext *s) -+int ff_hevc_no_residual_syntax_flag_decode(HEVCLocalContext * const lc) - { -- return GET_CABAC(elem_offset[NO_RESIDUAL_DATA_FLAG]); ++} ++ ++int ff_hevc_rpi_no_residual_syntax_flag_decode(HEVCRpiLocalContext * const lc) ++{ + return GET_CABAC_LC(elem_offset[NO_RESIDUAL_DATA_FLAG]); - } - --static av_always_inline int abs_mvd_greater0_flag_decode(HEVCContext *s) -+static av_always_inline int abs_mvd_greater0_flag_decode(HEVCLocalContext * const lc) - { -- return GET_CABAC(elem_offset[ABS_MVD_GREATER0_FLAG]); ++} ++ ++static 
av_always_inline int abs_mvd_greater0_flag_decode(HEVCRpiLocalContext * const lc) ++{ + return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER0_FLAG]); - } - --static av_always_inline int abs_mvd_greater1_flag_decode(HEVCContext *s) -+static av_always_inline int abs_mvd_greater1_flag_decode(HEVCLocalContext * const lc) - { -- return GET_CABAC(elem_offset[ABS_MVD_GREATER1_FLAG] + 1); ++} ++ ++static av_always_inline int abs_mvd_greater1_flag_decode(HEVCRpiLocalContext * const lc) ++{ + return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER1_FLAG] + 1); - } - --static av_always_inline int mvd_decode(HEVCContext *s) ++} ++ +#if !USE_BY22 -+static av_always_inline int mvd_decode(HEVCLocalContext * const lc) - { - int ret = 2; - int k = 1; - -- while (k < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc)) { ++static av_always_inline int mvd_decode(HEVCRpiLocalContext * const lc) ++{ ++ int ret = 2; ++ int k = 1; ++ + while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) { - ret += 1U << k; - k++; - } - if (k == CABAC_MAX_BIN) { -- av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k); ++ ret += 1U << k; ++ k++; ++ } ++ if (k == CABAC_MAX_BIN) { + av_log(NULL, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k); - return 0; - } ++ return 0; ++ } + - while (k--) -- ret += get_cabac_bypass(&s->HEVClc->cc) << k; -- return get_cabac_bypass_sign(&s->HEVClc->cc, -ret); ++ while (k--) + ret += get_cabac_bypass(&lc->cc) << k; + return get_cabac_bypass_sign(&lc->cc, -ret); - } ++} +#endif - --static av_always_inline int mvd_sign_flag_decode(HEVCContext *s) -+static av_always_inline int mvd_sign_flag_decode(HEVCLocalContext * const lc) - { -- return get_cabac_bypass_sign(&s->HEVClc->cc, -1); ++ ++static av_always_inline int mvd_sign_flag_decode(HEVCRpiLocalContext * const lc) ++{ + return get_cabac_bypass_sign(&lc->cc, -1); - } - --int ff_hevc_split_transform_flag_decode(HEVCContext *s, int log2_trafo_size) -+int ff_hevc_split_transform_flag_decode(HEVCLocalContext * const lc, const int log2_trafo_size) - { -- return GET_CABAC(elem_offset[SPLIT_TRANSFORM_FLAG] + 5 - log2_trafo_size); ++} ++ ++int ff_hevc_rpi_split_transform_flag_decode(HEVCRpiLocalContext * const lc, const int log2_trafo_size) ++{ + return GET_CABAC_LC(elem_offset[SPLIT_TRANSFORM_FLAG] + 5 - log2_trafo_size); - } - --int ff_hevc_cbf_cb_cr_decode(HEVCContext *s, int trafo_depth) -+int ff_hevc_cbf_cb_cr_decode(HEVCLocalContext * const lc, const int trafo_depth) - { -- return GET_CABAC(elem_offset[CBF_CB_CR] + trafo_depth); ++} ++ ++int ff_hevc_rpi_cbf_cb_cr_decode(HEVCRpiLocalContext * const lc, const int trafo_depth) ++{ + return GET_CABAC_LC(elem_offset[CBF_CB_CR] + trafo_depth); - } - --int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth) -+int ff_hevc_cbf_luma_decode(HEVCLocalContext * const lc, const int trafo_depth) - { -- return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth); ++} ++ ++int ff_hevc_rpi_cbf_luma_decode(HEVCRpiLocalContext * const lc, const int trafo_depth) ++{ + return GET_CABAC_LC(elem_offset[CBF_LUMA] + !trafo_depth); - } - --static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx) -+static int hevc_transform_skip_flag_decode(HEVCLocalContext * const lc, int c_idx_nz) - { -- return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx); ++} ++ ++static int hevc_transform_skip_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) ++{ + return GET_CABAC_LC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz); - } - --static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx) -+static int 
explicit_rdpcm_flag_decode(HEVCLocalContext * const lc, int c_idx_nz) - { -- return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + !!c_idx); ++} ++ ++static int explicit_rdpcm_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) ++{ + return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz); - } - --static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx) -+static int explicit_rdpcm_dir_flag_decode(HEVCLocalContext * const lc, int c_idx_nz) - { -- return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + !!c_idx); ++} ++ ++static int explicit_rdpcm_dir_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) ++{ + return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz); - } - --int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) { -+int ff_hevc_log2_res_scale_abs(HEVCLocalContext * const lc, const int idx) { - int i =0; - -- while (i < 4 && GET_CABAC(elem_offset[LOG2_RES_SCALE_ABS] + 4 * idx + i)) ++} ++ ++int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx) { ++ int i =0; ++ + while (i < 4 && GET_CABAC_LC(elem_offset[LOG2_RES_SCALE_ABS] + 4 * idx + i)) - i++; - - return i; - } - --int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) { -- return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx); -+int ff_hevc_res_scale_sign_flag(HEVCLocalContext *const lc, const int idx) { ++ i++; ++ ++ return i; ++} ++ ++int ff_hevc_rpi_res_scale_sign_flag(HEVCRpiLocalContext *const lc, const int idx) { + return GET_CABAC_LC(elem_offset[RES_SCALE_SIGN_FLAG] + idx); - } - --static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx, -+static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCLocalContext * const lc, int c_idx_nz, - int log2_size, int *last_scx_prefix, int *last_scy_prefix) - { - int i = 0; - int max = (log2_size << 1) - 1; - int ctx_offset, ctx_shift; - -- if (!c_idx) { ++} ++ ++static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCRpiLocalContext * const lc, int c_idx_nz, ++ int log2_size, int *last_scx_prefix, int *last_scy_prefix) ++{ ++ int i = 0; ++ int max = (log2_size << 1) - 1; ++ int ctx_offset, ctx_shift; ++ + if (!c_idx_nz) { - ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2); - ctx_shift = (log2_size + 1) >> 2; - } else { -@@ -919,150 +1202,501 @@ static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext - ctx_shift = log2_size - 2; - } - while (i < max && -- GET_CABAC(elem_offset[LAST_SIGNIFICANT_COEFF_X_PREFIX] + (i >> ctx_shift) + ctx_offset)) ++ ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2); ++ ctx_shift = (log2_size + 1) >> 2; ++ } else { ++ ctx_offset = 15; ++ ctx_shift = log2_size - 2; ++ } ++ while (i < max && + GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_X_PREFIX] + (i >> ctx_shift) + ctx_offset)) - i++; - *last_scx_prefix = i; - - i = 0; - while (i < max && -- GET_CABAC(elem_offset[LAST_SIGNIFICANT_COEFF_Y_PREFIX] + (i >> ctx_shift) + ctx_offset)) ++ i++; ++ *last_scx_prefix = i; ++ ++ i = 0; ++ while (i < max && + GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_Y_PREFIX] + (i >> ctx_shift) + ctx_offset)) - i++; - *last_scy_prefix = i; - } - --static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s, -+static av_always_inline int last_significant_coeff_suffix_decode(HEVCLocalContext * const lc, - int last_significant_coeff_prefix) - { - int i; - int length = (last_significant_coeff_prefix >> 1) - 1; -- int value = get_cabac_bypass(&s->HEVClc->cc); ++ i++; ++ 
*last_scy_prefix = i; ++} ++ ++static av_always_inline int last_significant_coeff_suffix_decode(HEVCRpiLocalContext * const lc, ++ int last_significant_coeff_prefix) ++{ ++ int i; ++ int length = (last_significant_coeff_prefix >> 1) - 1; + int value = get_cabac_bypass(&lc->cc); - - for (i = 1; i < length; i++) -- value = (value << 1) | get_cabac_bypass(&s->HEVClc->cc); ++ ++ for (i = 1; i < length; i++) + value = (value << 1) | get_cabac_bypass(&lc->cc); - return value; - } - --static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg) -+static av_always_inline int significant_coeff_group_flag_decode(HEVCLocalContext * const lc, int c_idx_nz, int ctx_cg) - { - int inc; - -- inc = FFMIN(ctx_cg, 1) + (c_idx>0 ? 2 : 0); ++ return value; ++} ++ ++static av_always_inline int significant_coeff_group_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz, int ctx_cg) ++{ ++ int inc; ++ + inc = (ctx_cg != 0) + (c_idx_nz << 1); - -- return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc); --} --static av_always_inline int significant_coeff_flag_decode(HEVCContext *s, int x_c, int y_c, -- int offset, const uint8_t *ctx_idx_map) --{ -- int inc = ctx_idx_map[(y_c << 2) + x_c] + offset; -- return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc); ++ + return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc); - } - --static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int c_idx, int offset) -+static av_always_inline int significant_coeff_flag_decode_0(HEVCLocalContext * const lc, int offset) - { -- return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); ++} ++ ++static av_always_inline int significant_coeff_flag_decode_0(HEVCRpiLocalContext * const lc, int offset) ++{ + return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); - } - --static av_always_inline int coeff_abs_level_greater1_flag_decode(HEVCContext *s, int c_idx, int inc) ++} ++ +#if !USE_BY22 +#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r) +#endif @@ -8223,20 +9806,16 @@ index 853fd3f722..e8e6ad3b1a 100644 + +#ifndef coeff_abs_level_remaining_decode_bypass +static int coeff_abs_level_remaining_decode_bypass(CABACContext * const c, const unsigned int rice_param) - { ++{ + uint32_t y; + unsigned int prefix; + unsigned int last_coeff_abs_level_remaining; + unsigned int n; - -- if (c_idx > 0) -- inc += 16; ++ + y = get_cabac_by22_peek(c); + prefix = hevc_clz32(~y); + // y << prefix will always have top bit 0 - -- return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] + inc); --} ++ + if (prefix < 3) { + const unsigned int suffix = (y << prefix) >> (31 - rice_param); + last_coeff_abs_level_remaining = (prefix << rice_param) + suffix; @@ -8245,18 +9824,13 @@ index 853fd3f722..e8e6ad3b1a 100644 + else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2) + { + const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param)); - --static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, int c_idx, int inc) --{ -- if (c_idx > 0) -- inc += 4; ++ + last_coeff_abs_level_remaining = (2 << rice_param) + suffix; + n = prefix * 2 + rice_param - 2; + } + else { + unsigned int suffix; - -- return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc); ++ + get_cabac_by22_flush(c, prefix, y); + y = get_cabac_by22_peek(c); + @@ -8268,56 +9842,46 @@ index 853fd3f722..e8e6ad3b1a 100644 + get_cabac_by22_flush(c, n, y); + + return 
last_coeff_abs_level_remaining; - } ++} +#endif - --static av_always_inline int coeff_abs_level_remaining_decode(HEVCContext *s, int rc_rice_param) ++ +static int coeff_abs_level_remaining_decode(CABACContext * const c, int rc_rice_param) - { - int prefix = 0; - int suffix = 0; - int last_coeff_abs_level_remaining; - int i; - -- while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc)) ++{ ++ int prefix = 0; ++ int suffix = 0; ++ int last_coeff_abs_level_remaining; ++ int i; ++ + while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c)) - prefix++; - if (prefix == CABAC_MAX_BIN) { -- av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix); ++ prefix++; ++ if (prefix == CABAC_MAX_BIN) { +// av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix); - return 0; - } ++ return 0; ++ } + - if (prefix < 3) { - for (i = 0; i < rc_rice_param; i++) -- suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc); ++ if (prefix < 3) { ++ for (i = 0; i < rc_rice_param; i++) + suffix = (suffix << 1) | get_cabac_bypass(c); - last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix; - } else { - int prefix_minus3 = prefix - 3; - for (i = 0; i < prefix_minus3 + rc_rice_param; i++) -- suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc); ++ last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix; ++ } else { ++ int prefix_minus3 = prefix - 3; ++ for (i = 0; i < prefix_minus3 + rc_rice_param; i++) + suffix = (suffix << 1) | get_cabac_bypass(c); - last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1) - << rc_rice_param) + suffix; - } ++ last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1) ++ << rc_rice_param) + suffix; ++ } ++ ++ return last_coeff_abs_level_remaining; ++} + - return last_coeff_abs_level_remaining; - } - --static av_always_inline int coeff_sign_flag_decode(HEVCContext *s, uint8_t nb) +#if !USE_BY22 +#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode +static inline uint32_t coeff_sign_flag_decode(CABACContext * const c, const unsigned int nb) - { -- int i; -- int ret = 0; ++{ + unsigned int i; + uint32_t ret = 0; - - for (i = 0; i < nb; i++) -- ret = (ret << 1) | get_cabac_bypass(&s->HEVClc->cc); -- return ret; ++ ++ for (i = 0; i < nb; i++) + ret = (ret << 1) | get_cabac_bypass(c); + + return ret << (32 - nb); @@ -8331,7 +9895,7 @@ index 853fd3f722..e8e6ad3b1a 100644 + y = get_cabac_by22_peek(c); + get_cabac_by22_flush(c, nb, y); + return y & ~(0xffffffffU >> nb); - } ++} +#endif + + @@ -8350,24 +9914,14 @@ index 853fd3f722..e8e6ad3b1a 100644 +} +#endif + - --void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, -- int log2_trafo_size, enum ScanType scan_idx, -- int c_idx) ++ +// N.B. levels returned are the values assuming coeff_abs_level_remaining +// is uncoded, so 1 must be added if it is coded. sum_abs also reflects +// this version of events. 
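The slow-path coeff_abs_level_remaining_decode() above is the plain HEVC binarization: a truncated-Rice (unary) prefix, then, for prefixes of 3 or more, an exp-Golomb style suffix; the by22 variant decodes the same thing from a 22-bit peek of the bypass stream instead of bit-at-a-time calls. A minimal standalone C sketch of the same decode; the BitSrc type, get_bypass() and decode_remaining() are illustrative stand-ins for the CABAC bypass engine and are not part of this patch:

    #include <stdint.h>

    /* Illustrative bypass-bit source (not part of the patch); bits MSB first. */
    typedef struct { uint32_t bits; int n; } BitSrc;
    static int get_bypass(BitSrc *b) { return b->n > 0 ? (int)((b->bits >> --b->n) & 1) : 0; }

    /* Same binarization as coeff_abs_level_remaining_decode() above: unary
     * truncated-Rice prefix, then a rice_param-bit suffix for prefix < 3,
     * else an exp-Golomb suffix of prefix - 3 + rice_param bits. */
    static int decode_remaining(BitSrc *b, int rice_param)
    {
        int prefix = 0, suffix = 0, i;

        while (prefix < 32 && get_bypass(b)) /* 32 standing in for CABAC_MAX_BIN */
            prefix++;
        if (prefix < 3) {
            for (i = 0; i < rice_param; i++)
                suffix = (suffix << 1) | get_bypass(b);
            return (prefix << rice_param) + suffix;
        }
        for (i = 0; i < prefix - 3 + rice_param; i++)
            suffix = (suffix << 1) | get_bypass(b);
        return (((1 << (prefix - 3)) + 2) << rice_param) + suffix;
    }

    int main(void)
    {
        BitSrc b = { 0xB0, 8 }; /* 1011 0000: prefix 1, suffix 1 -> (1 << 1) + 1 = 3 */
        return decode_remaining(&b, 1) == 3 ? 0 : 1;
    }

get_greaterx_bits(), defined next, then gathers the greater-than-1/greater-than-2 flags that decide whether this suffix is coded at all, which is why its returned levels assume the remaining value is uncoded.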
-+static inline uint32_t get_greaterx_bits(HEVCLocalContext * const lc, const unsigned int n_end, int * const levels, ++static inline uint32_t get_greaterx_bits(HEVCRpiLocalContext * const lc, const unsigned int n_end, int * const levels, + int * const pprev_subset_coded, int * const psum, + const unsigned int idx0_gt1, const unsigned int idx_gt2) - { --#define GET_COORD(offset, n) \ -- do { \ -- x_c = (x_cg << 2) + scan_x_off[n]; \ -- y_c = (y_cg << 2) + scan_y_off[n]; \ -- } while (0) -- HEVCLocalContext *lc = s->HEVClc; -- int transform_skip_flag = 0; ++{ + CABACContext * const c = &lc->cc; + uint8_t * const state0 = lc->cabac_state + idx0_gt1; + uint8_t * const state_gt2 = lc->cabac_state + idx_gt2; @@ -8492,7 +10046,7 @@ index 853fd3f722..e8e6ad3b1a 100644 + x7, x14, x11, x15} + + -+static inline int next_subset(HEVCLocalContext * const lc, int i, const int c_idx_nz, ++static inline int next_subset(HEVCRpiLocalContext * const lc, int i, const int c_idx_nz, + uint8_t * const significant_coeff_group_flag, + const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg, + int * const pPrev_sig) @@ -8521,89 +10075,74 @@ index 853fd3f722..e8e6ad3b1a 100644 + return i; +} + -+#ifdef RPI -+static void rpi_add_residual(const HEVCContext *const s, HEVCRpiJob * const jb, ++static void rpi_add_residual(const HEVCRpiContext *const s, HEVCRpiJob * const jb, + const unsigned int log2_trafo_size, const unsigned int c_idx, + const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) +{ + const AVFrame * const frame = s->frame; -+ unsigned int stride = frame->linesize[c_idx]; -+ unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; -+ unsigned int y = y0 >> s->ps.sps->vshift[c_idx]; -+ const int is_sliced = av_rpi_is_sand_frame(frame); -+ uint8_t * dst = !is_sliced ? ++ const unsigned int stride = frame_stride1(s->frame, c_idx); ++ const unsigned int x = x0 >> ctx_hshift(s, c_idx); ++ const unsigned int y = y0 >> ctx_vshift(s, c_idx); ++ const int is_sliced = 1; // av_rpi_is_sand_frame(frame); ++ uint8_t * const dst = !is_sliced ? + s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : + c_idx == 0 ? + av_rpi_sand_frame_pos_y(frame, x, y) : + av_rpi_sand_frame_pos_c(frame, x, y); + -+ if (s->enable_rpi) { -+ const unsigned int i = jb->intra.n; -+ HEVCPredCmd *const pc = jb->intra.cmds + i - 1; ++ const unsigned int i = jb->intra.n; ++ HEVCPredCmd *const pc = jb->intra.cmds + i - 1; + -+ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && -+ pc->ta.dst == dst) -+ { -+ av_assert1(pc->size == log2_trafo_size && -+ pc->c_idx == 1 && -+ pc->ta.stride == stride); -+ -+ pc->type = RPI_PRED_ADD_RESIDUAL_C; -+ } -+ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && -+ pc->dc.dst == dst) -+ { -+ const int16_t dc = (int16_t)pc->dc.dc; // Discard top bits -+ av_assert1(pc->size == log2_trafo_size && -+ pc->c_idx == 1 && -+ pc->dc.stride == stride); -+ -+ // Rewrite as add residual - must rewrite all fields as different union member -+ pc->type = RPI_PRED_ADD_RESIDUAL_V; -+ pc->c_idx = c_idx; -+ pc->ta.buf = coeffs; -+ pc->ta.dst = dst; -+ pc->ta.stride = stride; -+ pc->ta.dc = dc; -+ } -+ else -+ { -+ HEVCPredCmd * const cmd = pc + 1; -+ jb->intra.n = i + 1; ++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && ++ pc->ta.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->ta.stride == stride); + -+ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? 
c_idx : 0); -+ cmd->size = log2_trafo_size; -+ cmd->c_idx = c_idx; -+ cmd->ta.buf = coeffs; -+ cmd->ta.dst = dst; -+ cmd->ta.stride = stride; -+ cmd->ta.dc = 0; -+ } -+ } -+ else if (!is_sliced || c_idx == 0) { -+ s->hevcdsp.add_residual[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ pc->type = RPI_PRED_ADD_RESIDUAL_C; + } -+#if RPI_HEVC_SAND -+ // * These should probably never happen -+ else if (c_idx == 1) { -+ s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride, 0); ++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && ++ pc->dc.dst == dst) ++ { ++ const int16_t dc = (int16_t)pc->dc.dc; // Discard top bits ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->dc.stride == stride); ++ ++ // Rewrite as add residual - must rewrite all fields as different union member ++ pc->type = RPI_PRED_ADD_RESIDUAL_V; ++ pc->c_idx = c_idx; ++ pc->ta.buf = coeffs; ++ pc->ta.dst = dst; ++ pc->ta.stride = stride; ++ pc->ta.dc = dc; + } -+ else { -+ s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride, 0); ++ else ++ { ++ HEVCPredCmd * const cmd = pc + 1; ++ jb->intra.n = i + 1; ++ ++ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0); ++ cmd->size = log2_trafo_size; ++ cmd->c_idx = c_idx; ++ cmd->ta.buf = coeffs; ++ cmd->ta.dst = dst; ++ cmd->ta.stride = stride; ++ cmd->ta.dc = 0; + } -+#endif +} + + -+static void rpi_add_dc(const HEVCContext * const s, HEVCRpiJob * const jb, ++static void rpi_add_dc(const HEVCRpiContext * const s, HEVCRpiJob * const jb, + const unsigned int log2_trafo_size, const unsigned int c_idx, + const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) +{ + const AVFrame * const frame = s->frame; -+ const unsigned int stride = frame->linesize[c_idx]; -+ const unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; -+ const unsigned int y = y0 >> s->ps.sps->vshift[c_idx]; -+ const int is_sliced = av_rpi_is_sand_frame(frame); ++ const unsigned int stride = frame_stride1(s->frame, c_idx); ++ const unsigned int x = x0 >> ctx_hshift(s, c_idx); ++ const unsigned int y = y0 >> ctx_vshift(s, c_idx); ++ const int is_sliced = 1; + uint8_t * const dst = !is_sliced ? + s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : + c_idx == 0 ? 
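The next hunk continues rpi_add_dc(), whose point is that a DC-only block needs no inverse transform at all: every sample moves by the same constant, computed from coeffs[0] as (coeffs[0] + (1 | (1 << shift))) >> (shift + 1) with shift = FFMAX(14 - bit_depth, 0). A small self-contained check of that arithmetic; a sketch assuming nothing beyond the expression shown in the patch, and dc_sample_delta is a hypothetical name:

    #include <assert.h>
    #include <stdint.h>

    /* Per-sample delta of a DC-only block. The single term (1 | (1 << shift))
     * folds together the +1 from scaling and the half-unit rounding bias of
     * the generic DC path. */
    static int dc_sample_delta(int16_t dc, int bit_depth)
    {
        const int shift = 14 - bit_depth > 0 ? 14 - bit_depth : 0;
        return (dc + (1 | (1 << shift))) >> (shift + 1);
    }

    int main(void)
    {
        assert(dc_sample_delta(512, 8) == 4); /* 8-bit: (512 + 65) >> 7 = 4 */
        return 0;
    }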
@@ -8613,119 +10152,99 @@ index 853fd3f722..e8e6ad3b1a 100644 + const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0); + const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1); + -+ if (s->enable_rpi) { -+ const unsigned int i = jb->intra.n; -+ HEVCPredCmd *const pc = jb->intra.cmds + i - 1; ++ const unsigned int i = jb->intra.n; ++ HEVCPredCmd *const pc = jb->intra.cmds + i - 1; + -+ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && -+ pc->ta.dst == dst) -+ { -+ av_assert1(pc->size == log2_trafo_size && -+ pc->c_idx == 1 && -+ pc->ta.stride == stride); ++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && ++ pc->ta.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->ta.stride == stride); + -+ pc->ta.dc = (int16_t)coeff; -+ } -+ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && -+ pc->dc.dst == dst) -+ { -+ av_assert1(pc->size == log2_trafo_size && -+ pc->c_idx == 1 && -+ pc->dc.stride == stride && -+ (pc->dc.dc & ~0xffff) == 0); ++ pc->ta.dc = (int16_t)coeff; ++ } ++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && ++ pc->dc.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->dc.stride == stride && ++ (pc->dc.dc & ~0xffff) == 0); + -+ pc->dc.dc |= (coeff << 16); -+ } -+ else -+ { -+ HEVCPredCmd * const cmd = pc + 1; -+ jb->intra.n = i + 1; ++ pc->dc.dc |= (coeff << 16); ++ } ++ else ++ { ++ HEVCPredCmd * const cmd = pc + 1; ++ jb->intra.n = i + 1; + -+ cmd->type = RPI_PRED_ADD_DC + c_idx; -+ cmd->size = log2_trafo_size; -+ cmd->c_idx = c_idx; -+ cmd->dc.dst = dst; -+ cmd->dc.stride = stride; -+ cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff; -+ } ++ cmd->type = RPI_PRED_ADD_DC + c_idx; ++ cmd->size = log2_trafo_size; ++ cmd->c_idx = c_idx; ++ cmd->dc.dst = dst; ++ cmd->dc.stride = stride; ++ cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff; + } +} + + -+#endif -+ -+void ff_hevc_hls_residual_coding(const HEVCContext * const s, HEVCLocalContext * const lc, ++void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, + const int x0, const int y0, + const int log2_trafo_size, const enum ScanType scan_idx, + const int c_idx) +{ + int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag; - - int last_significant_coeff_x, last_significant_coeff_y; -- int last_scan_pos; -- int n_end; - int num_coeff = 0; -- int greater1_ctx = 1; ++ ++ int last_significant_coeff_x, last_significant_coeff_y; ++ int num_coeff = 0; + int prev_subset_coded = 0; - - int num_last_subset; - int x_cg_last_sig, y_cg_last_sig; - -- const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off; ++ ++ int num_last_subset; ++ int x_cg_last_sig, y_cg_last_sig; ++ + const uint8_t *scan_x_cg, *scan_y_cg; + const xy_off_t * const scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; - -+#ifndef RPI - ptrdiff_t stride = s->frame->linesize[c_idx]; - int hshift = s->ps.sps->hshift[c_idx]; - int vshift = s->ps.sps->vshift[c_idx]; -- uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + -+ uint8_t * const dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + - ((x0 >> hshift) << s->ps.sps->pixel_shift)]; -- int16_t *coeffs = (int16_t*)(c_idx ? 
lc->edge_emu_buffer2 : lc->edge_emu_buffer); -- uint8_t significant_coeff_group_flag[8][8] = {{0}}; -+#endif -+#ifdef RPI ++ + int use_vpu; + int use_dc = 0; -+#endif + int16_t *coeffs; + uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero - int explicit_rdpcm_flag = 0; - int explicit_rdpcm_dir_flag; - -- int trafo_size = 1 << log2_trafo_size; - int i; -- int qp,shift,add,scale,scale_m; ++ int explicit_rdpcm_flag = 0; ++ int explicit_rdpcm_dir_flag; ++ ++ int i; + int qp,shift,scale; - static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 }; - const uint8_t *scale_matrix = NULL; - uint8_t dc_scale; - int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode : - lc->tu.intra_pred_mode_c; - -- memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t)); -+ int prev_sig = 0; ++ static const uint8_t const level_scale[] = { 40, 45, 51, 57, 64, 72 }; ++ const uint8_t *scale_matrix = NULL; ++ uint8_t dc_scale; + const int c_idx_nz = (c_idx != 0); -+ ++ const int pred_mode_intra = c_idx_nz ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; ++ int prev_sig = 0; + int may_hide_sign; - - // Derive QP for dequant - if (!lc->cu.cu_transquant_bypass_flag) { -- static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 }; ++ ++ // Derive QP for dequant ++ if (!lc->cu.cu_transquant_bypass_flag) { + static const uint8_t qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 }; - static const uint8_t rem6[51 + 4 * 6 + 1] = { - 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, - 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, -@@ -1078,9 +1712,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - }; - int qp_y = lc->qp_y; - ++ static const uint8_t rem6[51 + 4 * 6 + 1] = { ++ 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, ++ 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, ++ 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, ++ 4, 5, 0, 1, 2, 3, 4, 5, 0, 1 ++ }; ++ ++ static const uint8_t div6[51 + 4 * 6 + 1] = { ++ 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, ++ 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, ++ 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, ++ 10, 10, 11, 11, 11, 11, 11, 11, 12, 12 ++ }; ++ int qp_y = lc->qp_y; ++ + may_hide_sign = s->ps.pps->sign_data_hiding_flag; + - if (s->ps.pps->transform_skip_enabled_flag && - log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) { -- transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx); ++ if (s->ps.pps->transform_skip_enabled_flag && ++ log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) { + int transform_skip_flag = hevc_transform_skip_flag_decode(lc, c_idx_nz); + if (transform_skip_flag) { + trans_skip_or_bypass = 1; @@ -8735,18 +10254,38 @@ index 853fd3f722..e8e6ad3b1a 100644 + may_hide_sign = 0; + } + } - } - - if (c_idx == 0) { -@@ -1113,50 +1757,87 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - qp += s->ps.sps->qp_bd_offset; - } - -- shift = s->ps.sps->bit_depth + log2_trafo_size - 5; -- add = 1 << (shift-1); -- scale = level_scale[rem6[qp]] << (div6[qp]); -- scale_m = 16; // default when no custom scaling lists. 
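For reference while reading the QP handling above and the scale/shift computation that follows: rem6[] and div6[] are just table forms of qp % 6 and qp / 6, and since six steps through level_scale[] double the base (the entry after 72 would be 80 = 2 x 40), the effective dequant factor level_scale[qp % 6] << (qp / 6) doubles exactly every six QP steps. A quick standalone check of that identity, using only the level_scale values quoted in the patch (illustrative, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };

    int main(void)
    {
        /* Tables cover qp in [0, 75]; six QP steps must exactly double the scale. */
        for (int qp = 0; qp + 6 <= 75; qp++)
            assert((level_scale[(qp + 6) % 6] << ((qp + 6) / 6)) ==
                   2 * (level_scale[qp % 6] << (qp / 6)));
        return 0;
    }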
-- dc_scale = 16; ++ } ++ ++ if (c_idx == 0) { ++ qp = qp_y + s->ps.sps->qp_bd_offset; ++ } else { ++ int qp_i, offset; ++ ++ if (c_idx == 1) ++ offset = s->ps.pps->cb_qp_offset + s->sh.slice_cb_qp_offset + ++ lc->tu.cu_qp_offset_cb; ++ else ++ offset = s->ps.pps->cr_qp_offset + s->sh.slice_cr_qp_offset + ++ lc->tu.cu_qp_offset_cr; ++ ++ qp_i = av_clip(qp_y + offset, - s->ps.sps->qp_bd_offset, 57); ++ if (ctx_cfmt(s) == 1) { ++ if (qp_i < 30) ++ qp = qp_i; ++ else if (qp_i > 43) ++ qp = qp_i - 6; ++ else ++ qp = qp_c[qp_i - 30]; ++ } else { ++ if (qp_i > 51) ++ qp = 51; ++ else ++ qp = qp_i; ++ } ++ ++ qp += s->ps.sps->qp_bd_offset; ++ } ++ + // Shift is set to one less than will actually occur as the scale + // and saturate step adds 1 and then shifts right again + shift = s->ps.sps->bit_depth + log2_trafo_size - 6; @@ -8757,21 +10296,19 @@ index 853fd3f722..e8e6ad3b1a 100644 + } else { + shift -= div6[qp]; + } - -- if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) { ++ + if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) { - const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ? -- &s->ps.pps->scaling_list : &s->ps.sps->scaling_list; ++ const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ? + &s->ps.pps->scaling_list : &s->ps.sps->scaling_list; - int matrix_id = lc->cu.pred_mode != MODE_INTRA; - - matrix_id = 3 * matrix_id + c_idx; - - scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id]; ++ int matrix_id = lc->cu.pred_mode != MODE_INTRA; ++ ++ matrix_id = 3 * matrix_id + c_idx; ++ ++ scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id]; + dc_scale = scale_matrix[0]; - if (log2_trafo_size >= 4) - dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id]; - } ++ if (log2_trafo_size >= 4) ++ dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id]; ++ } + else + { + static const uint8_t sixteen_scale[64] = { @@ -8787,7 +10324,7 @@ index 853fd3f722..e8e6ad3b1a 100644 + scale_matrix = sixteen_scale; + dc_scale = 16; + } - } else { ++ } else { + static const uint8_t unit_scale[64] = { + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, @@ -8799,198 +10336,138 @@ index 853fd3f722..e8e6ad3b1a 100644 + 1, 1, 1, 1, 1, 1, 1, 1, + }; + scale_matrix = unit_scale; - shift = 0; -- add = 0; -- scale = 0; -- dc_scale = 0; ++ shift = 0; + scale = 2; // We will shift right to kill this + dc_scale = 1; + + may_hide_sign = 0; - } - ++ } ++ + + + - if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag && -- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) { -- explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx); ++ if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag && + trans_skip_or_bypass) { + explicit_rdpcm_flag = explicit_rdpcm_flag_decode(lc, c_idx_nz); - if (explicit_rdpcm_flag) { -- explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx); ++ if (explicit_rdpcm_flag) { + may_hide_sign = 0; + explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(lc, c_idx_nz); - } - } - -- last_significant_coeff_xy_prefix_decode(s, c_idx, log2_trafo_size, ++ } ++ } ++ + last_significant_coeff_xy_prefix_decode(lc, c_idx_nz, log2_trafo_size, - &last_significant_coeff_x, &last_significant_coeff_y); - - if (last_significant_coeff_x > 3) { -- int suffix = last_significant_coeff_suffix_decode(s, last_significant_coeff_x); ++ &last_significant_coeff_x, &last_significant_coeff_y); ++ ++ if (last_significant_coeff_x > 3) { + int suffix = 
last_significant_coeff_suffix_decode(lc, last_significant_coeff_x); - last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) * - (2 + (last_significant_coeff_x & 1)) + - suffix; - } - - if (last_significant_coeff_y > 3) { -- int suffix = last_significant_coeff_suffix_decode(s, last_significant_coeff_y); ++ last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) * ++ (2 + (last_significant_coeff_x & 1)) + ++ suffix; ++ } ++ ++ if (last_significant_coeff_y > 3) { + int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_y); - last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) * - (2 + (last_significant_coeff_y & 1)) + - suffix; -@@ -1173,119 +1854,145 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - int last_x_c = last_significant_coeff_x & 3; - int last_y_c = last_significant_coeff_y & 3; - -- scan_x_off = ff_hevc_diag_scan4x4_x; -- scan_y_off = ff_hevc_diag_scan4x4_y; - num_coeff = diag_scan4x4_inv[last_y_c][last_x_c]; -- if (trafo_size == 4) { ++ last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) * ++ (2 + (last_significant_coeff_y & 1)) + ++ suffix; ++ } ++ ++ if (scan_idx == SCAN_VERT) ++ FFSWAP(int, last_significant_coeff_x, last_significant_coeff_y); ++ ++ x_cg_last_sig = last_significant_coeff_x >> 2; ++ y_cg_last_sig = last_significant_coeff_y >> 2; ++ ++ switch (scan_idx) { ++ case SCAN_DIAG: { ++ int last_x_c = last_significant_coeff_x & 3; ++ int last_y_c = last_significant_coeff_y & 3; ++ ++ num_coeff = diag_scan4x4_inv[last_y_c][last_x_c]; + + switch (log2_trafo_size) { + case 2: - scan_x_cg = scan_1x1; - scan_y_cg = scan_1x1; -- } else if (trafo_size == 8) { ++ scan_x_cg = scan_1x1; ++ scan_y_cg = scan_1x1; + break; + case 3: - num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4; - scan_x_cg = diag_scan2x2_x; - scan_y_cg = diag_scan2x2_y; -- } else if (trafo_size == 16) { ++ num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4; ++ scan_x_cg = diag_scan2x2_x; ++ scan_y_cg = diag_scan2x2_y; + break; + case 4: - num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4; - scan_x_cg = ff_hevc_diag_scan4x4_x; - scan_y_cg = ff_hevc_diag_scan4x4_y; -- } else { // trafo_size == 32 ++ num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4; ++ scan_x_cg = ff_hevc_rpi_diag_scan4x4_x; ++ scan_y_cg = ff_hevc_rpi_diag_scan4x4_y; + break; + case 5: + default: - num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4; - scan_x_cg = ff_hevc_diag_scan8x8_x; - scan_y_cg = ff_hevc_diag_scan8x8_y; ++ num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4; ++ scan_x_cg = ff_hevc_rpi_diag_scan8x8_x; ++ scan_y_cg = ff_hevc_rpi_diag_scan8x8_y; + break; - } - break; - } - case SCAN_HORIZ: - scan_x_cg = horiz_scan2x2_x; - scan_y_cg = horiz_scan2x2_y; -- scan_x_off = horiz_scan4x4_x; -- scan_y_off = horiz_scan4x4_y; - num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x]; - break; - default: //SCAN_VERT - scan_x_cg = horiz_scan2x2_y; - scan_y_cg = horiz_scan2x2_x; -- scan_x_off = horiz_scan4x4_y; -- scan_y_off = horiz_scan4x4_x; - num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y]; - break; - } - num_coeff++; - num_last_subset = (num_coeff - 1) >> 4; - -- for (i = num_last_subset; i >= 0; i--) { -- int n, m; -- int x_cg, y_cg, x_c, y_c, pos; ++ } ++ break; ++ } ++ case SCAN_HORIZ: ++ scan_x_cg = horiz_scan2x2_x; ++ scan_y_cg = horiz_scan2x2_y; ++ num_coeff = 
horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x]; ++ break; ++ default: //SCAN_VERT ++ scan_x_cg = horiz_scan2x2_y; ++ scan_y_cg = horiz_scan2x2_x; ++ num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y]; ++ break; ++ } ++ num_coeff++; ++ num_last_subset = (num_coeff - 1) >> 4; ++ + significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant + + { + const unsigned int ccount = 1 << (log2_trafo_size * 2); + const int special = trans_skip_or_bypass || lc->tu.cross_pf; // These need special processing + use_vpu = 0; + use_dc = (num_coeff == 1) && !special && + !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2); + + if (use_dc) { + // Just need a little empty space + coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); + // No need to clear + } + else + { + use_vpu = !special && log2_trafo_size >= 4; + coeffs = rpi_alloc_coeff_buf(lc->jb0, !use_vpu ? 
0 : log2_trafo_size - 2, ccount); ++#if HAVE_NEON ++ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); ++#else + memset(coeffs, 0, ccount * sizeof(int16_t)); ++#endif + } + } + + i = num_last_subset; + do { - int implicit_non_zero_coeff = 0; -- int64_t trans_coeff_level; -- int prev_sig = 0; -- int offset = i << 4; -- int rice_init = 0; ++ int implicit_non_zero_coeff = 0; + int n_end; - - uint8_t significant_coeff_flag_idx[16]; -- uint8_t nb_significant_coeff_flag = 0; -- -- x_cg = scan_x_cg[i]; -- y_cg = scan_y_cg[i]; -- -- if ((i < num_last_subset) && (i > 0)) { -- int ctx_cg = 0; -- if (x_cg < (1 << (log2_trafo_size - 2)) - 1) -- ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg]; -- if (y_cg < (1 << (log2_trafo_size - 2)) - 1) -- ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1]; -- -- significant_coeff_group_flag[x_cg][y_cg] = -- significant_coeff_group_flag_decode(s, c_idx, ctx_cg); -- implicit_non_zero_coeff = 1; -- } else { -- significant_coeff_group_flag[x_cg][y_cg] = -- ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) || -- (x_cg == 0 && y_cg == 0)); -- } -- -- last_scan_pos = num_coeff - offset - 1; ++ ++ uint8_t significant_coeff_flag_idx[16]; + unsigned int nb_significant_coeff_flag = 0; - - if (i == num_last_subset) { ++ ++ if (i == num_last_subset) { + // First time through + int last_scan_pos = num_coeff - (i << 4) - 1; - n_end = last_scan_pos - 1; - significant_coeff_flag_idx[0] = last_scan_pos; - nb_significant_coeff_flag = 1; - } else { - n_end = 15; ++ n_end = last_scan_pos - 1; ++ significant_coeff_flag_idx[0] = last_scan_pos; ++ nb_significant_coeff_flag = 1; ++ } else { ++ n_end = 15; + implicit_non_zero_coeff = (i != 0); - } - -- if (x_cg < ((1 << log2_trafo_size) - 1) >> 2) -- prev_sig = !!significant_coeff_group_flag[x_cg + 1][y_cg]; -- if (y_cg < ((1 << log2_trafo_size) - 1) >> 2) -- prev_sig += (!!significant_coeff_group_flag[x_cg][y_cg + 1] << 1); -- -- if (significant_coeff_group_flag[x_cg][y_cg] && n_end >= 0) { -- static const uint8_t ctx_idx_map[] = { -- 0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8, // log2_trafo_size == 2 -- 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 0 -- 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 1 -- 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, // prev_sig == 2 -- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 // default ++ } ++ + if (n_end >= 0) { + static const uint8_t ctx_idx_maps_ts2[3][16] = { + D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 @@ -9017,51 +10494,38 @@ index 853fd3f722..e8e6ad3b1a 100644 + V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 + V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default + } - }; - const uint8_t *ctx_idx_map_p; - int scf_offset = 0; -- if (s->ps.sps->transform_skip_context_enabled_flag && -- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) { -- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[4 * 16]; -- if (c_idx == 0) { -- scf_offset = 40; -- } else { -- scf_offset = 14 + 27; -- } ++ }; ++ const uint8_t *ctx_idx_map_p; ++ int scf_offset = 0; + + if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { + ctx_idx_map_p = ctx_idx_maps[0][3]; + scf_offset = 40 + c_idx_nz; - } else { -- if (c_idx != 0) ++ } else { + if (c_idx_nz != 0) - scf_offset = 27; ++ scf_offset = 27; + - if (log2_trafo_size == 2) { -- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[0]; ++ if (log2_trafo_size == 2) { + ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx]; - } else { -- 
ctx_idx_map_p = (uint8_t*) &ctx_idx_map[(prev_sig + 1) << 4]; -- if (c_idx == 0) { -- if ((x_cg > 0 || y_cg > 0)) ++ } else { + ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig]; + if (!c_idx_nz) { + if (i != 0) - scf_offset += 3; ++ scf_offset += 3; + - if (log2_trafo_size == 3) { - scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15; - } else { -@@ -1299,34 +2006,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - } - } - } -- for (n = n_end; n > 0; n--) { -- x_c = scan_x_off[n]; -- y_c = scan_y_off[n]; -- if (significant_coeff_flag_decode(s, x_c, y_c, scf_offset, ctx_idx_map_p)) { -- significant_coeff_flag_idx[nb_significant_coeff_flag] = n; -- nb_significant_coeff_flag++; ++ if (log2_trafo_size == 3) { ++ scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15; ++ } else { ++ scf_offset += 21; ++ } ++ } else { ++ if (log2_trafo_size == 3) ++ scf_offset += 9; ++ else ++ scf_offset += 12; ++ } ++ } ++ } + + if (n_end > 0) { + int cnt = get_sig_coeff_flag_idxs(&lc->cc, @@ -9071,42 +10535,30 @@ index 853fd3f722..e8e6ad3b1a 100644 + + nb_significant_coeff_flag += cnt; + if (cnt != 0) { - implicit_non_zero_coeff = 0; - } - } ++ implicit_non_zero_coeff = 0; ++ } ++ } + - if (implicit_non_zero_coeff == 0) { -- if (s->ps.sps->transform_skip_context_enabled_flag && -- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) { -- if (c_idx == 0) { -- scf_offset = 42; -- } else { -- scf_offset = 16 + 27; -- } ++ if (implicit_non_zero_coeff == 0) { + if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { + scf_offset = 42 + c_idx_nz; - } else { - if (i == 0) { -- if (c_idx == 0) -- scf_offset = 0; -- else -- scf_offset = 27; ++ } else { ++ if (i == 0) { + scf_offset = c_idx_nz ? 27 : 0; - } else { - scf_offset = 2 + scf_offset; - } - } -- if (significant_coeff_flag_decode_0(s, c_idx, scf_offset) == 1) { ++ } else { ++ scf_offset = 2 + scf_offset; ++ } ++ } + if (significant_coeff_flag_decode_0(lc, scf_offset) == 1) { - significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; - nb_significant_coeff_flag++; - } -@@ -1336,141 +2039,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - } - } - -- n_end = nb_significant_coeff_flag; -- ++ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; ++ nb_significant_coeff_flag++; ++ } ++ } else { ++ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; ++ nb_significant_coeff_flag++; ++ } ++ } ++ + if (nb_significant_coeff_flag != 0) { + const unsigned int gt1_idx_delta = (c_idx_nz << 2) | + ((i != 0 && !c_idx_nz) ? 
2 : 0) | @@ -9153,20 +10605,10 @@ index 853fd3f722..e8e6ad3b1a 100644 + prev_subset_coded = 1; + coded_val = get_cabac(&lc->cc, lc->cabac_state + idx_gt2); + } - -- if (n_end) { -- int first_nz_pos_in_cg; -- int last_nz_pos_in_cg; -- int c_rice_param = 0; -- int first_greater1_coeff_idx = -1; -- uint8_t coeff_abs_level_greater1_flag[8]; -- uint16_t coeff_sign_flag; -- int sum_abs = 0; -- int sign_hidden; -- int sb_type; ++ + // Probably not worth the overhead of starting by22 for just one value + coeff_sign_flag = get_cabac_bypass(&lc->cc); - ++ + if (coded_val) + { + if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) { @@ -9176,58 +10618,25 @@ index 853fd3f722..e8e6ad3b1a 100644 + lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1); + const unsigned int c_rice_param = *stat_coeff >> 2; + const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(&lc->cc, c_rice_param); - -- // initialize first elem of coeff_bas_level_greater1_flag -- int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0; ++ + trans_coeff_level = 3 + last_coeff_abs_level_remaining; + update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); + } + } - -- if (s->ps.sps->persistent_rice_adaptation_enabled_flag) { -- if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag) -- sb_type = 2 * (c_idx == 0 ? 1 : 0); -- else -- sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1; -- c_rice_param = lc->stat_coeff[sb_type] / 4; -- } ++ + { + const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; + const int k = (int32_t)(coeff_sign_flag << 31) >> 31; + const unsigned int scale_m = blk_scale[xy_off->scale]; - -- if (!(i == num_last_subset) && greater1_ctx == 0) -- ctx_set++; -- greater1_ctx = 1; -- last_nz_pos_in_cg = significant_coeff_flag_idx[0]; -- -- for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) { -- int inc = (ctx_set << 2) + greater1_ctx; -- coeff_abs_level_greater1_flag[m] = -- coeff_abs_level_greater1_flag_decode(s, c_idx, inc); -- if (coeff_abs_level_greater1_flag[m]) { -- greater1_ctx = 0; -- if (first_greater1_coeff_idx == -1) -- first_greater1_coeff_idx = m; -- } else if (greater1_ctx > 0 && greater1_ctx < 3) { -- greater1_ctx++; ++ + blk_coeffs[xy_off->coeff] = trans_scale_sat( + (trans_coeff_level ^ k) - k, // Apply sign + scale, + i == 0 && xy_off->coeff == 0 ? 
dc_scale : scale_m, + shift); - } - } -- first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1]; -- -- if (lc->cu.cu_transquant_bypass_flag || -- (lc->cu.pred_mode == MODE_INTRA && -- s->ps.sps->implicit_rdpcm_enabled_flag && transform_skip_flag && -- (pred_mode_intra == 10 || pred_mode_intra == 26 )) || -- explicit_rdpcm_flag) -- sign_hidden = 0; - else -- sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4); ++ } ++ } ++ else +#endif + { + int sign_hidden = may_hide_sign; @@ -9265,41 +10674,14 @@ index 853fd3f722..e8e6ad3b1a 100644 + level += z; + coded_vals <<= z; + } - -- if (first_greater1_coeff_idx != -1) { -- coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set); -- } -- if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden ) { -- coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag); -- } else { -- coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1)); -- } ++ + { + const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(&lc->cc, c_rice_param); + const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1; + + sum_abs += last_coeff_abs_level_remaining + 1; + *level = trans_coeff_level; - -- for (m = 0; m < n_end; m++) { -- n = significant_coeff_flag_idx[m]; -- GET_COORD(offset, n); -- if (m < 8) { -- trans_coeff_level = 1 + coeff_abs_level_greater1_flag[m]; -- if (trans_coeff_level == ((m == first_greater1_coeff_idx) ? 3 : 2)) { -- int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); -- -- trans_coeff_level += last_coeff_abs_level_remaining; -- if (trans_coeff_level > (3 << c_rice_param)) -- c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4); -- if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) { -- int c_rice_p_init = lc->stat_coeff[sb_type] / 4; -- if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init)) -- lc->stat_coeff[sb_type]++; -- else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init)) -- if (lc->stat_coeff[sb_type] > 0) -- lc->stat_coeff[sb_type]--; -- rice_init = 1; ++ + if (stat_coeff != NULL) + update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); + stat_coeff = NULL; @@ -9307,50 +10689,14 @@ index 853fd3f722..e8e6ad3b1a 100644 + if (trans_coeff_level > (3 << c_rice_param) && + (c_rice_param < 4 || rice_adaptation_enabled)) + ++c_rice_param; - } -- } -- } else { -- int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); -- -- trans_coeff_level = 1 + last_coeff_abs_level_remaining; -- if (trans_coeff_level > (3 << c_rice_param)) -- c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? 
c_rice_param + 1 : FFMIN(c_rice_param + 1, 4); -- if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) { -- int c_rice_p_init = lc->stat_coeff[sb_type] / 4; -- if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init)) -- lc->stat_coeff[sb_type]++; -- else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init)) -- if (lc->stat_coeff[sb_type] > 0) -- lc->stat_coeff[sb_type]--; -- rice_init = 1; -- } ++ } + } while (coded_vals != 0); - } -- if (s->ps.pps->sign_data_hiding_flag && sign_hidden) { -- sum_abs += trans_coeff_level; -- if (n == first_nz_pos_in_cg && (sum_abs&1)) -- trans_coeff_level = -trans_coeff_level; ++ } + + // sign_hidden = 0 or 1 so we can combine the tests + if ((sign_hidden & sum_abs) != 0) { + levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1]; - } -- if (coeff_sign_flag >> 15) -- trans_coeff_level = -trans_coeff_level; -- coeff_sign_flag <<= 1; -- if(!lc->cu.cu_transquant_bypass_flag) { -- if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) { -- if(y_c || x_c || log2_trafo_size < 4) { -- switch(log2_trafo_size) { -- case 3: pos = (y_c << 3) + x_c; break; -- case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break; -- case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break; -- default: pos = (y_c << 2) + x_c; break; -- } -- scale_m = scale_matrix[pos]; -- } else { -- scale_m = dc_scale; -- } ++ } + + bypass_finish(&lc->cc); + @@ -9367,14 +10713,7 @@ index 853fd3f722..e8e6ad3b1a 100644 + blk_coeffs[0] = trans_scale_sat( + (levels[m] ^ k) - k, scale, dc_scale, shift); + --m; - } -- trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift; -- if(trans_coeff_level < 0) { -- if((~trans_coeff_level) & 0xFffffffffff8000) -- trans_coeff_level = -32768; -- } else { -- if(trans_coeff_level & 0xffffffffffff8000) -- trans_coeff_level = 32767; ++ } + +#if !USE_N_END_1 + // If N_END_1 set then m was at least 1 initially @@ -9392,112 +10731,104 @@ index 853fd3f722..e8e6ad3b1a 100644 + blk_scale[xy_off->scale], + shift); + } while (--m >= 0); - } - } -- coeffs[y_c * trafo_size + x_c] = trans_coeff_level; ++ } ++ } + - } - } -- } ++ } ++ } + } while ((i = next_subset(lc, i, c_idx_nz, + significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0); - - if (lc->cu.cu_transquant_bypass_flag) { - if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -@@ -1480,7 +2227,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); - } - } else { -- if (transform_skip_flag) { ++ ++ if (lc->cu.cu_transquant_bypass_flag) { ++ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && ++ (pred_mode_intra == 10 || pred_mode_intra == 26))) { ++ int mode = s->ps.sps->implicit_rdpcm_enabled_flag ? 
(pred_mode_intra == 26) : explicit_rdpcm_dir_flag; ++ ++ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); ++ } ++ } else { + if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass - int rot = s->ps.sps->transform_skip_rotation_enabled_flag && - log2_trafo_size == 2 && - lc->cu.pred_mode == MODE_INTRA; -@@ -1500,10 +2247,23 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - } - } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { - s->hevcdsp.transform_4x4_luma(coeffs); -- } else { ++ int rot = s->ps.sps->transform_skip_rotation_enabled_flag && ++ log2_trafo_size == 2 && ++ lc->cu.pred_mode == MODE_INTRA; ++ if (rot) { ++ for (i = 0; i < 8; i++) ++ FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]); ++ } ++ ++ s->hevcdsp.dequant(coeffs, log2_trafo_size); ++ ++ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && ++ lc->cu.pred_mode == MODE_INTRA && ++ (pred_mode_intra == 10 || pred_mode_intra == 26))) { ++ int mode = explicit_rdpcm_flag ? explicit_rdpcm_dir_flag : (pred_mode_intra == 26); ++ ++ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); ++ } ++ } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { ++ s->hevcdsp.transform_4x4_luma(coeffs); + } -+#ifdef RPI + else if (!use_vpu) -+#else -+ else -+#endif + { - int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); - if (max_xy == 0) -- s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); ++ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); ++ if (max_xy == 0) + { -+#ifdef RPI + if (use_dc) + rpi_add_dc(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs); + else -+#endif + s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); + } - else { - int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; - if (max_xy < 4) -@@ -1517,36 +2277,158 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - } - } - if (lc->tu.cross_pf) { -- int16_t *coeffs_y = (int16_t*)lc->edge_emu_buffer; ++ else { ++ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; ++ if (max_xy < 4) ++ col_limit = FFMIN(4, col_limit); ++ else if (max_xy < 8) ++ col_limit = FFMIN(8, col_limit); ++ else if (max_xy < 12) ++ col_limit = FFMIN(24, col_limit); ++ s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit); ++ } ++ } ++ } ++ if (lc->tu.cross_pf) { + int16_t * const coeffs_y = (int16_t*)lc->edge_emu_buffer; + const int ccount = 1 << (log2_trafo_size * 2); - -- for (i = 0; i < (trafo_size * trafo_size); i++) { ++ + for (i = 0; i < ccount; i++) { - coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); - } - } -+#ifdef RPI ++ coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); ++ } ++ } ++ + if (!use_dc) -+ { + rpi_add_residual(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs); -+ } -+#else - s->hevcdsp.add_residual[log2_trafo_size-2](dst, coeffs, stride); -+#endif - } - --void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size) ++} ++ +#if !USE_BY22 +// Stores results to lc -+void ff_hevc_hls_mvd_coding(HEVCLocalContext * const lc) - { -- HEVCLocalContext *lc = s->HEVClc; -- int x = abs_mvd_greater0_flag_decode(s); -- int y = abs_mvd_greater0_flag_decode(s); ++void ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc) ++{ + int x = abs_mvd_greater0_flag_decode(lc); + int y = abs_mvd_greater0_flag_decode(lc); - - if (x) -- x += abs_mvd_greater1_flag_decode(s); ++ ++ if (x) + x += 
abs_mvd_greater1_flag_decode(lc); - if (y) -- y += abs_mvd_greater1_flag_decode(s); ++ if (y) + y += abs_mvd_greater1_flag_decode(lc); - - switch (x) { -- case 2: lc->pu.mvd.x = mvd_decode(s); break; -- case 1: lc->pu.mvd.x = mvd_sign_flag_decode(s); break; ++ ++ switch (x) { + case 2: lc->pu.mvd.x = mvd_decode(lc); break; + case 1: lc->pu.mvd.x = mvd_sign_flag_decode(lc); break; - case 0: lc->pu.mvd.x = 0; break; - } - - switch (y) { -- case 2: lc->pu.mvd.y = mvd_decode(s); break; -- case 1: lc->pu.mvd.y = mvd_sign_flag_decode(s); break; ++ case 0: lc->pu.mvd.x = 0; break; ++ } ++ ++ switch (y) { + case 2: lc->pu.mvd.y = mvd_decode(lc); break; + case 1: lc->pu.mvd.y = mvd_sign_flag_decode(lc); break; - case 0: lc->pu.mvd.y = 0; break; - } - } ++ case 0: lc->pu.mvd.y = 0; break; ++ } ++} +#else -+void ff_hevc_hls_mvd_coding(HEVCLocalContext * const lc) ++void ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc) +{ + int x = abs_mvd_greater0_flag_decode(lc); + int y = abs_mvd_greater0_flag_decode(lc); @@ -9602,7 +10933,7 @@ index 853fd3f722..e8e6ad3b1a 100644 + lc->pu.mvd.y = (y ^ s) - s; + // don't care about b anymore + } - ++ + get_cabac_by22_flush(cc, n, val); + bypass_finish(cc); + } @@ -9610,149 +10941,289 @@ index 853fd3f722..e8e6ad3b1a 100644 +// printf("BY: X=%d,Y=%d\n", lc->pu.mvd.x, lc->pu.mvd.y); +} +#endif -diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c -index b53f4cc721..9982cff40f 100644 ---- a/libavcodec/hevc_filter.c -+++ b/libavcodec/hevc_filter.c -@@ -22,6 +22,12 @@ - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - +diff --git a/libavcodec/rpi_hevc_data.c b/libavcodec/rpi_hevc_data.c +new file mode 100644 +index 0000000000..341bb77d9d +--- /dev/null ++++ b/libavcodec/rpi_hevc_data.c +@@ -0,0 +1,75 @@ ++/* ++ * HEVC shared tables ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
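/*
 * Editorial aside, not part of the patch: the "(x ^ s) - s" idiom used in
 * the bypass MVD decode above is a branchless conditional negate. The sign
 * mask s is either 0 (value unchanged) or -1 (all bits set), in which case
 * (v ^ -1) - (-1) == ~v + 1 == -v by two's-complement arithmetic. A minimal
 * sketch with a hypothetical helper name:
 */
static inline int apply_sign_mask(int v, int s)  /* s must be 0 or -1 */
{
    return (v ^ s) - s;     /* e.g. apply_sign_mask(5, -1) == -5 */
}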
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include ++ ++#include "rpi_hevc_data.h" ++ ++const uint8_t ff_hevc_rpi_diag_scan4x4_x[16] = { ++ 0, 0, 1, 0, ++ 1, 2, 0, 1, ++ 2, 3, 1, 2, ++ 3, 2, 3, 3, ++}; ++ ++const uint8_t ff_hevc_rpi_diag_scan4x4_y[16] = { ++ 0, 1, 0, 2, ++ 1, 0, 3, 2, ++ 1, 0, 3, 2, ++ 1, 3, 2, 3, ++}; ++ ++const uint8_t ff_hevc_rpi_diag_scan8x8_x[64] = { ++ 0, 0, 1, 0, ++ 1, 2, 0, 1, ++ 2, 3, 0, 1, ++ 2, 3, 4, 0, ++ 1, 2, 3, 4, ++ 5, 0, 1, 2, ++ 3, 4, 5, 6, ++ 0, 1, 2, 3, ++ 4, 5, 6, 7, ++ 1, 2, 3, 4, ++ 5, 6, 7, 2, ++ 3, 4, 5, 6, ++ 7, 3, 4, 5, ++ 6, 7, 4, 5, ++ 6, 7, 5, 6, ++ 7, 6, 7, 7, ++}; ++ ++const uint8_t ff_hevc_rpi_diag_scan8x8_y[64] = { ++ 0, 1, 0, 2, ++ 1, 0, 3, 2, ++ 1, 0, 4, 3, ++ 2, 1, 0, 5, ++ 4, 3, 2, 1, ++ 0, 6, 5, 4, ++ 3, 2, 1, 0, ++ 7, 6, 5, 4, ++ 3, 2, 1, 0, ++ 7, 6, 5, 4, ++ 3, 2, 1, 7, ++ 6, 5, 4, 3, ++ 2, 7, 6, 5, ++ 4, 3, 7, 6, ++ 5, 4, 7, 6, ++ 5, 7, 6, 7, ++}; +diff --git a/libavcodec/rpi_hevc_data.h b/libavcodec/rpi_hevc_data.h +new file mode 100644 +index 0000000000..0aee673d8b +--- /dev/null ++++ b/libavcodec/rpi_hevc_data.h +@@ -0,0 +1,31 @@ ++/* ++ * HEVC shared data tables ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_RPI_HEVC_DATA_H ++#define AVCODEC_RPI_HEVC_DATA_H ++ ++#include ++ ++extern const uint8_t ff_hevc_rpi_diag_scan4x4_x[16]; ++extern const uint8_t ff_hevc_rpi_diag_scan4x4_y[16]; ++extern const uint8_t ff_hevc_rpi_diag_scan8x8_x[64]; ++extern const uint8_t ff_hevc_rpi_diag_scan8x8_y[64]; ++ ++#endif /* AVCODEC_RPI_HEVC_DATA_H */ +diff --git a/libavcodec/rpi_hevc_filter.c b/libavcodec/rpi_hevc_filter.c +new file mode 100644 +index 0000000000..a1d6d56b04 +--- /dev/null ++++ b/libavcodec/rpi_hevc_filter.c +@@ -0,0 +1,1067 @@ ++/* ++ * HEVC video decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2013 Seppo Tomperi ++ * Copyright (C) 2013 Wassim Hamidouche ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
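/*
 * Editorial aside, not part of the patch: the scan tables added above follow
 * the HEVC up-right diagonal order (anti-diagonals d = x + y visited in
 * increasing d, each walked from bottom-left to top-right). A hypothetical
 * self-check, assuming rpi_hevc_data.h and <stdio.h> are included, that
 * regenerates the 4x4 order and compares it against the tables:
 */
static void check_diag_scan4x4(void)
{
    int k = 0;
    for (int d = 0; d <= 6; d++) {                 /* anti-diagonal x + y */
        const int ymax = d < 3 ? d : 3;
        const int ymin = d > 3 ? d - 3 : 0;
        for (int y = ymax; y >= ymin; y--, k++) {  /* walk up-right       */
            const int x = d - y;
            if (ff_hevc_rpi_diag_scan4x4_x[k] != x ||
                ff_hevc_rpi_diag_scan4x4_y[k] != y)
                printf("scan mismatch at position %d\n", k);
        }
    }
}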
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ +//#define DISABLE_SAO +//#define DISABLE_DEBLOCK +//#define DISABLE_STRENGTHS +// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames) +//#define DISABLE_DEBLOCK_NONREF + - #include "libavutil/common.h" - #include "libavutil/internal.h" - -@@ -30,6 +36,16 @@ - - #include "bit_depth_template.c" - -+#ifdef RPI ++#include "libavutil/common.h" ++#include "libavutil/internal.h" ++ ++#include "cabac_functions.h" ++#include "rpi_hevcdec.h" ++ ++#include "bit_depth_template.c" ++ +#include "rpi_qpu.h" -+#endif -+#if RPI_HEVC_SAND +#include "rpi_zc.h" +#include "libavutil/rpi_sand_fns.h" -+#else -+#define RPI_ZC_SAND_8_IN_10_BUF 0 -+#endif + - #define LUMA 0 - #define CB 1 - #define CR 2 -@@ -75,14 +91,13 @@ static int chroma_tc(HEVCContext *s, int qp_y, int c_idx, int tc_offset) - return tctable[idxt]; - } - --static int get_qPy_pred(HEVCContext *s, int xBase, int yBase, int log2_cb_size) -+static inline int get_qPy_pred(const HEVCContext * const s, HEVCLocalContext * const lc, int xBase, int yBase, int log2_cb_size) - { -- HEVCLocalContext *lc = s->HEVClc; - int ctb_size_mask = (1 << s->ps.sps->log2_ctb_size) - 1; -- int MinCuQpDeltaSizeMask = (1 << (s->ps.sps->log2_ctb_size - -- s->ps.pps->diff_cu_qp_delta_depth)) - 1; -- int xQgBase = xBase - (xBase & MinCuQpDeltaSizeMask); -- int yQgBase = yBase - (yBase & MinCuQpDeltaSizeMask); -+ int MinCuQpDeltaSizeMask = ~((1 << (s->ps.sps->log2_ctb_size - -+ s->ps.pps->diff_cu_qp_delta_depth)) - 1); -+ int xQgBase = xBase & MinCuQpDeltaSizeMask; -+ int yQgBase = yBase & MinCuQpDeltaSizeMask; - int min_cb_width = s->ps.sps->min_cb_width; - int x_cb = xQgBase >> s->ps.sps->log2_min_cb_size; - int y_cb = yQgBase >> s->ps.sps->log2_min_cb_size; -@@ -90,54 +105,43 @@ static int get_qPy_pred(HEVCContext *s, int xBase, int yBase, int log2_cb_size) - (xQgBase & ctb_size_mask); - int availableB = (yBase & ctb_size_mask) && - (yQgBase & ctb_size_mask); -- int qPy_pred, qPy_a, qPy_b; -- -- // qPy_pred -- if (lc->first_qp_group || (!xQgBase && !yQgBase)) { -- lc->first_qp_group = !lc->tu.is_cu_qp_delta_coded; -- qPy_pred = s->sh.slice_qp; -- } else { -- qPy_pred = lc->qPy_pred; -- } -+ const int qPy_pred = lc->qPy_pred; - -- // qPy_a -- if (availableA == 0) -- qPy_a = qPy_pred; -- else -- qPy_a = s->qp_y_tab[(x_cb - 1) + y_cb * min_cb_width]; -- -- // qPy_b -- if (availableB == 0) -- qPy_b = qPy_pred; -- else -- qPy_b = s->qp_y_tab[x_cb + (y_cb - 1) * min_cb_width]; -- -- av_assert2(qPy_a >= -s->ps.sps->qp_bd_offset && qPy_a < 52); -- av_assert2(qPy_b >= -s->ps.sps->qp_bd_offset && qPy_b < 52); -- -- return (qPy_a + qPy_b + 1) >> 1; ++#define LUMA 0 ++#define CB 1 ++#define CR 2 ++ ++static const uint8_t tctable[54] = { ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // QP 0...18 ++ 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, // QP 19...37 ++ 5, 5, 6, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 20, 22, 24 // QP 38...53 ++}; ++ ++static const uint8_t betatable[52] = { ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, // QP 0...18 ++ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, // QP 19...37 ++ 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64 // QP 38...51 ++}; ++ ++static int chroma_tc(HEVCRpiContext *s, int qp_y, int c_idx, int 
tc_offset) ++{ ++ static const int qp_c[] = { ++ 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 ++ }; ++ int qp, qp_i, offset, idxt; ++ ++ // slice qp offset is not used for deblocking ++ if (c_idx == 1) ++ offset = s->ps.pps->cb_qp_offset; ++ else ++ offset = s->ps.pps->cr_qp_offset; ++ ++ qp_i = av_clip(qp_y + offset, 0, 57); ++ if (ctx_cfmt(s) == 1) { ++ if (qp_i < 30) ++ qp = qp_i; ++ else if (qp_i > 43) ++ qp = qp_i - 6; ++ else ++ qp = qp_c[qp_i - 30]; ++ } else { ++ qp = av_clip(qp_i, 0, 51); ++ } ++ ++ idxt = av_clip(qp + DEFAULT_INTRA_TC_OFFSET + tc_offset, 0, 53); ++ return tctable[idxt]; ++} ++ ++static inline int get_qPy_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase, int log2_cb_size) ++{ ++ int ctb_size_mask = (1 << s->ps.sps->log2_ctb_size) - 1; ++ int MinCuQpDeltaSizeMask = ~((1 << (s->ps.sps->log2_ctb_size - ++ s->ps.pps->diff_cu_qp_delta_depth)) - 1); ++ int xQgBase = xBase & MinCuQpDeltaSizeMask; ++ int yQgBase = yBase & MinCuQpDeltaSizeMask; ++ int min_cb_width = s->ps.sps->min_cb_width; ++ int x_cb = xQgBase >> s->ps.sps->log2_min_cb_size; ++ int y_cb = yQgBase >> s->ps.sps->log2_min_cb_size; ++ int availableA = (xBase & ctb_size_mask) && ++ (xQgBase & ctb_size_mask); ++ int availableB = (yBase & ctb_size_mask) && ++ (yQgBase & ctb_size_mask); ++ const int qPy_pred = lc->qPy_pred; ++ + return ((!availableA ? qPy_pred : s->qp_y_tab[(x_cb - 1) + y_cb * min_cb_width]) + + (!availableB ? qPy_pred : s->qp_y_tab[x_cb + (y_cb - 1) * min_cb_width]) + 1) >> 1; - } - --void ff_hevc_set_qPy(HEVCContext *s, int xBase, int yBase, int log2_cb_size) ++} ++ +// * Only called from bitstream decode in foreground +// so should be safe -+void ff_hevc_set_qPy(const HEVCContext * const s, HEVCLocalContext * const lc, int xBase, int yBase, int log2_cb_size) - { -- int qp_y = get_qPy_pred(s, xBase, yBase, log2_cb_size); ++void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase, int log2_cb_size) ++{ + const int qp_y = get_qPy_pred(s, lc, xBase, yBase, log2_cb_size); - -- if (s->HEVClc->tu.cu_qp_delta != 0) { ++ + if (lc->tu.cu_qp_delta != 0) { - int off = s->ps.sps->qp_bd_offset; -- s->HEVClc->qp_y = FFUMOD(qp_y + s->HEVClc->tu.cu_qp_delta + 52 + 2 * off, ++ int off = s->ps.sps->qp_bd_offset; + lc->qp_y = FFUMOD(qp_y + lc->tu.cu_qp_delta + 52 + 2 * off, - 52 + off) - off; - } else -- s->HEVClc->qp_y = qp_y; ++ 52 + off) - off; ++ } else + lc->qp_y = qp_y; - } - --static int get_qPy(HEVCContext *s, int xC, int yC) -+static int get_qPy(const HEVCContext * const s, const int xC, const int yC) - { -- int log2_min_cb_size = s->ps.sps->log2_min_cb_size; -- int x = xC >> log2_min_cb_size; -- int y = yC >> log2_min_cb_size; ++} ++ ++static int get_qPy(const HEVCRpiContext * const s, const int xC, const int yC) ++{ + const int log2_min_cb_size = s->ps.sps->log2_min_cb_size; + const int x = xC >> log2_min_cb_size; + const int y = yC >> log2_min_cb_size; - return s->qp_y_tab[x + y * s->ps.sps->min_cb_width]; - } - -+static inline unsigned int pixel_shift(const HEVCContext * const s, const unsigned int c_idx) ++ return s->qp_y_tab[x + y * s->ps.sps->min_cb_width]; ++} ++ ++static inline unsigned int pixel_shift(const HEVCRpiContext * const s, const unsigned int c_idx) +{ -+#if RPI_HEVC_SAND -+ return c_idx != 0 && av_rpi_is_sand_frame(s->frame) ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift; -+#else -+ return s->ps.sps->pixel_shift; -+#endif ++ return c_idx != 0 ? 
1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift; ++} ++ ++static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src) ++{ ++int i, j; ++ ++ if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) { ++ for (i = 0; i < height; i++) { ++ for (j = 0; j < width; j+=8) ++ AV_COPY64U(dst+j, src+j); ++ dst += stride_dst; ++ src += stride_src; ++ } ++ } else { ++ for (i = 0; i < height; i++) { ++ for (j = 0; j < width; j+=16) ++ AV_COPY128(dst+j, src+j); ++ dst += stride_dst; ++ src += stride_src; ++ } ++ } +} + - static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height, - ptrdiff_t stride_dst, ptrdiff_t stride_src) - { -@@ -160,12 +164,21 @@ int i, j; - } - } - +// "DSP" these? - static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift) - { -- if (pixel_shift) -- *(uint16_t *)dst = *(uint16_t *)src; -- else -- *dst = *src; ++static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift) ++{ + switch (pixel_shift) + { + case 2: @@ -9765,25 +11236,13 @@ index b53f4cc721..9982cff40f 100644 + *dst = *src; + break; + } - } - - static void copy_vert(uint8_t *dst, const uint8_t *src, -@@ -173,26 +186,37 @@ static void copy_vert(uint8_t *dst, const uint8_t *src, - ptrdiff_t stride_dst, ptrdiff_t stride_src) - { - int i; -- if (pixel_shift == 0) { -- for (i = 0; i < height; i++) { -- *dst = *src; -- dst += stride_dst; -- src += stride_src; -- } -- } else { -- for (i = 0; i < height; i++) { -- *(uint16_t *)dst = *(uint16_t *)src; -- dst += stride_dst; -- src += stride_src; -- } ++} ++ ++static void copy_vert(uint8_t *dst, const uint8_t *src, ++ int pixel_shift, int height, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src) ++{ ++ int i; + switch (pixel_shift) + { + case 2: @@ -9807,56 +11266,68 @@ index b53f4cc721..9982cff40f 100644 + src += stride_src; + } + break; - } - } - --static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src, -+static void copy_CTB_to_hv(const HEVCContext * const s, const uint8_t * const src, - ptrdiff_t stride_src, int x, int y, int width, int height, - int c_idx, int x_ctb, int y_ctb) - { -- int sh = s->ps.sps->pixel_shift; ++ } ++} ++ ++static void copy_CTB_to_hv(const HEVCRpiContext * const s, const uint8_t * const src, ++ ptrdiff_t stride_src, int x, int y, int width, int height, ++ int c_idx, int x_ctb, int y_ctb) ++{ + const unsigned int sh = pixel_shift(s, c_idx); - int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; - int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; - -@@ -208,7 +232,8 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src, - copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src); - } - --static void restore_tqb_pixels(HEVCContext *s, ++ const unsigned int w = s->ps.sps->width >> ctx_hshift(s, c_idx); ++ const unsigned int h = s->ps.sps->height >> ctx_vshift(s, c_idx); ++ ++ /* copy horizontal edges */ ++ memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb) * w + x) << sh), ++ src, width << sh); ++ memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 1) * w + x) << sh), ++ src + stride_src * (height - 1), width << sh); ++ ++ /* copy vertical edges */ ++ copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src); ++ ++ copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src); ++} ++ +// N.B. 
Src & dst are swapped as this is a restore! -+static void restore_tqb_pixels(const HEVCContext * const s, - uint8_t *src1, const uint8_t *dst1, - ptrdiff_t stride_src, ptrdiff_t stride_dst, - int x0, int y0, int width, int height, int c_idx) -@@ -223,13 +248,14 @@ static void restore_tqb_pixels(HEVCContext *s, - int y_min = ((y0 ) >> s->ps.sps->log2_min_pu_size); - int x_max = ((x0 + width ) >> s->ps.sps->log2_min_pu_size); - int y_max = ((y0 + height) >> s->ps.sps->log2_min_pu_size); -- int len = (min_pu_size >> hshift) << s->ps.sps->pixel_shift; ++static void restore_tqb_pixels(const HEVCRpiContext * const s, ++ uint8_t *src1, const uint8_t *dst1, ++ ptrdiff_t stride_src, ptrdiff_t stride_dst, ++ int x0, int y0, int width, int height, int c_idx) ++{ ++ if ( s->ps.pps->transquant_bypass_enable_flag || ++ (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) { ++ int x, y; ++ int min_pu_size = 1 << s->ps.sps->log2_min_pu_size; ++ const unsigned int hshift = ctx_hshift(s, c_idx); ++ const unsigned int vshift = ctx_vshift(s, c_idx); ++ int x_min = ((x0 ) >> s->ps.sps->log2_min_pu_size); ++ int y_min = ((y0 ) >> s->ps.sps->log2_min_pu_size); ++ int x_max = ((x0 + width ) >> s->ps.sps->log2_min_pu_size); ++ int y_max = ((y0 + height) >> s->ps.sps->log2_min_pu_size); + const unsigned int sh = pixel_shift(s, c_idx); + int len = (min_pu_size >> hshift) << sh; - for (y = y_min; y < y_max; y++) { - for (x = x_min; x < x_max; x++) { - if (s->is_pcm[y * s->ps.sps->min_pu_width + x]) { - int n; -- uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift); -- const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift); ++ for (y = y_min; y < y_max; y++) { ++ for (x = x_min; x < x_max; x++) { ++ if (s->is_pcm[y * s->ps.sps->min_pu_width + x]) { ++ int n; + uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh); + const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh); - for (n = 0; n < (min_pu_size >> vshift); n++) { - memcpy(src, dst, len); - src += stride_src; -@@ -243,10 +269,15 @@ static void restore_tqb_pixels(HEVCContext *s, - - #define CTB(tab, x, y) ((tab)[(y) * s->ps.sps->ctb_width + (x)]) - --static void sao_filter_CTB(HEVCContext *s, int x, int y) -+static void sao_filter_CTB(const HEVCContext * const s, const int x, const int y) - { -- static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 }; -- HEVCLocalContext *lc = s->HEVClc; ++ for (n = 0; n < (min_pu_size >> vshift); n++) { ++ memcpy(src, dst, len); ++ src += stride_src; ++ dst += stride_dst; ++ } ++ } ++ } ++ } ++ } ++} ++ ++#define CTB(tab, x, y) ((tab)[(y) * s->ps.sps->ctb_width + (x)]) ++ ++static void sao_filter_CTB(const HEVCRpiContext * const s, const int x, const int y) ++{ +#if SAO_FILTER_N == 5 + static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; +#elif SAO_FILTER_N == 6 @@ -9864,50 +11335,82 @@ index b53f4cc721..9982cff40f 100644 +#else +#error Confused by size of sao fn array +#endif - int c_idx; - int edges[4]; // 0 left 1 top 2 right 3 bottom - int x_ctb = x >> s->ps.sps->log2_ctb_size; -@@ -266,12 +297,22 @@ 
static void sao_filter_CTB(HEVCContext *s, int x, int y) - uint8_t right_tile_edge = 0; - uint8_t up_tile_edge = 0; - uint8_t bottom_tile_edge = 0; -+#if RPI_HEVC_SAND -+ const int sliced = av_rpi_is_sand_frame(s->frame); -+ const int plane_count = sliced ? 2 : (s->ps.sps->chroma_format_idc ? 3 : 1); -+#else -+ const int plane_count = (s->ps.sps->chroma_format_idc ? 3 : 1); -+#endif - - edges[0] = x_ctb == 0; - edges[1] = y_ctb == 0; - edges[2] = x_ctb == s->ps.sps->ctb_width - 1; - edges[3] = y_ctb == s->ps.sps->ctb_height - 1; - ++ int c_idx; ++ int edges[4]; // 0 left 1 top 2 right 3 bottom ++ int x_ctb = x >> s->ps.sps->log2_ctb_size; ++ int y_ctb = y >> s->ps.sps->log2_ctb_size; ++ int ctb_addr_rs = y_ctb * s->ps.sps->ctb_width + x_ctb; ++ int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs]; ++ SAOParams *sao = &CTB(s->sao, x_ctb, y_ctb); ++ // flags indicating unfilterable edges ++ uint8_t vert_edge[] = { 0, 0 }; ++ uint8_t horiz_edge[] = { 0, 0 }; ++ uint8_t diag_edge[] = { 0, 0, 0, 0 }; ++ uint8_t lfase = CTB(s->filter_slice_edges, x_ctb, y_ctb); ++ uint8_t no_tile_filter = s->ps.pps->tiles_enabled_flag && ++ !s->ps.pps->loop_filter_across_tiles_enabled_flag; ++ uint8_t restore = no_tile_filter || !lfase; ++ uint8_t left_tile_edge = 0; ++ uint8_t right_tile_edge = 0; ++ uint8_t up_tile_edge = 0; ++ uint8_t bottom_tile_edge = 0; ++ const int sliced = 1; ++ const int plane_count = sliced ? 2 : (ctx_cfmt(s) != 0 ? 3 : 1); ++ ++ edges[0] = x_ctb == 0; ++ edges[1] = y_ctb == 0; ++ edges[2] = x_ctb == s->ps.sps->ctb_width - 1; ++ edges[3] = y_ctb == s->ps.sps->ctb_height - 1; ++ +#ifdef DISABLE_SAO + return; +#endif + - if (restore) { - if (!edges[0]) { - left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; -@@ -303,7 +344,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) - } - } - -- for (c_idx = 0; c_idx < (s->ps.sps->chroma_format_idc ? 
3 : 1); c_idx++) { ++ if (restore) { ++ if (!edges[0]) { ++ left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; ++ vert_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb)) || left_tile_edge; ++ } ++ if (!edges[2]) { ++ right_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1]]; ++ vert_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb)) || right_tile_edge; ++ } ++ if (!edges[1]) { ++ up_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]]; ++ horiz_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) || up_tile_edge; ++ } ++ if (!edges[3]) { ++ bottom_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs + s->ps.sps->ctb_width]]; ++ horiz_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb + 1)) || bottom_tile_edge; ++ } ++ if (!edges[0] && !edges[1]) { ++ diag_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge; ++ } ++ if (!edges[1] && !edges[2]) { ++ diag_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb - 1)) || right_tile_edge || up_tile_edge; ++ } ++ if (!edges[2] && !edges[3]) { ++ diag_edge[2] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb + 1)) || right_tile_edge || bottom_tile_edge; ++ } ++ if (!edges[0] && !edges[3]) { ++ diag_edge[3] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb + 1)) || left_tile_edge || bottom_tile_edge; ++ } ++ } ++ + for (c_idx = 0; c_idx < plane_count; c_idx++) { - int x0 = x >> s->ps.sps->hshift[c_idx]; - int y0 = y >> s->ps.sps->vshift[c_idx]; - ptrdiff_t stride_src = s->frame->linesize[c_idx]; -@@ -312,28 +353,91 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) - int width = FFMIN(ctb_size_h, (s->ps.sps->width >> s->ps.sps->hshift[c_idx]) - x0); - int height = FFMIN(ctb_size_v, (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0); - int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; -- uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)]; - ptrdiff_t stride_dst; - uint8_t *dst; - -+#if RPI_HEVC_SAND ++ const unsigned int vshift = ctx_vshift(s, c_idx); ++ const unsigned int hshift = ctx_hshift(s, c_idx); ++ const int x0 = x >> hshift; ++ const int y0 = y >> vshift; ++ const ptrdiff_t stride_src = frame_stride1(s->frame, c_idx); ++ int ctb_size_h = (1 << (s->ps.sps->log2_ctb_size)) >> hshift; ++ int ctb_size_v = (1 << (s->ps.sps->log2_ctb_size)) >> vshift; ++ int width = FFMIN(ctb_size_h, (s->ps.sps->width >> hshift) - x0); ++ const int height = FFMIN(ctb_size_v, (s->ps.sps->height >> vshift) - y0); ++ int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; ++ ptrdiff_t stride_dst; ++ uint8_t *dst; ++ + const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0); + const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; + uint8_t * const src = !sliced ? 
@@ -9926,43 +11429,22 @@ index b53f4cc721..9982cff40f 100644 + av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) : + av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0); + -+ + if (sliced && c_idx > 1) { + break; + } -+#else -+ const unsigned int sh = s->ps.sps->pixel_shift; -+ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; -+ uint8_t * const src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)]; -+ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : src - (1 << sh); -+ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : src + (width << sh); -+#endif + - switch (sao->type_idx[c_idx]) { - case SAO_BAND: - copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, - x_ctb, y_ctb); - if (s->ps.pps->transquant_bypass_enable_flag || - (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) { -- dst = lc->edge_emu_buffer; -- stride_dst = 2*MAX_PB_SIZE; -- copy_CTB(dst, src, width << s->ps.sps->pixel_shift, height, stride_dst, stride_src); -- s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, -- sao->offset_val[c_idx], sao->band_position[c_idx], -- width, height); -- restore_tqb_pixels(s, src, dst, stride_src, stride_dst, -- x, y, width, height, c_idx); -+#ifdef RPI ++ switch (sao->type_idx[c_idx]) { ++ case SAO_BAND: ++ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, ++ x_ctb, y_ctb); ++ if (s->ps.pps->transquant_bypass_enable_flag || ++ (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) { + // Can't use the edge buffer here as it may be in use by the foreground + DECLARE_ALIGNED(64, uint8_t, dstbuf) + [2*MAX_PB_SIZE*MAX_PB_SIZE]; -+#else -+ uint8_t * const dstbuf = s->HEVClc->edge_emu_buffer; -+#endif + dst = dstbuf; + stride_dst = 2*MAX_PB_SIZE; + copy_CTB(dst, src, width << sh, height, stride_dst, stride_src); -+#if RPI_HEVC_SAND + if (sliced && c_idx != 0) + { + s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, @@ -9971,7 +11453,6 @@ index b53f4cc721..9982cff40f 100644 + width, height); + } + else -+#endif + { + s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, + sao->offset_val[c_idx], sao->band_position[c_idx], @@ -9979,178 +11460,111 @@ index b53f4cc721..9982cff40f 100644 + } + restore_tqb_pixels(s, src, dst, stride_src, stride_dst, + x, y, width, height, c_idx); - } else { -- s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, -- sao->offset_val[c_idx], sao->band_position[c_idx], -- width, height); -+#if RPI_HEVC_SAND ++ } else { + if (sliced && c_idx != 0) + { -+// printf("x,y=%d,%d data[1]=%p, src=%p\n", x0, y0, s->frame->data[1], src); -+ + s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src, + sao->offset_val[1], sao->band_position[1], + sao->offset_val[2], sao->band_position[2], + width, height); + } + else -+#endif + { + s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, + sao->offset_val[c_idx], sao->band_position[c_idx], + width, height); + } - } - sao->type_idx[c_idx] = SAO_APPLIED; - break; -@@ -341,108 +445,125 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) - { - int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; - int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; -- int left_edge = edges[0]; - int top_edge = edges[1]; -- int right_edge = edges[2]; - int bottom_edge = edges[3]; -- int sh = s->ps.sps->pixel_shift; -- int left_pixels, right_pixels; -+#ifdef RPI ++ } ++ sao->type_idx[c_idx] = SAO_APPLIED; ++ 
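/*
 * Editorial aside, not part of the patch: a minimal 8-bit reference for what
 * the sao_band_filter calls above compute, following the generic hevcdsp
 * template rather than this patch's interleaved-chroma variant (the name and
 * signature here are illustrative). Each sample's top five bits select one
 * of 32 bands; only the four consecutive bands starting at band_position
 * carry a non-zero offset.
 */
static void sao_band_filter_ref(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride_dst, ptrdiff_t stride_src,
                                const int16_t *offset_val, /* [0] unused */
                                int band_position, int width, int height)
{
    int16_t offset_table[32] = { 0 };
    for (int k = 0; k < 4; k++)
        offset_table[(band_position + k) & 31] = offset_val[k + 1];
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++)   /* src[x] >> 3 selects the band */
            dst[x] = av_clip_uint8(src[x] + offset_table[src[x] >> 3]);
        dst += stride_dst;
        src += stride_src;
    }
}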
break; ++ case SAO_EDGE: ++ { ++ const int w = s->ps.sps->width >> hshift; ++ const int h = s->ps.sps->height >> vshift; ++ int top_edge = edges[1]; ++ int bottom_edge = edges[3]; + // Can't use the edge buffer here as it may be in use by the foreground + DECLARE_ALIGNED(64, uint8_t, dstbuf) + [2*(MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE)*(MAX_PB_SIZE + 2) + 64]; -+#else -+ uint8_t * const dstbuf = s->HEVClc->edge_emu_buffer; -+#endif - - stride_dst = 2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE; -- dst = lc->edge_emu_buffer + stride_dst + AV_INPUT_BUFFER_PADDING_SIZE; ++ ++ stride_dst = 2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE; + dst = dstbuf + stride_dst + AV_INPUT_BUFFER_PADDING_SIZE; - - if (!top_edge) { -- int left = 1 - left_edge; -- int right = 1 - right_edge; -- const uint8_t *src1[2]; - uint8_t *dst1; -- int src_idx, pos; ++ ++ if (!top_edge) { ++ uint8_t *dst1; + int src_idx; + const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh); + + dst1 = dst - stride_dst; - -- dst1 = dst - stride_dst - (left << sh); -- src1[0] = src - stride_src - (left << sh); -- src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0 - left) << sh); -- pos = 0; -- if (left) { ++ + if (src_l != NULL) { - src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] == - SAO_APPLIED); -- copy_pixel(dst1, src1[src_idx], sh); -- pos += (1 << sh); ++ src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] == ++ SAO_APPLIED); + copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh); - } ++ } + - src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] == - SAO_APPLIED); -- memcpy(dst1 + pos, src1[src_idx] + pos, width << sh); -- if (right) { -- pos += width << sh; ++ src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] == ++ SAO_APPLIED); + memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh); + + if (src_r != NULL) { - src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] == - SAO_APPLIED); -- copy_pixel(dst1 + pos, src1[src_idx] + pos, sh); ++ src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] == ++ SAO_APPLIED); + copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh); - } - } - if (!bottom_edge) { -- int left = 1 - left_edge; -- int right = 1 - right_edge; -- const uint8_t *src1[2]; -- uint8_t *dst1; -- int src_idx, pos; ++ } ++ } ++ if (!bottom_edge) { + uint8_t * const dst1 = dst + height * stride_dst; + int src_idx; + const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh); + const unsigned int hoff = height * stride_src; - -- dst1 = dst + height * stride_dst - (left << sh); -- src1[0] = src + height * stride_src - (left << sh); -- src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0 - left) << sh); -- pos = 0; -- if (left) { ++ + if (src_l != NULL) { - src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] == - SAO_APPLIED); -- copy_pixel(dst1, src1[src_idx], sh); -- pos += (1 << sh); ++ src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] == ++ SAO_APPLIED); + copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh); - } ++ } + - src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] == - SAO_APPLIED); -- memcpy(dst1 + pos, src1[src_idx] + pos, width << sh); -- if (right) { -- pos += width << sh; ++ src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] == ++ SAO_APPLIED); + memcpy(dst1, src_idx ? 
src_spb : src + hoff, width << sh); + + if (src_r != NULL) { - src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] == - SAO_APPLIED); -- copy_pixel(dst1 + pos, src1[src_idx] + pos, sh); ++ src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] == ++ SAO_APPLIED); + copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r + hoff, sh); - } - } -- left_pixels = 0; -- if (!left_edge) { ++ } ++ } + if (src_l != NULL) { - if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { - copy_vert(dst - (1 << sh), - s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh), - sh, height, stride_dst, 1 << sh); - } else { -- left_pixels = 1; ++ if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { ++ copy_vert(dst - (1 << sh), ++ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh), ++ sh, height, stride_dst, 1 << sh); ++ } else { + copy_vert(dst - (1 << sh), + src_l, + sh, height, stride_dst, stride_src); - } - } -- right_pixels = 0; -- if (!right_edge) { ++ } ++ } + if (src_r != NULL) { - if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { - copy_vert(dst + (width << sh), - s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh), - sh, height, stride_dst, 1 << sh); - } else { -- right_pixels = 1; ++ if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { ++ copy_vert(dst + (width << sh), ++ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh), ++ sh, height, stride_dst, 1 << sh); ++ } else { + copy_vert(dst + (width << sh), + src_r, + sh, height, stride_dst, stride_src); - } - } - -- copy_CTB(dst - (left_pixels << sh), -- src - (left_pixels << sh), -- (width + left_pixels + right_pixels) << sh, ++ } ++ } ++ + copy_CTB(dst, + src, + width << sh, - height, stride_dst, stride_src); - - copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, - x_ctb, y_ctb); -- s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], -- sao->eo_class[c_idx], width, height); -- s->hevcdsp.sao_edge_restore[restore](src, dst, -- stride_src, stride_dst, -- sao, -- edges, width, -- height, c_idx, -- vert_edge, -- horiz_edge, -- diag_edge); -+#if RPI_HEVC_SAND ++ height, stride_dst, stride_src); ++ ++ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, ++ x_ctb, y_ctb); + if (sliced && c_idx != 0) + { + // Class always the same for both U & V (which is just as well :-)) @@ -10167,7 +11581,6 @@ index b53f4cc721..9982cff40f 100644 + diag_edge); + } + else -+#endif + { + s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], + sao->eo_class[c_idx], width, height); @@ -10181,19 +11594,19 @@ index b53f4cc721..9982cff40f 100644 + diag_edge); + } + // ??? Does this actually work for chroma ??? 
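/*
 * Editorial aside, not part of the patch: a reference sketch of the SAO edge
 * classification performed by the sao_edge_filter calls above (signature
 * illustrative; the real DSP entry points differ). Each sample is compared
 * with its two neighbours along the eo_class direction, which is why the
 * bordered copy built above must supply one valid sample beyond each
 * filtered edge.
 */
#define SIGN3(a, b) (((a) > (b)) - ((a) < (b)))         /* -1, 0 or +1 */
static void sao_edge_filter_ref(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride_dst, ptrdiff_t stride_src,
                                const int16_t *offset_val, int eo_class,
                                int width, int height)
{
    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; /* sum -> offset */
    static const int8_t pos[4][2][2] = {
        { { -1,  0 }, {  1,  0 } },    /* 0: horizontal */
        { {  0, -1 }, {  0,  1 } },    /* 1: vertical   */
        { { -1, -1 }, {  1,  1 } },    /* 2: 45 degree  */
        { {  1, -1 }, { -1,  1 } },    /* 3: 135 degree */
    };
    const ptrdiff_t a = pos[eo_class][0][0] + pos[eo_class][0][1] * stride_src;
    const ptrdiff_t b = pos[eo_class][1][0] + pos[eo_class][1][1] * stride_src;
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            const int sum = SIGN3(src[x], src[x + a]) +
                            SIGN3(src[x], src[x + b]);
            dst[x] = av_clip_uint8(src[x] + offset_val[edge_idx[2 + sum]]);
        }
        dst += stride_dst;
        src += stride_src;
    }
}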
- restore_tqb_pixels(s, src, dst, stride_src, stride_dst, - x, y, width, height, c_idx); - sao->type_idx[c_idx] = SAO_APPLIED; -@@ -450,8 +571,30 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) - } - } - } ++ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, ++ x, y, width, height, c_idx); ++ sao->type_idx[c_idx] = SAO_APPLIED; ++ break; ++ } ++ } ++ } + +#if RPI_ZC_SAND_8_IN_10_BUF + if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL && + (((x + (1 << (s->ps.sps->log2_ctb_size))) & 255) == 0 || edges[2])) + { -+ const unsigned int stride1 = s->frame->linesize[0]; ++ const unsigned int stride1 = frame_stride1(s->frame, 1); + const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame); + const unsigned int xoff = (x >> 8) * stride2 * stride1; + const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size); @@ -10209,25 +11622,52 @@ index b53f4cc721..9982cff40f 100644 + av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3); + } +#endif - } - ++} ++ +// Returns 2 or 0. - static int get_pcm(HEVCContext *s, int x, int y) - { - int log2_min_pu_size = s->ps.sps->log2_min_pu_size; -@@ -478,7 +621,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) - uint8_t *src; - int x, y; - int chroma, beta; -- int32_t c_tc[2], tc[2]; -+ int32_t c_tc[4], tc[2]; - uint8_t no_p[2] = { 0 }; - uint8_t no_q[2] = { 0 }; - -@@ -495,6 +638,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) - s->ps.sps->pcm.loop_filter_disable_flag) || - s->ps.pps->transquant_bypass_enable_flag; - ++static int get_pcm(HEVCRpiContext *s, int x, int y) ++{ ++ int log2_min_pu_size = s->ps.sps->log2_min_pu_size; ++ int x_pu, y_pu; ++ ++ if (x < 0 || y < 0) ++ return 2; ++ ++ x_pu = x >> log2_min_pu_size; ++ y_pu = y >> log2_min_pu_size; ++ ++ if (x_pu >= s->ps.sps->min_pu_width || y_pu >= s->ps.sps->min_pu_height) ++ return 2; ++ return s->is_pcm[y_pu * s->ps.sps->min_pu_width + x_pu]; ++} ++ ++#define TC_CALC(qp, bs) \ ++ tctable[av_clip((qp) + DEFAULT_INTRA_TC_OFFSET * ((bs) - 1) + \ ++ (tc_offset & -2), \ ++ 0, MAX_QP + DEFAULT_INTRA_TC_OFFSET)] ++ ++static void deblocking_filter_CTB(HEVCRpiContext *s, int x0, int y0) ++{ ++ uint8_t *src; ++ int x, y; ++ int beta; ++ int32_t tc[2]; ++ uint8_t no_p[2] = { 0 }; ++ uint8_t no_q[2] = { 0 }; ++ ++ int log2_ctb_size = s->ps.sps->log2_ctb_size; ++ int x_end, x_end2, y_end; ++ int ctb_size = 1 << log2_ctb_size; ++ int ctb = (x0 >> log2_ctb_size) + ++ (y0 >> log2_ctb_size) * s->ps.sps->ctb_width; ++ int cur_tc_offset = s->deblock[ctb].tc_offset; ++ int cur_beta_offset = s->deblock[ctb].beta_offset; ++ int left_tc_offset, left_beta_offset; ++ int tc_offset, beta_offset; ++ int pcmf = (s->ps.sps->pcm_enabled_flag && ++ s->ps.sps->pcm.loop_filter_disable_flag) || ++ s->ps.pps->transquant_bypass_enable_flag; ++ +#ifdef DISABLE_DEBLOCK_NONREF + if (!s->used_for_ref) + return; // Don't deblock non-reference frames @@ -10237,87 +11677,82 @@ index b53f4cc721..9982cff40f 100644 +#endif + if (!s->used_for_ref && s->avctx->skip_loop_filter >= AVDISCARD_NONREF) + return; - if (x0) { - left_tc_offset = s->deblock[ctb - 1].tc_offset; - left_beta_offset = s->deblock[ctb - 1].beta_offset; -@@ -528,19 +680,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) - - tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; - tc[1] = bs1 ? 
TC_CALC(qp, bs1) : 0; -- src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; - if (pcmf) { - no_p[0] = get_pcm(s, x - 1, y); - no_p[1] = get_pcm(s, x - 1, y + 4); - no_q[0] = get_pcm(s, x, y); - no_q[1] = get_pcm(s, x, y + 4); -- s->hevcdsp.hevc_v_loop_filter_luma_c(src, -- s->frame->linesize[LUMA], -- beta, tc, no_p, no_q); -- } else -- s->hevcdsp.hevc_v_loop_filter_luma(src, -- s->frame->linesize[LUMA], -- beta, tc, no_p, no_q); -+ } -+#if RPI_HEVC_SAND -+ if (av_rpi_is_sand_frame(s->frame)) { -+ -+ // This copes properly with no_p/no_q -+ s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), -+ s->frame->linesize[LUMA], -+ beta, tc, no_p, no_q, -+ av_rpi_sand_frame_pos_y(s->frame, x - 4, y)); -+ } -+ else -+#endif -+ { -+ src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; -+ if (pcmf) { -+ // Standard DSP code is broken if no_p / no_q is set -+ s->hevcdsp.hevc_v_loop_filter_luma_c(src, -+ s->frame->linesize[LUMA], -+ beta, tc, no_p, no_q); -+ } -+ else -+#ifdef RPI_DEBLOCK_VPU -+ if (s->enable_rpi_deblock) { -+ uint8_t (*setup)[2][2][4]; -+ int num16 = (y>>4)*s->setup_width + (x>>4); -+ int a = ((y>>3) & 1) << 1; -+ int b = (x>>3) & 1; -+ setup = s->dvq->y_setup_arm[num16]; -+ setup[0][b][0][a] = beta; -+ setup[0][b][0][a + 1] = beta; -+ setup[0][b][1][a] = tc[0]; -+ setup[0][b][1][a + 1] = tc[1]; -+ } else -+#endif -+ { -+ s->hevcdsp.hevc_v_loop_filter_luma(src, -+ s->frame->linesize[LUMA], -+ beta, tc, no_p, no_q); -+ } ++ if (x0) { ++ left_tc_offset = s->deblock[ctb - 1].tc_offset; ++ left_beta_offset = s->deblock[ctb - 1].beta_offset; ++ } else { ++ left_tc_offset = 0; ++ left_beta_offset = 0; ++ } ++ ++ x_end = x0 + ctb_size; ++ if (x_end > s->ps.sps->width) ++ x_end = s->ps.sps->width; ++ y_end = y0 + ctb_size; ++ if (y_end > s->ps.sps->height) ++ y_end = s->ps.sps->height; ++ ++ tc_offset = cur_tc_offset; ++ beta_offset = cur_beta_offset; ++ ++ x_end2 = x_end; ++ if (x_end2 != s->ps.sps->width) ++ x_end2 -= 8; ++ for (y = y0; y < y_end; y += 8) { ++ // vertical filtering luma ++ for (x = x0 ? x0 : 8; x < x_end; x += 8) { ++ const int bs0 = s->vertical_bs[(x + y * s->bs_width) >> 2]; ++ const int bs1 = s->vertical_bs[(x + (y + 4) * s->bs_width) >> 2]; ++ if (bs0 || bs1) { ++ const int qp = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1; ++ ++ beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)]; ++ ++ tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; ++ tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; ++ if (pcmf) { ++ no_p[0] = get_pcm(s, x - 1, y); ++ no_p[1] = get_pcm(s, x - 1, y + 4); ++ no_q[0] = get_pcm(s, x, y); ++ no_q[1] = get_pcm(s, x, y + 4); + } - } - } - -@@ -560,7 +744,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) - beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)]; - tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; - tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; -- src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; -+ src = -+#if RPI_HEVC_SAND -+ av_rpi_is_sand_frame(s->frame) ? 
-+ av_rpi_sand_frame_pos_y(s->frame, x, y) : -+#endif -+ &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; - if (pcmf) { - no_p[0] = get_pcm(s, x, y - 1); - no_p[1] = get_pcm(s, x + 4, y - 1); -@@ -570,6 +759,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) - s->frame->linesize[LUMA], - beta, tc, no_p, no_q); - } else ++ ++ // This copes properly with no_p/no_q ++ s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), ++ frame_stride1(s->frame, LUMA), ++ beta, tc, no_p, no_q, ++ av_rpi_sand_frame_pos_y(s->frame, x - 4, y)); ++ // *** VPU deblock lost here ++ } ++ } ++ ++ if(!y) ++ continue; ++ ++ // horizontal filtering luma ++ for (x = x0 ? x0 - 8 : 0; x < x_end2; x += 8) { ++ const int bs0 = s->horizontal_bs[( x + y * s->bs_width) >> 2]; ++ const int bs1 = s->horizontal_bs[((x + 4) + y * s->bs_width) >> 2]; ++ if (bs0 || bs1) { ++ const int qp = (get_qPy(s, x, y - 1) + get_qPy(s, x, y) + 1) >> 1; ++ ++ tc_offset = x >= x0 ? cur_tc_offset : left_tc_offset; ++ beta_offset = x >= x0 ? cur_beta_offset : left_beta_offset; ++ ++ beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)]; ++ tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; ++ tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; ++ src = av_rpi_sand_frame_pos_y(s->frame, x, y); ++ ++ if (pcmf) { ++ no_p[0] = get_pcm(s, x, y - 1); ++ no_p[1] = get_pcm(s, x + 4, y - 1); ++ no_q[0] = get_pcm(s, x, y); ++ no_q[1] = get_pcm(s, x + 4, y); ++ s->hevcdsp.hevc_h_loop_filter_luma_c(src, ++ frame_stride1(s->frame, LUMA), ++ beta, tc, no_p, no_q); ++ } else +#ifdef RPI_DEBLOCK_VPU + if (s->enable_rpi_deblock) { + uint8_t (*setup)[2][2][4]; @@ -10331,260 +11766,113 @@ index b53f4cc721..9982cff40f 100644 + setup[1][b][1][a + 1] = tc[1]; + } else +#endif - s->hevcdsp.hevc_h_loop_filter_luma(src, - s->frame->linesize[LUMA], - beta, tc, no_p, no_q); -@@ -578,6 +780,96 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) - } - - if (s->ps.sps->chroma_format_idc) { -+#if RPI_HEVC_SAND -+ if (av_rpi_is_sand_frame(s->frame)) { -+ const int v = 2; -+ const int h = 2; -+ -+ // vertical filtering chroma -+ for (y = y0; y < y_end; y += 8 * v) { ++ s->hevcdsp.hevc_h_loop_filter_luma(src, ++ frame_stride1(s->frame, LUMA), ++ beta, tc, no_p, no_q); ++ } ++ } ++ } ++ ++ if (ctx_cfmt(s) != 0) { ++ const int v = 2; ++ const int h = 2; ++ ++ // vertical filtering chroma ++ for (y = y0; y < y_end; y += 8 * v) { +// const int demi_y = y + 4 * v >= s->ps.sps->height; -+ const int demi_y = 0; -+ for (x = x0 ? x0 : 8 * h; x < x_end; x += 8 * h) { -+ const int bs0 = s->vertical_bs[(x + y * s->bs_width) >> 2]; -+ const int bs1 = s->vertical_bs[(x + (y + 4 * v) * s->bs_width) >> 2]; -+ -+ if ((bs0 == 2) || (bs1 == 2)) { -+ const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1; -+ const int qp1 = (get_qPy(s, x - 1, y + 4 * v) + get_qPy(s, x, y + 4 * v) + 1) >> 1; -+ unsigned int no_f = !demi_y ? 0 : 2 | 8; -+ -+ // tc_offset here should be set to cur_tc_offset I think -+ const uint32_t tc4 = -+ ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, cur_tc_offset) | (chroma_tc(s, qp0, 2, cur_tc_offset) << 16)) | -+ ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); -+ -+ if (tc4 == 0) -+ continue; ++ const int demi_y = 0; ++ for (x = x0 ? 
x0 : 8 * h; x < x_end; x += 8 * h) { ++ const int bs0 = s->vertical_bs[(x + y * s->bs_width) >> 2]; ++ const int bs1 = s->vertical_bs[(x + (y + 4 * v) * s->bs_width) >> 2]; + -+ if (pcmf) { -+ no_f = -+ (get_pcm(s, x - 1, y) ? 1 : 0) | -+ (get_pcm(s, x - 1, y + 4 * v) ? 2 : 0) | -+ (get_pcm(s, x, y) ? 4 : 0) | -+ (get_pcm(s, x, y + 4 * v) ? 8 : 0); -+ if (no_f == 0xf) -+ continue; -+ } ++ if ((bs0 == 2) || (bs1 == 2)) { ++ const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1; ++ const int qp1 = (get_qPy(s, x - 1, y + 4 * v) + get_qPy(s, x, y + 4 * v) + 1) >> 1; ++ unsigned int no_f = !demi_y ? 0 : 2 | 8; ++ ++ // tc_offset here should be set to cur_tc_offset I think ++ const uint32_t tc4 = ++ ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, cur_tc_offset) | (chroma_tc(s, qp0, 2, cur_tc_offset) << 16)) | ++ ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); + -+ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), -+ s->frame->linesize[1], -+ tc4, -+ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), -+ no_f); ++ if (tc4 == 0) ++ continue; ++ ++ if (pcmf) { ++ no_f = ++ (get_pcm(s, x - 1, y) ? 1 : 0) | ++ (get_pcm(s, x - 1, y + 4 * v) ? 2 : 0) | ++ (get_pcm(s, x, y) ? 4 : 0) | ++ (get_pcm(s, x, y + 4 * v) ? 8 : 0); ++ if (no_f == 0xf) ++ continue; + } ++ ++ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), ++ frame_stride1(s->frame, 1), ++ tc4, ++ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), ++ no_f); + } ++ } + -+ if (y == 0) -+ continue; ++ if (y == 0) ++ continue; + -+ // horizontal filtering chroma -+ tc_offset = x0 ? left_tc_offset : cur_tc_offset; -+ x_end2 = x_end; -+ if (x_end != s->ps.sps->width) -+ x_end2 = x_end - 8 * h; ++ // horizontal filtering chroma ++ tc_offset = x0 ? left_tc_offset : cur_tc_offset; ++ x_end2 = x_end; ++ if (x_end != s->ps.sps->width) ++ x_end2 = x_end - 8 * h; + -+ for (x = x0 ? x0 - 8 * h: 0; x < x_end2; x += 8 * h) { ++ for (x = x0 ? x0 - 8 * h: 0; x < x_end2; x += 8 * h) { +// const int demi_x = x + 4 * v >= s->ps.sps->width; -+ const int demi_x = 0; -+ -+ const int bs0 = s->horizontal_bs[( x + y * s->bs_width) >> 2]; -+ const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2]; -+ if ((bs0 == 2) || (bs1 == 2)) { -+ const int qp0 = bs0 == 2 ? (get_qPy(s, x, y - 1) + get_qPy(s, x, y) + 1) >> 1 : 0; -+ const int qp1 = bs1 == 2 ? (get_qPy(s, x + 4 * h, y - 1) + get_qPy(s, x + 4 * h, y) + 1) >> 1 : 0; -+ const uint32_t tc4 = -+ ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, tc_offset) | (chroma_tc(s, qp0, 2, tc_offset) << 16)) | -+ ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); -+ unsigned int no_f = !demi_x ? 0 : 2 | 8; -+ -+ if (tc4 == 0) -+ continue; ++ const int demi_x = 0; + -+ if (pcmf) { -+ no_f = -+ (get_pcm(s, x, y - 1) ? 1 : 0) | -+ (get_pcm(s, x + 4 * h, y - 1) ? 2 : 0) | -+ (get_pcm(s, x, y) ? 4 : 0) | -+ (get_pcm(s, x + 4 * h, y) ? 8 : 0); ++ const int bs0 = s->horizontal_bs[( x + y * s->bs_width) >> 2]; ++ const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2]; ++ if ((bs0 == 2) || (bs1 == 2)) { ++ const int qp0 = bs0 == 2 ? (get_qPy(s, x, y - 1) + get_qPy(s, x, y) + 1) >> 1 : 0; ++ const int qp1 = bs1 == 2 ? (get_qPy(s, x + 4 * h, y - 1) + get_qPy(s, x + 4 * h, y) + 1) >> 1 : 0; ++ const uint32_t tc4 = ++ ((bs0 != 2) ? 
0 : chroma_tc(s, qp0, 1, tc_offset) | (chroma_tc(s, qp0, 2, tc_offset) << 16)) | ++ ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); ++ unsigned int no_f = !demi_x ? 0 : 2 | 8; + -+ if (no_f == 0xf) -+ continue; -+ } ++ if (tc4 == 0) ++ continue; ++ ++ if (pcmf) { ++ no_f = ++ (get_pcm(s, x, y - 1) ? 1 : 0) | ++ (get_pcm(s, x + 4 * h, y - 1) ? 2 : 0) | ++ (get_pcm(s, x, y) ? 4 : 0) | ++ (get_pcm(s, x + 4 * h, y) ? 8 : 0); + -+ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), -+ s->frame->linesize[1], -+ tc4, no_f); ++ if (no_f == 0xf) ++ continue; + } ++ ++ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), ++ frame_stride1(s->frame, LUMA), ++ tc4, no_f); + } + } ++ // **** VPU deblock code gone from here.... + } -+ else -+#endif - for (chroma = 1; chroma <= 2; chroma++) { - int h = 1 << s->ps.sps->hshift[chroma]; - int v = 1 << s->ps.sps->vshift[chroma]; -@@ -594,7 +886,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) - - c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0; - c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0; -- src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; -+ src = -+#if RPI_HEVC_SAND -+ av_rpi_is_sand_frame(s->frame) ? -+ av_rpi_sand_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : -+#endif -+ &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; - if (pcmf) { - no_p[0] = get_pcm(s, x - 1, y); - no_p[1] = get_pcm(s, x - 1, y + (4 * v)); -@@ -604,9 +901,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) - s->frame->linesize[chroma], - c_tc, no_p, no_q); - } else -+#ifdef RPI_DEBLOCK_VPU -+ if (s->enable_rpi_deblock) { -+ uint8_t (*setup)[2][2][4]; -+ int xc = x>>s->ps.sps->hshift[chroma]; -+ int yc = y>>s->ps.sps->vshift[chroma]; -+ int num16 = (yc>>4)*s->uv_setup_width + (xc>>4); -+ int a = ((yc>>3) & 1) << 1; -+ int b = (xc>>3) & 1; -+ setup = s->dvq->uv_setup_arm[num16]; -+ setup[0][b][0][a] = c_tc[0]; -+ setup[0][b][0][a + 1] = c_tc[1]; -+ } else -+#endif - s->hevcdsp.hevc_v_loop_filter_chroma(src, - s->frame->linesize[chroma], - c_tc, no_p, no_q); ++ } ++} + - } - } - -@@ -627,7 +938,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) - - c_tc[0] = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset) : 0; - c_tc[1] = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0; -- src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; -+ src = -+#if RPI_HEVC_SAND -+ av_rpi_is_sand_frame(s->frame) ? 
-+ av_rpi_sand_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : -+#endif -+ &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; - if (pcmf) { - no_p[0] = get_pcm(s, x, y - 1); - no_p[1] = get_pcm(s, x + (4 * h), y - 1); -@@ -637,6 +953,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) - s->frame->linesize[chroma], - c_tc, no_p, no_q); - } else -+#ifdef RPI_DEBLOCK_VPU -+ if (s->enable_rpi_deblock) { -+ uint8_t (*setup)[2][2][4]; -+ int xc = x>>s->ps.sps->hshift[chroma]; -+ int yc = y>>s->ps.sps->vshift[chroma]; -+ int num16 = (yc>>4)*s->uv_setup_width + (xc>>4); -+ int a = ((xc>>3) & 1) << 1; -+ int b = (yc>>3) & 1; -+ setup = s->dvq->uv_setup_arm[num16]; -+ setup[1][b][0][a] = c_tc[0]; -+ setup[1][b][0][a + 1] = c_tc[1]; -+ } else -+#endif - s->hevcdsp.hevc_h_loop_filter_chroma(src, - s->frame->linesize[chroma], - c_tc, no_p, no_q); -@@ -647,83 +976,31 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) - } - } - --static int boundary_strength(HEVCContext *s, MvField *curr, MvField *neigh, -- RefPicList *neigh_refPicList) --{ -- if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) { -- // same L0 and L1 -- if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]] && -- s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] && -- neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) { -- if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 || -- FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) && -- (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 || -- FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)) -- return 1; -- else -- return 0; -- } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] && -- neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) { -- if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 || -- FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) -- return 1; -- else -- return 0; -- } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] && -- neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) { -- if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 || -- FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4) -- return 1; -- else -- return 0; -- } else { -- return 1; -- } -- } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV -- Mv A, B; -- int ref_A, ref_B; -- -- if (curr->pred_flag & 1) { -- A = curr->mv[0]; -- ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]]; -- } else { -- A = curr->mv[1]; -- ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]]; -- } -- -- if (neigh->pred_flag & 1) { -- B = neigh->mv[0]; -- ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]]; -- } else { -- B = neigh->mv[1]; -- ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]]; -- } -- -- if (ref_A == ref_B) { -- if (FFABS(A.x - B.x) >= 4 || FFABS(A.y - B.y) >= 4) -- return 1; -- else -- return 0; -- } 
else -- return 1; -- } -- -- return 1; --} - --void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, -+void ff_hevc_deblocking_boundary_strengths(const HEVCContext * const s, HEVCLocalContext * const lc, int x0, int y0, - int log2_trafo_size) - { -- HEVCLocalContext *lc = s->HEVClc; - MvField *tab_mvf = s->ref->tab_mvf; - int log2_min_pu_size = s->ps.sps->log2_min_pu_size; - int log2_min_tu_size = s->ps.sps->log2_min_tb_size; - int min_pu_width = s->ps.sps->min_pu_width; - int min_tu_width = s->ps.sps->min_tb_width; -- int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width + -- (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA; - int boundary_upper, boundary_left; -- int i, j, bs; ++ ++void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, ++ int log2_trafo_size) ++{ ++ MvField *tab_mvf = s->ref->tab_mvf; ++ int log2_min_pu_size = s->ps.sps->log2_min_pu_size; ++ int log2_min_tu_size = s->ps.sps->log2_min_tb_size; ++ int min_pu_width = s->ps.sps->min_pu_width; ++ int min_tu_width = s->ps.sps->min_tb_width; ++ int boundary_upper, boundary_left; + int i, j; + const RefPicList *rpl = s->ref->refPicList; + const unsigned int log2_dup = FFMIN(log2_min_pu_size, log2_trafo_size); @@ -10600,24 +11888,22 @@ index b53f4cc721..9982cff40f 100644 +#ifdef DISABLE_STRENGTHS + return; +#endif - - boundary_upper = y0 > 0 && !(y0 & 7); - if (boundary_upper && -@@ -735,34 +1012,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, - (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) - boundary_upper = 0; - ++ ++ boundary_upper = y0 > 0 && !(y0 & 7); ++ if (boundary_upper && ++ ((!s->sh.slice_loop_filter_across_slices_enabled_flag && ++ lc->boundary_flags & BOUNDARY_UPPER_SLICE && ++ (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0) || ++ (!s->ps.pps->loop_filter_across_tiles_enabled_flag && ++ lc->boundary_flags & BOUNDARY_UPPER_TILE && ++ (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) ++ boundary_upper = 0; ++ + bs = &s->horizontal_bs[(x0 + y0 * s->bs_width) >> 2]; + - if (boundary_upper) { -- RefPicList *rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ? ++ if (boundary_upper) { + const RefPicList *const rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ? 
- ff_hevc_get_ref_list(s, s->ref, x0, y0 - 1) : -- s->ref->refPicList; -- int yp_pu = (y0 - 1) >> log2_min_pu_size; -- int yq_pu = y0 >> log2_min_pu_size; -- int yp_tu = (y0 - 1) >> log2_min_tu_size; -- int yq_tu = y0 >> log2_min_tu_size; ++ ff_hevc_rpi_get_ref_list(s, s->ref, x0, y0 - 1) : + rpl; + MvField *top = curr - min_pu_width; + @@ -10635,22 +11921,8 @@ index b53f4cc721..9982cff40f 100644 + min_pu_in_4pix, sizeof (MvField), 4 >> 2, + rpl[0].list, rpl[1].list, rpl_top[0].list, rpl_top[1].list, + curr, top, bs); - - for (i = 0; i < (1 << log2_trafo_size); i += 4) { -- int x_pu = (x0 + i) >> log2_min_pu_size; -- int x_tu = (x0 + i) >> log2_min_tu_size; -- MvField *top = &tab_mvf[yp_pu * min_pu_width + x_pu]; -- MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu]; -- uint8_t top_cbf_luma = s->cbf_luma[yp_tu * min_tu_width + x_tu]; -- uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu]; -- -- if (curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA) -- bs = 2; -- else if (curr_cbf_luma || top_cbf_luma) -- bs = 1; -- else -- bs = boundary_strength(s, curr, top, rpl_top); -- s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs; ++ ++ for (i = 0; i < (1 << log2_trafo_size); i += 4) { + int i_pu = i >> log2_min_pu_size; + int i_tu = i >> log2_min_tu_size; + @@ -10658,7 +11930,7 @@ index b53f4cc721..9982cff40f 100644 + bs[i >> 2] = 2; + else if (curr_cbf_luma[i_tu] || top_cbf_luma[i_tu]) + bs[i >> 2] = 1; - } ++ } + } + } + @@ -10675,67 +11947,31 @@ index b53f4cc721..9982cff40f 100644 + rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list, + curr, top, bs); + } - } - -- // bs for vertical TU boundaries - boundary_left = x0 > 0 && !(x0 & 7); - if (boundary_left && - ((!s->sh.slice_loop_filter_across_slices_enabled_flag && -@@ -773,64 +1072,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, - (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) - boundary_left = 0; - ++ } ++ ++ boundary_left = x0 > 0 && !(x0 & 7); ++ if (boundary_left && ++ ((!s->sh.slice_loop_filter_across_slices_enabled_flag && ++ lc->boundary_flags & BOUNDARY_LEFT_SLICE && ++ (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0) || ++ (!s->ps.pps->loop_filter_across_tiles_enabled_flag && ++ lc->boundary_flags & BOUNDARY_LEFT_TILE && ++ (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) ++ boundary_left = 0; ++ + curr = &tab_mvf[y_pu * min_pu_width + x_pu]; + bs = &s->vertical_bs[(x0 + y0 * s->bs_width) >> 2]; + - if (boundary_left) { -- RefPicList *rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ? ++ if (boundary_left) { + const RefPicList *rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ? 
- ff_hevc_get_ref_list(s, s->ref, x0 - 1, y0) : -- s->ref->refPicList; -- int xp_pu = (x0 - 1) >> log2_min_pu_size; -- int xq_pu = x0 >> log2_min_pu_size; -- int xp_tu = (x0 - 1) >> log2_min_tu_size; -- int xq_tu = x0 >> log2_min_tu_size; -- -- for (i = 0; i < (1 << log2_trafo_size); i += 4) { -- int y_pu = (y0 + i) >> log2_min_pu_size; -- int y_tu = (y0 + i) >> log2_min_tu_size; -- MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu]; -- MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu]; -- uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu]; -- uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu]; -- -- if (curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA) -- bs = 2; -- else if (curr_cbf_luma || left_cbf_luma) -- bs = 1; -- else -- bs = boundary_strength(s, curr, left, rpl_left); -- s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs; -- } -- } ++ ff_hevc_rpi_get_ref_list(s, s->ref, x0 - 1, y0) : + rpl; + MvField *left = curr - 1; - -- if (log2_trafo_size > log2_min_pu_size && !is_intra) { -- RefPicList *rpl = s->ref->refPicList; ++ + if (is_intra) { + for (j = 0; j < (1 << log2_trafo_size); j += 4) + bs[j * s->bs_width >> 2] = 2; - -- // bs for TU internal horizontal PU boundaries -- for (j = 8; j < (1 << log2_trafo_size); j += 8) { -- int yp_pu = (y0 + j - 1) >> log2_min_pu_size; -- int yq_pu = (y0 + j) >> log2_min_pu_size; -- -- for (i = 0; i < (1 << log2_trafo_size); i += 4) { -- int x_pu = (x0 + i) >> log2_min_pu_size; -- MvField *top = &tab_mvf[yp_pu * min_pu_width + x_pu]; -- MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu]; -- -- bs = boundary_strength(s, curr, top, rpl); -- s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs; ++ + } else { + int y_tu = y0 >> log2_min_tu_size; + int x_tu = x0 >> log2_min_tu_size; @@ -10755,57 +11991,49 @@ index b53f4cc721..9982cff40f 100644 + bs[j * s->bs_width >> 2] = 2; + else if (curr_cbf_luma[j_tu * min_tu_width] || left_cbf_luma[j_tu * min_tu_width]) + bs[j * s->bs_width >> 2] = 1; - } - } ++ } ++ } + } - -- // bs for TU internal vertical PU boundaries -- for (j = 0; j < (1 << log2_trafo_size); j += 4) { -- int y_pu = (y0 + j) >> log2_min_pu_size; ++ + if (!is_intra) { + for (i = inc; i < trafo_in_min_pus; i += inc) { + MvField *left; - -- for (i = 8; i < (1 << log2_trafo_size); i += 8) { -- int xp_pu = (x0 + i - 1) >> log2_min_pu_size; -- int xq_pu = (x0 + i) >> log2_min_pu_size; -- MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu]; -- MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu]; ++ + curr += inc; + left = curr - 1; + bs += inc << log2_min_pu_size >> 2; - -- bs = boundary_strength(s, curr, left, rpl); -- s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs; -- } ++ + s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus, + min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2, + rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list, + curr, left, bs); - } - } - } -@@ -839,39 +1128,119 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, - #undef CB - #undef CR - --void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) ++ } ++ } ++} ++ ++#undef LUMA ++#undef CB ++#undef CR ++ +#ifdef RPI_DEBLOCK_VPU -+// ff_hevc_flush_buffer_lines ++// ff_hevc_rpi_flush_buffer_lines +// flushes and invalidates all pixel rows in [start,end-1] -+static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma) ++static void ff_hevc_rpi_flush_buffer_lines(HEVCRpiContext *s, int 
start, int end, int flush_luma, int flush_chroma) +{ + rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); + rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ 0, start, s->ps.sps->width, end - start, 0, s->ps.sps->vshift[1], flush_luma, flush_chroma); ++ 0, start, s->ps.sps->width, end - start, ctx_vshift(s, 1), flush_luma, flush_chroma); + rpi_cache_flush_finish(rfe); +} + +/* rpi_deblock deblocks an entire row of ctbs using the VPU */ -+static void rpi_deblock(HEVCContext *s, int y, int ctb_size) ++static void rpi_deblock(HEVCRpiContext *s, int y, int ctb_size) +{ ++ int num16high = (ctb_size+15)>>4; // May go over bottom of the image, but setup will be zero for these so should have no effect. ++ // TODO check that image allocation is large enough for this to be okay as well. ++ + // Flush image, 4 lines above to bottom of ctb stripe -+ ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 1); ++ ff_hevc_rpi_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 1); + // TODO flush buffer of beta/tc setup when it becomes cached + + // Prepare three commands at once to avoid calling overhead @@ -10813,23 +12041,23 @@ index b53f4cc721..9982cff40f 100644 + s->dvq->vpu_cmds_arm[0][1] = s->frame->linesize[0]; + s->dvq->vpu_cmds_arm[0][2] = s->setup_width; + s->dvq->vpu_cmds_arm[0][3] = (int) ( s->dvq->y_setup_vc + s->setup_width * (y>>4) ); -+ s->dvq->vpu_cmds_arm[0][4] = ctb_size>>4; ++ s->dvq->vpu_cmds_arm[0][4] = num16high; + s->dvq->vpu_cmds_arm[0][5] = 2; + + s->dvq->vpu_cmds_arm[1][0] = get_vc_address_u(s->frame) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]); + s->dvq->vpu_cmds_arm[1][1] = s->frame->linesize[1]; + s->dvq->vpu_cmds_arm[1][2] = s->uv_setup_width; + s->dvq->vpu_cmds_arm[1][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) ); -+ s->dvq->vpu_cmds_arm[1][4] = (ctb_size>>4)>> s->ps.sps->vshift[1]; ++ s->dvq->vpu_cmds_arm[1][4] = (num16high + 1) >> s->ps.sps->vshift[1]; + s->dvq->vpu_cmds_arm[1][5] = 3; + + s->dvq->vpu_cmds_arm[2][0] = get_vc_address_v(s->frame) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]); + s->dvq->vpu_cmds_arm[2][1] = s->frame->linesize[2]; + s->dvq->vpu_cmds_arm[2][2] = s->uv_setup_width; + s->dvq->vpu_cmds_arm[2][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) ); -+ s->dvq->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1]; ++ s->dvq->vpu_cmds_arm[2][4] = (num16high + 1) >> s->ps.sps->vshift[1]; + s->dvq->vpu_cmds_arm[2][5] = 4; -+ ++ + // Call VPU + { + const vpu_qpu_job_h vqj = vpu_qpu_job_new(); @@ -10846,16 +12074,14 @@ index b53f4cc721..9982cff40f 100644 + +#endif + -+void ff_hevc_hls_filter(HEVCContext * const s, const int x, const int y, const int ctb_size) - { -- int x_end = x >= s->ps.sps->width - ctb_size; ++void ff_hevc_rpi_hls_filter(HEVCRpiContext * const s, const int x, const int y, const int ctb_size) ++{ + const int x_end = x >= s->ps.sps->width - ctb_size; + - if (s->avctx->skip_loop_filter < AVDISCARD_ALL) - deblocking_filter_CTB(s, x, y); ++ if (s->avctx->skip_loop_filter < AVDISCARD_ALL) ++ deblocking_filter_CTB(s, x, y); + +#ifdef RPI_DEBLOCK_VPU -+#error Deblock VPU thoroughly rotted + if (s->enable_rpi_deblock && x_end) + { + int y_at_end = y >= s->ps.sps->height - ctb_size; @@ -10863,16468 +12089,4360 @@ index b53f4cc721..9982cff40f 100644 + int y_start = y&~63; + if (y_at_end) height = s->ps.sps->height - y_start; + if ((((y+ctb_size)&63)==0) || y_at_end) { -+ done_deblock = 1; + 
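+ // Deblock a whole 64-row stripe (y_start = y & ~63) in one go: rpi_deblock()
+ // above flushes those rows from the ARM cache and queues one VPU command per
+ // plane, so batching by stripe rather than by CTB row keeps call overhead down.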
rpi_deblock(s, y_start, height); + } + } +#endif + - if (s->ps.sps->sao_enabled) { - int y_end = y >= s->ps.sps->height - ctb_size; -- if (y && x) ++ if (s->ps.sps->sao_enabled) { ++ int y_end = y >= s->ps.sps->height - ctb_size; + if (y != 0 && x != 0) - sao_filter_CTB(s, x - ctb_size, y - ctb_size); -- if (x && y_end) ++ sao_filter_CTB(s, x - ctb_size, y - ctb_size); + if (x != 0 && y_end) - sao_filter_CTB(s, x - ctb_size, y); -- if (y && x_end) { ++ sao_filter_CTB(s, x - ctb_size, y); + if (y != 0 && x_end) - sao_filter_CTB(s, x, y - ctb_size); -- if (s->threads_type & FF_THREAD_FRAME ) -- ff_thread_report_progress(&s->ref->tf, y, 0); -- } -- if (x_end && y_end) { ++ sao_filter_CTB(s, x, y - ctb_size); + if (x_end && y_end) - sao_filter_CTB(s, x , y); -- if (s->threads_type & FF_THREAD_FRAME ) -- ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0); -- } -- } else if (s->threads_type & FF_THREAD_FRAME && x_end) -- ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); ++ sao_filter_CTB(s, x , y); + } - } - - void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size) - { -- int x_end = x_ctb >= s->ps.sps->width - ctb_size; -- int y_end = y_ctb >= s->ps.sps->height - ctb_size; ++} ++ ++void ff_hevc_rpi_hls_filters(HEVCRpiContext *s, int x_ctb, int y_ctb, int ctb_size) ++{ + // * This can break strict L->R then U->D ordering - mostly it doesn't matter + // Never called if rpi_enabled so no need for cache flush ops + const int x_end = x_ctb >= s->ps.sps->width - ctb_size; + const int y_end = y_ctb >= s->ps.sps->height - ctb_size; - if (y_ctb && x_ctb) - ff_hevc_hls_filter(s, x_ctb - ctb_size, y_ctb - ctb_size, ctb_size); - if (y_ctb && x_end) ++ if (y_ctb && x_ctb) ++ ff_hevc_rpi_hls_filter(s, x_ctb - ctb_size, y_ctb - ctb_size, ctb_size); ++ if (y_ctb && x_end) + { - ff_hevc_hls_filter(s, x_ctb, y_ctb - ctb_size, ctb_size); ++ ff_hevc_rpi_hls_filter(s, x_ctb, y_ctb - ctb_size, ctb_size); + // Signal progress - this is safe for SAO + if (s->threads_type == FF_THREAD_FRAME && y_ctb > ctb_size) -+ ff_hevc_progress_signal_recon(s, y_ctb - ctb_size - 1); ++ ff_hevc_rpi_progress_signal_recon(s, y_ctb - ctb_size - 1); + } - if (x_ctb && y_end) - ff_hevc_hls_filter(s, x_ctb - ctb_size, y_ctb, ctb_size); ++ if (x_ctb && y_end) ++ ff_hevc_rpi_hls_filter(s, x_ctb - ctb_size, y_ctb, ctb_size); + if (x_end && y_end) + { -+ ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size); ++ ff_hevc_rpi_hls_filter(s, x_ctb, y_ctb, ctb_size); + // All done - signal such + if (s->threads_type == FF_THREAD_FRAME) -+ ff_hevc_progress_signal_recon(s, INT_MAX); ++ ff_hevc_rpi_progress_signal_recon(s, INT_MAX); + } - } -diff --git a/libavcodec/hevc_mvs.c b/libavcodec/hevc_mvs.c -index a8f7876b59..ca55da9d81 100644 ---- a/libavcodec/hevc_mvs.c -+++ b/libavcodec/hevc_mvs.c -@@ -39,10 +39,9 @@ static const uint8_t l0_l1_cand_idx[12][2] = { - { 3, 2, }, - }; - --void ff_hevc_set_neighbour_available(HEVCContext *s, int x0, int y0, -- int nPbW, int nPbH) -+void ff_hevc_set_neighbour_available(const HEVCContext * const s, HEVCLocalContext * const lc, const int x0, const int y0, ++} +diff --git a/libavcodec/rpi_hevc_mvs.c b/libavcodec/rpi_hevc_mvs.c +new file mode 100644 +index 0000000000..9db79e658f +--- /dev/null ++++ b/libavcodec/rpi_hevc_mvs.c +@@ -0,0 +1,769 @@ ++/* ++ * HEVC video decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2013 Anand Meher Kotra ++ * ++ * This file is part of FFmpeg. 
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "hevc.h" ++#include "rpi_hevcdec.h" ++ ++static const uint8_t l0_l1_cand_idx[12][2] = { ++ { 0, 1, }, ++ { 1, 0, }, ++ { 0, 2, }, ++ { 2, 0, }, ++ { 1, 2, }, ++ { 2, 1, }, ++ { 0, 3, }, ++ { 3, 0, }, ++ { 1, 3, }, ++ { 3, 1, }, ++ { 2, 3, }, ++ { 3, 2, }, ++}; ++ ++void ff_hevc_rpi_set_neighbour_available(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, + const int nPbW, const int nPbH) - { -- HEVCLocalContext *lc = s->HEVClc; - int x0b = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size); - int y0b = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size); - -@@ -61,8 +60,8 @@ void ff_hevc_set_neighbour_available(HEVCContext *s, int x0, int y0, - /* - * 6.4.1 Derivation process for z-scan order block availability - */ --static av_always_inline int z_scan_block_avail(HEVCContext *s, int xCurr, int yCurr, -- int xN, int yN) -+static av_always_inline int z_scan_block_avail(const HEVCContext * const s, const int xCurr, const int yCurr, ++{ ++ int x0b = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size); ++ int y0b = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size); ++ ++ lc->na.cand_up = (lc->ctb_up_flag || y0b); ++ lc->na.cand_left = (lc->ctb_left_flag || x0b); ++ lc->na.cand_up_left = (!x0b && !y0b) ? lc->ctb_up_left_flag : lc->na.cand_left && lc->na.cand_up; ++ lc->na.cand_up_right_sap = ++ ((x0b + nPbW) == (1 << s->ps.sps->log2_ctb_size)) ? ++ lc->ctb_up_right_flag && !y0b : lc->na.cand_up; ++ lc->na.cand_up_right = ++ lc->na.cand_up_right_sap ++ && (x0 + nPbW) < lc->end_of_tiles_x; ++ lc->na.cand_bottom_left = ((y0 + nPbH) >= lc->end_of_tiles_y) ? 
0 : lc->na.cand_left; ++} ++ ++/* ++ * 6.4.1 Derivation process for z-scan order block availability ++ */ ++static av_always_inline int z_scan_block_avail(const HEVCRpiContext * const s, const int xCurr, const int yCurr, + const int xN, const int yN) - { - #define MIN_TB_ADDR_ZS(x, y) \ - s->ps.pps->min_tb_addr_zs[(y) * (s->ps.sps->tb_mask+2) + (x)] -@@ -83,7 +82,7 @@ static av_always_inline int z_scan_block_avail(HEVCContext *s, int xCurr, int yC - } - - //check if the two luma locations belong to the same motion estimation region --static av_always_inline int is_diff_mer(HEVCContext *s, int xN, int yN, int xP, int yP) -+static av_always_inline int is_diff_mer(const HEVCContext * const s, int xN, int yN, int xP, int yP) - { - uint8_t plevel = s->ps.pps->log2_parallel_merge_level; - -@@ -95,7 +94,7 @@ static av_always_inline int is_diff_mer(HEVCContext *s, int xN, int yN, int xP, - #define MATCH(x) (A.x == B.x) - - // check if the mv's and refidx are the same between A and B --static av_always_inline int compare_mv_ref_idx(struct MvField A, struct MvField B) ++{ ++#define MIN_TB_ADDR_ZS(x, y) \ ++ s->ps.pps->min_tb_addr_zs[(y) * (s->ps.sps->tb_mask+2) + (x)] ++ ++ int xCurr_ctb = xCurr >> s->ps.sps->log2_ctb_size; ++ int yCurr_ctb = yCurr >> s->ps.sps->log2_ctb_size; ++ int xN_ctb = xN >> s->ps.sps->log2_ctb_size; ++ int yN_ctb = yN >> s->ps.sps->log2_ctb_size; ++ if( yN_ctb < yCurr_ctb || xN_ctb < xCurr_ctb ) ++ return 1; ++ else { ++ int Curr = MIN_TB_ADDR_ZS((xCurr >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask, ++ (yCurr >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask); ++ int N = MIN_TB_ADDR_ZS((xN >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask, ++ (yN >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask); ++ return N <= Curr; ++ } ++} ++ ++//check if the two luma locations belong to the same motion estimation region ++static av_always_inline int is_diff_mer(const HEVCRpiContext * const s, int xN, int yN, int xP, int yP) ++{ ++ uint8_t plevel = s->ps.pps->log2_parallel_merge_level; ++ ++ return xN >> plevel == xP >> plevel && ++ yN >> plevel == yP >> plevel; ++} ++ ++#define MATCH_MV(x) (AV_RN32A(&A.x) == AV_RN32A(&B.x)) ++#define MATCH(x) (A.x == B.x) ++ ++// check if the mv's and refidx are the same between A and B +static av_always_inline int compare_mv_ref_idx(const struct MvField A, const struct MvField B) - { - int a_pf = A.pred_flag; - int b_pf = B.pred_flag; -@@ -112,7 +111,7 @@ static av_always_inline int compare_mv_ref_idx(struct MvField A, struct MvField - return 0; - } - --static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb) ++{ ++ int a_pf = A.pred_flag; ++ int b_pf = B.pred_flag; ++ if (a_pf == b_pf) { ++ if (a_pf == PF_BI) { ++ return MATCH(ref_idx[0]) && MATCH_MV(mv[0]) && ++ MATCH(ref_idx[1]) && MATCH_MV(mv[1]); ++ } else if (a_pf == PF_L0) { ++ return MATCH(ref_idx[0]) && MATCH_MV(mv[0]); ++ } else if (a_pf == PF_L1) { ++ return MATCH(ref_idx[1]) && MATCH_MV(mv[1]); ++ } ++ } ++ return 0; ++} ++ +static av_always_inline void mv_scale(Mv * const dst, const Mv * const src, int td, int tb) - { - int tx, scale_factor; - -@@ -126,10 +125,10 @@ static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb) - (scale_factor * src->y < 0)) >> 8); - } - --static int check_mvset(Mv *mvLXCol, Mv *mvCol, -- int colPic, int poc, -- RefPicList *refPicList, int X, int refIdxLx, -- RefPicList *refPicList_col, int listCol, int refidxCol) ++{ ++ int tx, scale_factor; ++ ++ td = av_clip_int8(td); ++ tb = av_clip_int8(tb); ++ tx = (0x4000 + 
abs(td / 2)) / td; ++ scale_factor = av_clip_intp2((tb * tx + 32) >> 6, 12); ++ dst->x = av_clip_int16((scale_factor * src->x + 127 + ++ (scale_factor * src->x < 0)) >> 8); ++ dst->y = av_clip_int16((scale_factor * src->y + 127 + ++ (scale_factor * src->y < 0)) >> 8); ++} ++ +static int check_mvset(Mv * const mvLXCol, const Mv * const mvCol, + const int colPic, const int poc, + const RefPicList * const refPicList, const int X, const int refIdxLx, + const RefPicList * const refPicList_col, const int listCol, const int refidxCol) - { - int cur_lt = refPicList[X].isLongTerm[refIdxLx]; - int col_lt = refPicList_col[listCol].isLongTerm[refidxCol]; -@@ -160,11 +159,11 @@ static int check_mvset(Mv *mvLXCol, Mv *mvCol, - refPicList_col, L ## l, temp_col.ref_idx[l]) - - // derive the motion vectors section 8.5.3.1.8 --static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col, -- int refIdxLx, Mv *mvLXCol, int X, -- int colPic, RefPicList *refPicList_col) -+static int derive_temporal_colocated_mvs(const HEVCContext * const s, const MvField temp_col, ++{ ++ int cur_lt = refPicList[X].isLongTerm[refIdxLx]; ++ int col_lt = refPicList_col[listCol].isLongTerm[refidxCol]; ++ int col_poc_diff, cur_poc_diff; ++ ++ if (cur_lt != col_lt) { ++ mvLXCol->x = 0; ++ mvLXCol->y = 0; ++ return 0; ++ } ++ ++ col_poc_diff = colPic - refPicList_col[listCol].list[refidxCol]; ++ cur_poc_diff = poc - refPicList[X].list[refIdxLx]; ++ ++ if (cur_lt || col_poc_diff == cur_poc_diff || !col_poc_diff) { ++ mvLXCol->x = mvCol->x; ++ mvLXCol->y = mvCol->y; ++ } else { ++ mv_scale(mvLXCol, mvCol, col_poc_diff, cur_poc_diff); ++ } ++ return 1; ++} ++ ++#define CHECK_MVSET(l) \ ++ check_mvset(mvLXCol, temp_col.mv + l, \ ++ colPic, s->poc, \ ++ refPicList, X, refIdxLx, \ ++ refPicList_col, L ## l, temp_col.ref_idx[l]) ++ ++// derive the motion vectors section 8.5.3.1.8 ++static int derive_temporal_colocated_mvs(const HEVCRpiContext * const s, const MvField temp_col, + const int refIdxLx, Mv * const mvLXCol, const int X, + const int colPic, const RefPicList * const refPicList_col) - { -- RefPicList *refPicList = s->ref->refPicList; ++{ + const RefPicList * const refPicList = s->ref->refPicList; - - if (temp_col.pred_flag == PF_INTRA) - return 0; -@@ -215,20 +214,20 @@ static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col, - /* - * 8.5.3.1.7 temporal luma motion vector prediction - */ --static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0, -- int nPbW, int nPbH, int refIdxLx, -- Mv *mvLXCol, int X) -+static int temporal_luma_motion_vector(const HEVCContext * const s, HEVCLocalContext * const lc, const int x0, const int y0, ++ ++ if (temp_col.pred_flag == PF_INTRA) ++ return 0; ++ ++ if (!(temp_col.pred_flag & PF_L0)) ++ return CHECK_MVSET(1); ++ else if (temp_col.pred_flag == PF_L0) ++ return CHECK_MVSET(0); ++ else if (temp_col.pred_flag == PF_BI) { ++ int check_diffpicount = 0; ++ int i, j; ++ for (j = 0; j < 2; j++) { ++ for (i = 0; i < refPicList[j].nb_refs; i++) { ++ if (refPicList[j].list[i] > s->poc) { ++ check_diffpicount++; ++ break; ++ } ++ } ++ } ++ if (!check_diffpicount) { ++ if (X==0) ++ return CHECK_MVSET(0); ++ else ++ return CHECK_MVSET(1); ++ } else { ++ if (s->sh.collocated_list == L1) ++ return CHECK_MVSET(0); ++ else ++ return CHECK_MVSET(1); ++ } ++ } ++ ++ return 0; ++} ++ ++#define TAB_MVF(x, y) \ ++ tab_mvf[(y) * min_pu_width + x] ++ ++#define TAB_MVF_PU(v) \ ++ TAB_MVF(((x ## v) >> s->ps.sps->log2_min_pu_size), \ ++ ((y ## v) >> s->ps.sps->log2_min_pu_size)) 
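++// TAB_MVF addresses the frame's MvField array on the minimum-PU grid;
++// TAB_MVF_PU(v) does the same for a named neighbour position (x##v, y##v),
++// scaling its pixel coordinates down by log2_min_pu_size first (e.g. with
++// 4x4 minimum PUs, pixel (33, 17) lands in grid cell (8, 4)).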
++ ++#define DERIVE_TEMPORAL_COLOCATED_MVS \ ++ derive_temporal_colocated_mvs(s, temp_col, \ ++ refIdxLx, mvLXCol, X, colPic, \ ++ ff_hevc_rpi_get_ref_list(s, ref, x, y)) ++ ++/* ++ * 8.5.3.1.7 temporal luma motion vector prediction ++ */ ++static int temporal_luma_motion_vector(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, + const int nPbW, const int nPbH, const int refIdxLx, + Mv * const mvLXCol, const int X) - { - MvField *tab_mvf; - MvField temp_col; - int x, y, x_pu, y_pu; -- int min_pu_width = s->ps.sps->min_pu_width; ++{ ++ MvField *tab_mvf; ++ MvField temp_col; ++ int x, y, x_pu, y_pu; + const int min_pu_width = s->ps.sps->min_pu_width; - int availableFlagLXCol = 0; - int colPic; - -- HEVCFrame *ref = s->ref->collocated_ref; ++ int availableFlagLXCol = 0; ++ int colPic; ++ + HEVCFrame * const ref = s->ref->collocated_ref; - -- if (!ref) { ++ + if (ref == NULL || ref->tab_mvf == NULL) { - memset(mvLXCol, 0, sizeof(*mvLXCol)); - return 0; - } -@@ -240,14 +239,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0, - x = x0 + nPbW; - y = y0 + nPbH; - -- if (tab_mvf && -- (y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) && ++ memset(mvLXCol, 0, sizeof(*mvLXCol)); ++ return 0; ++ } ++ ++ tab_mvf = ref->tab_mvf; ++ colPic = ref->poc; ++ ++ //bottom right collocated motion vector ++ x = x0 + nPbW; ++ y = y0 + nPbH; ++ + if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) && - y < s->ps.sps->height && - x < s->ps.sps->width) { - x &= ~15; - y &= ~15; - if (s->threads_type == FF_THREAD_FRAME) -- ff_thread_await_progress(&ref->tf, y, 0); -+ ff_hevc_progress_wait_mv(s, lc->jb0, ref, y); - x_pu = x >> s->ps.sps->log2_min_pu_size; - y_pu = y >> s->ps.sps->log2_min_pu_size; - temp_col = TAB_MVF(x_pu, y_pu); -@@ -255,13 +253,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0, - } - - // derive center collocated motion vector -- if (tab_mvf && !availableFlagLXCol) { ++ y < s->ps.sps->height && ++ x < s->ps.sps->width) { ++ x &= ~15; ++ y &= ~15; ++ if (s->threads_type == FF_THREAD_FRAME) ++ ff_hevc_rpi_progress_wait_mv(s, lc->jb0, ref, y); ++ x_pu = x >> s->ps.sps->log2_min_pu_size; ++ y_pu = y >> s->ps.sps->log2_min_pu_size; ++ temp_col = TAB_MVF(x_pu, y_pu); ++ availableFlagLXCol = DERIVE_TEMPORAL_COLOCATED_MVS; ++ } ++ ++ // derive center collocated motion vector + if (!availableFlagLXCol) { - x = x0 + (nPbW >> 1); - y = y0 + (nPbH >> 1); - x &= ~15; - y &= ~15; - if (s->threads_type == FF_THREAD_FRAME) -- ff_thread_await_progress(&ref->tf, y, 0); -+ ff_hevc_progress_wait_mv(s, lc->jb0, ref, y); - x_pu = x >> s->ps.sps->log2_min_pu_size; - y_pu = y >> s->ps.sps->log2_min_pu_size; - temp_col = TAB_MVF(x_pu, y_pu); -@@ -282,16 +280,15 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0, - /* - * 8.5.3.1.2 Derivation process for spatial merging candidates - */ --static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0, -+static void derive_spatial_merge_candidates(const HEVCContext * const s, HEVCLocalContext * const lc, int x0, int y0, - int nPbW, int nPbH, - int log2_cb_size, - int singleMCLFlag, int part_idx, - int merge_idx, - struct MvField mergecandlist[]) - { -- HEVCLocalContext *lc = s->HEVClc; -- RefPicList *refPicList = s->ref->refPicList; -- MvField *tab_mvf = s->ref->tab_mvf; ++ x = x0 + (nPbW >> 1); ++ y = y0 + (nPbH >> 1); ++ x &= ~15; ++ y &= ~15; ++ if (s->threads_type == FF_THREAD_FRAME) ++ 
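++ // Under frame threading the collocated picture may still be decoding, so
++ // block until its motion info for row y has been published before reading
++ // temp_col out of its tab_mvf below.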
ff_hevc_rpi_progress_wait_mv(s, lc->jb0, ref, y); ++ x_pu = x >> s->ps.sps->log2_min_pu_size; ++ y_pu = y >> s->ps.sps->log2_min_pu_size; ++ temp_col = TAB_MVF(x_pu, y_pu); ++ availableFlagLXCol = DERIVE_TEMPORAL_COLOCATED_MVS; ++ } ++ return availableFlagLXCol; ++} ++ ++#define AVAILABLE(cand, v) \ ++ (cand && !(TAB_MVF_PU(v).pred_flag == PF_INTRA)) ++ ++#define PRED_BLOCK_AVAILABLE(v) \ ++ z_scan_block_avail(s, x0, y0, x ## v, y ## v) ++ ++#define COMPARE_MV_REFIDX(a, b) \ ++ compare_mv_ref_idx(TAB_MVF_PU(a), TAB_MVF_PU(b)) ++ ++/* ++ * 8.5.3.1.2 Derivation process for spatial merging candidates ++ */ ++static void derive_spatial_merge_candidates(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, ++ int nPbW, int nPbH, ++ int log2_cb_size, ++ int singleMCLFlag, int part_idx, ++ int merge_idx, ++ struct MvField mergecandlist[]) ++{ + const RefPicList * const refPicList = s->ref->refPicList; + const MvField * const tab_mvf = s->ref->tab_mvf; - - const int min_pu_width = s->ps.sps->min_pu_width; - -@@ -410,10 +407,10 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0, - if (s->sh.slice_temporal_mvp_enabled_flag && - nb_merge_cand < s->sh.max_num_merge_cand) { - Mv mv_l0_col = { 0 }, mv_l1_col = { 0 }; -- int available_l0 = temporal_luma_motion_vector(s, x0, y0, nPbW, nPbH, ++ ++ const int min_pu_width = s->ps.sps->min_pu_width; ++ ++ const int cand_bottom_left = lc->na.cand_bottom_left; ++ const int cand_left = lc->na.cand_left; ++ const int cand_up_left = lc->na.cand_up_left; ++ const int cand_up = lc->na.cand_up; ++ const int cand_up_right = lc->na.cand_up_right_sap; ++ ++ const int xA1 = x0 - 1; ++ const int yA1 = y0 + nPbH - 1; ++ ++ const int xB1 = x0 + nPbW - 1; ++ const int yB1 = y0 - 1; ++ ++ const int xB0 = x0 + nPbW; ++ const int yB0 = y0 - 1; ++ ++ const int xA0 = x0 - 1; ++ const int yA0 = y0 + nPbH; ++ ++ const int xB2 = x0 - 1; ++ const int yB2 = y0 - 1; ++ ++ const int nb_refs = (s->sh.slice_type == HEVC_SLICE_P) ? 
++ s->sh.nb_refs[0] : FFMIN(s->sh.nb_refs[0], s->sh.nb_refs[1]); ++ ++ int zero_idx = 0; ++ ++ int nb_merge_cand = 0; ++ int nb_orig_merge_cand = 0; ++ ++ int is_available_a0; ++ int is_available_a1; ++ int is_available_b0; ++ int is_available_b1; ++ int is_available_b2; ++ ++ ++ if (!singleMCLFlag && part_idx == 1 && ++ (lc->cu.part_mode == PART_Nx2N || ++ lc->cu.part_mode == PART_nLx2N || ++ lc->cu.part_mode == PART_nRx2N) || ++ is_diff_mer(s, xA1, yA1, x0, y0)) { ++ is_available_a1 = 0; ++ } else { ++ is_available_a1 = AVAILABLE(cand_left, A1); ++ if (is_available_a1) { ++ mergecandlist[nb_merge_cand] = TAB_MVF_PU(A1); ++ if (merge_idx == 0) ++ return; ++ nb_merge_cand++; ++ } ++ } ++ ++ if (!singleMCLFlag && part_idx == 1 && ++ (lc->cu.part_mode == PART_2NxN || ++ lc->cu.part_mode == PART_2NxnU || ++ lc->cu.part_mode == PART_2NxnD) || ++ is_diff_mer(s, xB1, yB1, x0, y0)) { ++ is_available_b1 = 0; ++ } else { ++ is_available_b1 = AVAILABLE(cand_up, B1); ++ if (is_available_b1 && ++ !(is_available_a1 && COMPARE_MV_REFIDX(B1, A1))) { ++ mergecandlist[nb_merge_cand] = TAB_MVF_PU(B1); ++ if (merge_idx == nb_merge_cand) ++ return; ++ nb_merge_cand++; ++ } ++ } ++ ++ // above right spatial merge candidate ++ is_available_b0 = AVAILABLE(cand_up_right, B0) && ++ xB0 < s->ps.sps->width && ++ PRED_BLOCK_AVAILABLE(B0) && ++ !is_diff_mer(s, xB0, yB0, x0, y0); ++ ++ if (is_available_b0 && ++ !(is_available_b1 && COMPARE_MV_REFIDX(B0, B1))) { ++ mergecandlist[nb_merge_cand] = TAB_MVF_PU(B0); ++ if (merge_idx == nb_merge_cand) ++ return; ++ nb_merge_cand++; ++ } ++ ++ // left bottom spatial merge candidate ++ is_available_a0 = AVAILABLE(cand_bottom_left, A0) && ++ yA0 < s->ps.sps->height && ++ PRED_BLOCK_AVAILABLE(A0) && ++ !is_diff_mer(s, xA0, yA0, x0, y0); ++ ++ if (is_available_a0 && ++ !(is_available_a1 && COMPARE_MV_REFIDX(A0, A1))) { ++ mergecandlist[nb_merge_cand] = TAB_MVF_PU(A0); ++ if (merge_idx == nb_merge_cand) ++ return; ++ nb_merge_cand++; ++ } ++ ++ // above left spatial merge candidate ++ is_available_b2 = AVAILABLE(cand_up_left, B2) && ++ !is_diff_mer(s, xB2, yB2, x0, y0); ++ ++ if (is_available_b2 && ++ !(is_available_a1 && COMPARE_MV_REFIDX(B2, A1)) && ++ !(is_available_b1 && COMPARE_MV_REFIDX(B2, B1)) && ++ nb_merge_cand != 4) { ++ mergecandlist[nb_merge_cand] = TAB_MVF_PU(B2); ++ if (merge_idx == nb_merge_cand) ++ return; ++ nb_merge_cand++; ++ } ++ ++ // temporal motion vector candidate ++ if (s->sh.slice_temporal_mvp_enabled_flag && ++ nb_merge_cand < s->sh.max_num_merge_cand) { ++ Mv mv_l0_col = { 0 }, mv_l1_col = { 0 }; + int available_l0 = temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH, - 0, &mv_l0_col, 0); - int available_l1 = (s->sh.slice_type == HEVC_SLICE_B) ? -- temporal_luma_motion_vector(s, x0, y0, nPbW, nPbH, ++ 0, &mv_l0_col, 0); ++ int available_l1 = (s->sh.slice_type == HEVC_SLICE_B) ? 
+ temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH, - 0, &mv_l1_col, 1) : 0; - - if (available_l0 || available_l1) { -@@ -476,16 +473,15 @@ static void derive_spatial_merge_candidates(HEVCContext *s, int x0, int y0, - /* - * 8.5.3.1.1 Derivation process of luma Mvs for merge mode - */ --void ff_hevc_luma_mv_merge_mode(HEVCContext *s, int x0, int y0, int nPbW, -+void ff_hevc_luma_mv_merge_mode(const HEVCContext * const s, HEVCLocalContext * const lc, int x0, int y0, int nPbW, - int nPbH, int log2_cb_size, int part_idx, -- int merge_idx, MvField *mv) ++ 0, &mv_l1_col, 1) : 0; ++ ++ if (available_l0 || available_l1) { ++ mergecandlist[nb_merge_cand].pred_flag = available_l0 + (available_l1 << 1); ++ AV_ZERO16(mergecandlist[nb_merge_cand].ref_idx); ++ mergecandlist[nb_merge_cand].mv[0] = mv_l0_col; ++ mergecandlist[nb_merge_cand].mv[1] = mv_l1_col; ++ ++ if (merge_idx == nb_merge_cand) ++ return; ++ nb_merge_cand++; ++ } ++ } ++ ++ nb_orig_merge_cand = nb_merge_cand; ++ ++ // combined bi-predictive merge candidates (applies for B slices) ++ if (s->sh.slice_type == HEVC_SLICE_B && nb_orig_merge_cand > 1 && ++ nb_orig_merge_cand < s->sh.max_num_merge_cand) { ++ int comb_idx = 0; ++ ++ for (comb_idx = 0; nb_merge_cand < s->sh.max_num_merge_cand && ++ comb_idx < nb_orig_merge_cand * (nb_orig_merge_cand - 1); comb_idx++) { ++ int l0_cand_idx = l0_l1_cand_idx[comb_idx][0]; ++ int l1_cand_idx = l0_l1_cand_idx[comb_idx][1]; ++ MvField l0_cand = mergecandlist[l0_cand_idx]; ++ MvField l1_cand = mergecandlist[l1_cand_idx]; ++ ++ if ((l0_cand.pred_flag & PF_L0) && (l1_cand.pred_flag & PF_L1) && ++ (refPicList[0].list[l0_cand.ref_idx[0]] != ++ refPicList[1].list[l1_cand.ref_idx[1]] || ++ AV_RN32A(&l0_cand.mv[0]) != AV_RN32A(&l1_cand.mv[1]))) { ++ mergecandlist[nb_merge_cand].ref_idx[0] = l0_cand.ref_idx[0]; ++ mergecandlist[nb_merge_cand].ref_idx[1] = l1_cand.ref_idx[1]; ++ mergecandlist[nb_merge_cand].pred_flag = PF_BI; ++ AV_COPY32(&mergecandlist[nb_merge_cand].mv[0], &l0_cand.mv[0]); ++ AV_COPY32(&mergecandlist[nb_merge_cand].mv[1], &l1_cand.mv[1]); ++ if (merge_idx == nb_merge_cand) ++ return; ++ nb_merge_cand++; ++ } ++ } ++ } ++ ++ // append Zero motion vector candidates ++ while (nb_merge_cand < s->sh.max_num_merge_cand) { ++ mergecandlist[nb_merge_cand].pred_flag = PF_L0 + ((s->sh.slice_type == HEVC_SLICE_B) << 1); ++ AV_ZERO32(mergecandlist[nb_merge_cand].mv + 0); ++ AV_ZERO32(mergecandlist[nb_merge_cand].mv + 1); ++ mergecandlist[nb_merge_cand].ref_idx[0] = zero_idx < nb_refs ? zero_idx : 0; ++ mergecandlist[nb_merge_cand].ref_idx[1] = zero_idx < nb_refs ? 
zero_idx : 0; ++ ++ if (merge_idx == nb_merge_cand) ++ return; ++ nb_merge_cand++; ++ zero_idx++; ++ } ++} ++ ++/* ++ * 8.5.3.1.1 Derivation process of luma Mvs for merge mode ++ */ ++void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW, ++ int nPbH, int log2_cb_size, int part_idx, + int merge_idx, MvField * const mv) - { - int singleMCLFlag = 0; - int nCS = 1 << log2_cb_size; - LOCAL_ALIGNED(4, MvField, mergecand_list, [MRG_MAX_NUM_CANDS]); - int nPbW2 = nPbW; - int nPbH2 = nPbH; -- HEVCLocalContext *lc = s->HEVClc; - - if (s->ps.pps->log2_parallel_merge_level > 2 && nCS == 8) { - singleMCLFlag = 1; -@@ -496,8 +492,8 @@ void ff_hevc_luma_mv_merge_mode(HEVCContext *s, int x0, int y0, int nPbW, - part_idx = 0; - } - -- ff_hevc_set_neighbour_available(s, x0, y0, nPbW, nPbH); -- derive_spatial_merge_candidates(s, x0, y0, nPbW, nPbH, log2_cb_size, -+ ff_hevc_set_neighbour_available(s, lc, x0, y0, nPbW, nPbH); ++{ ++ int singleMCLFlag = 0; ++ int nCS = 1 << log2_cb_size; ++ LOCAL_ALIGNED(4, MvField, mergecand_list, [MRG_MAX_NUM_CANDS]); ++ int nPbW2 = nPbW; ++ int nPbH2 = nPbH; ++ ++ if (s->ps.pps->log2_parallel_merge_level > 2 && nCS == 8) { ++ singleMCLFlag = 1; ++ x0 = lc->cu.x; ++ y0 = lc->cu.y; ++ nPbW = nCS; ++ nPbH = nCS; ++ part_idx = 0; ++ } ++ ++ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0, nPbW, nPbH); + derive_spatial_merge_candidates(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, - singleMCLFlag, part_idx, - merge_idx, mergecand_list); - -@@ -509,12 +505,12 @@ void ff_hevc_luma_mv_merge_mode(HEVCContext *s, int x0, int y0, int nPbW, - *mv = mergecand_list[merge_idx]; - } - --static av_always_inline void dist_scale(HEVCContext *s, Mv *mv, -+static av_always_inline void dist_scale(const HEVCContext * const s, Mv * const mv, - int min_pu_width, int x, int y, - int elist, int ref_idx_curr, int ref_idx) - { -- RefPicList *refPicList = s->ref->refPicList; -- MvField *tab_mvf = s->ref->tab_mvf; ++ singleMCLFlag, part_idx, ++ merge_idx, mergecand_list); ++ ++ if (mergecand_list[merge_idx].pred_flag == PF_BI && ++ (nPbW2 + nPbH2) == 12) { ++ mergecand_list[merge_idx].pred_flag = PF_L0; ++ } ++ ++ *mv = mergecand_list[merge_idx]; ++} ++ ++static av_always_inline void dist_scale(const HEVCRpiContext * const s, Mv * const mv, ++ int min_pu_width, int x, int y, ++ int elist, int ref_idx_curr, int ref_idx) ++{ + const RefPicList * const refPicList = s->ref->refPicList; + const MvField * const tab_mvf = s->ref->tab_mvf; - int ref_pic_elist = refPicList[elist].list[TAB_MVF(x, y).ref_idx[elist]]; - int ref_pic_curr = refPicList[ref_idx_curr].list[ref_idx]; - -@@ -526,13 +522,13 @@ static av_always_inline void dist_scale(HEVCContext *s, Mv *mv, - } - } - --static int mv_mp_mode_mx(HEVCContext *s, int x, int y, int pred_flag_index, -- Mv *mv, int ref_idx_curr, int ref_idx) -+static int mv_mp_mode_mx(const HEVCContext * const s, const int x, const int y, const int pred_flag_index, ++ int ref_pic_elist = refPicList[elist].list[TAB_MVF(x, y).ref_idx[elist]]; ++ int ref_pic_curr = refPicList[ref_idx_curr].list[ref_idx]; ++ ++ if (ref_pic_elist != ref_pic_curr) { ++ int poc_diff = s->poc - ref_pic_elist; ++ if (!poc_diff) ++ poc_diff = 1; ++ mv_scale(mv, mv, poc_diff, s->poc - ref_pic_curr); ++ } ++} ++ ++static int mv_mp_mode_mx(const HEVCRpiContext * const s, const int x, const int y, const int pred_flag_index, + Mv * const mv, const int ref_idx_curr, const int ref_idx) - { -- MvField *tab_mvf = s->ref->tab_mvf; -- int min_pu_width 
= s->ps.sps->min_pu_width; ++{ + const MvField * const tab_mvf = s->ref->tab_mvf; + const int min_pu_width = s->ps.sps->min_pu_width; - -- RefPicList *refPicList = s->ref->refPicList; ++ + const RefPicList * const refPicList = s->ref->refPicList; - - if (((TAB_MVF(x, y).pred_flag) & (1 << pred_flag_index)) && - refPicList[pred_flag_index].list[TAB_MVF(x, y).ref_idx[pred_flag_index]] == refPicList[ref_idx_curr].list[ref_idx]) { -@@ -542,8 +538,8 @@ static int mv_mp_mode_mx(HEVCContext *s, int x, int y, int pred_flag_index, - return 0; - } - --static int mv_mp_mode_mx_lt(HEVCContext *s, int x, int y, int pred_flag_index, -- Mv *mv, int ref_idx_curr, int ref_idx) -+static int mv_mp_mode_mx_lt(const HEVCContext * const s, const int x, const int y, const int pred_flag_index, ++ ++ if (((TAB_MVF(x, y).pred_flag) & (1 << pred_flag_index)) && ++ refPicList[pred_flag_index].list[TAB_MVF(x, y).ref_idx[pred_flag_index]] == refPicList[ref_idx_curr].list[ref_idx]) { ++ *mv = TAB_MVF(x, y).mv[pred_flag_index]; ++ return 1; ++ } ++ return 0; ++} ++ ++static int mv_mp_mode_mx_lt(const HEVCRpiContext * const s, const int x, const int y, const int pred_flag_index, + Mv * const mv, const int ref_idx_curr, const int ref_idx) - { - MvField *tab_mvf = s->ref->tab_mvf; - int min_pu_width = s->ps.sps->min_pu_width; -@@ -579,13 +575,12 @@ static int mv_mp_mode_mx_lt(HEVCContext *s, int x, int y, int pred_flag_index, - (y ## v) >> s->ps.sps->log2_min_pu_size, \ - pred, &mx, ref_idx_curr, ref_idx) - --void ff_hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, -+void ff_hevc_luma_mv_mvp_mode(const HEVCContext * const s, HEVCLocalContext *lc, int x0, int y0, int nPbW, - int nPbH, int log2_cb_size, int part_idx, -- int merge_idx, MvField *mv, -+ int merge_idx, MvField * const mv, - int mvp_lx_flag, int LX) - { -- HEVCLocalContext *lc = s->HEVClc; -- MvField *tab_mvf = s->ref->tab_mvf; -+ const MvField *tab_mvf = s->ref->tab_mvf; - int isScaledFlag_L0 = 0; - int availableFlagLXA0 = 1; - int availableFlagLXB0 = 1; -@@ -763,7 +758,7 @@ scalef: - if (numMVPCandLX < 2 && s->sh.slice_temporal_mvp_enabled_flag && - mvp_lx_flag == numMVPCandLX) { - Mv mv_col; -- int available_col = temporal_luma_motion_vector(s, x0, y0, nPbW, -+ int available_col = temporal_luma_motion_vector(s, lc, x0, y0, nPbW, - nPbH, ref_idx, - &mv_col, LX); - if (available_col) -diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c -index 902917d4dd..c1140e2a76 100644 ---- a/libavcodec/hevc_ps.c -+++ b/libavcodec/hevc_ps.c -@@ -820,7 +820,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) - switch (sps->bit_depth) { - case 8: - if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY8; -+#if RPI_HEVC_SAND -+ // *** Horrid kludge s.t. we start out with sand format -+ if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? AV_PIX_FMT_SAND128 : AV_PIX_FMT_YUV420P; -+#else - if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P; -+#endif - if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P; - if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P; - break; -@@ -832,7 +837,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) - break; - case 10: - if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY10; -+#if RPI_HEVC_SAND -+ // *** Horrid kludge s.t. we start out with sand format -+ if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? 
AV_PIX_FMT_SAND64_10 : AV_PIX_FMT_YUV420P10; -+#else - if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P10; -+#endif - if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P10; - if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P10; - break; -@@ -1100,7 +1110,6 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, - skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7); - if (sps_extension_flag[0]) { - int extended_precision_processing_flag; -- int high_precision_offsets_enabled_flag; - int cabac_bypass_alignment_enabled_flag; - - sps->transform_skip_rotation_enabled_flag = get_bits1(gb); -@@ -1115,10 +1124,10 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, - "extended_precision_processing_flag not yet implemented\n"); - - sps->intra_smoothing_disabled_flag = get_bits1(gb); -- high_precision_offsets_enabled_flag = get_bits1(gb); -- if (high_precision_offsets_enabled_flag) -+ sps->high_precision_offsets_enabled_flag = get_bits1(gb); -+ if (sps->high_precision_offsets_enabled_flag) - av_log(avctx, AV_LOG_WARNING, -- "high_precision_offsets_enabled_flag not yet implemented\n"); -+ "high_precision_offsets_enabled_flag not fully implemented\n"); - - sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb); - -@@ -1285,6 +1294,7 @@ static void hevc_pps_free(void *opaque, uint8_t *data) - av_freep(&pps->ctb_addr_rs_to_ts); - av_freep(&pps->ctb_addr_ts_to_rs); - av_freep(&pps->tile_pos_rs); -+ av_freep(&pps->tile_size); - av_freep(&pps->tile_id); - av_freep(&pps->min_tb_addr_zs_tab); - -@@ -1369,7 +1379,7 @@ static inline int setup_pps(AVCodecContext *avctx, GetBitContext *gb, - pps->row_bd[i + 1] = pps->row_bd[i] + pps->row_height[i]; - - for (i = 0, j = 0; i < sps->ctb_width; i++) { -- if (i > pps->col_bd[j]) -+ if (i >= pps->col_bd[j + 1]) - j++; - pps->col_idxX[i] = j; - } -@@ -1382,6 +1392,7 @@ static inline int setup_pps(AVCodecContext *avctx, GetBitContext *gb, - pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_rs_to_ts)); - pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_ts_to_rs)); - pps->tile_id = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->tile_id)); -+ pps->tile_size = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->tile_size)); - pps->min_tb_addr_zs_tab = av_malloc_array((sps->tb_mask+2) * (sps->tb_mask+2), sizeof(*pps->min_tb_addr_zs_tab)); - if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs || - !pps->tile_id || !pps->min_tb_addr_zs_tab) { -@@ -1433,8 +1444,12 @@ static inline int setup_pps(AVCodecContext *avctx, GetBitContext *gb, - - for (j = 0; j < pps->num_tile_rows; j++) - for (i = 0; i < pps->num_tile_columns; i++) -+ { -+ pps->tile_size[j * pps->num_tile_columns + i] = -+ pps->column_width[i] * pps->row_height[j]; - pps->tile_pos_rs[j * pps->num_tile_columns + i] = - pps->row_bd[j] * sps->ctb_width + pps->col_bd[i]; -+ } - - log2_diff = sps->log2_ctb_size - sps->log2_min_tb_size; - pps->min_tb_addr_zs = &pps->min_tb_addr_zs_tab[1*(sps->tb_mask+2)+1]; -diff --git a/libavcodec/hevc_ps.h b/libavcodec/hevc_ps.h -index 76f8eb31e6..edf541ce8b 100644 ---- a/libavcodec/hevc_ps.h -+++ b/libavcodec/hevc_ps.h -@@ -289,6 +289,7 @@ typedef struct HEVCSPS { - int implicit_rdpcm_enabled_flag; - int explicit_rdpcm_enabled_flag; - int intra_smoothing_disabled_flag; -+ int high_precision_offsets_enabled_flag; - int persistent_rice_adaptation_enabled_flag; - - ///< coded frame dimension in various units -@@ -384,6 
+385,7 @@ typedef struct HEVCPPS { - int *ctb_addr_ts_to_rs; ///< CtbAddrTSToRS - int *tile_id; ///< TileId - int *tile_pos_rs; ///< TilePosRS -+ int *tile_size; ///< TileSize - int *min_tb_addr_zs; ///< MinTbAddrZS - int *min_tb_addr_zs_tab;///< MinTbAddrZS - -diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c -index ac462d350b..c3890e015c 100644 ---- a/libavcodec/hevc_refs.c -+++ b/libavcodec/hevc_refs.c -@@ -23,7 +23,7 @@ - - #include "libavutil/avassert.h" - #include "libavutil/pixdesc.h" -- -+#include "libavutil/rpi_sand_fns.h" - #include "internal.h" - #include "thread.h" - #include "hevc.h" -@@ -54,13 +54,13 @@ void ff_hevc_unref_frame(HEVCContext *s, HEVCFrame *frame, int flags) - } - } - --RefPicList *ff_hevc_get_ref_list(HEVCContext *s, HEVCFrame *ref, int x0, int y0) -+const RefPicList *ff_hevc_get_ref_list(const HEVCContext * const s, const HEVCFrame * const ref, int x0, int y0) - { - int x_cb = x0 >> s->ps.sps->log2_ctb_size; - int y_cb = y0 >> s->ps.sps->log2_ctb_size; - int pic_width_cb = s->ps.sps->ctb_width; - int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[y_cb * pic_width_cb + x_cb]; -- return (RefPicList *)ref->rpl_tab[ctb_addr_ts]; -+ return (const RefPicList *)ref->rpl_tab[ctb_addr_ts]; - } - - void ff_hevc_clear_refs(HEVCContext *s) -@@ -207,7 +207,6 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush) - - if (nb_output) { - HEVCFrame *frame = &s->DPB[min_idx]; -- - if (frame->frame->format == AV_PIX_FMT_VIDEOTOOLBOX && frame->frame->buf[0]->size == 1) - return 0; - -@@ -218,7 +217,6 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush) - ff_hevc_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT); - if (ret < 0) - return ret; -- - av_log(s->avctx, AV_LOG_DEBUG, - "Output frame with POC %d.\n", frame->poc); - return 1; -@@ -422,8 +420,7 @@ static HEVCFrame *generate_missing_ref(HEVCContext *s, int poc) - frame->sequence = s->seq_decode; - frame->flags = 0; - -- if (s->threads_type == FF_THREAD_FRAME) -- ff_thread_report_progress(&frame->tf, INT_MAX, 0); -+ ff_hevc_progress_set_all_done(frame); - - return frame; - } -diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c -index 2e4add2ae3..bffae105ef 100644 ---- a/libavcodec/hevcdec.c -+++ b/libavcodec/hevcdec.c -@@ -43,8 +43,681 @@ - #include "hevcdec.h" - #include "profiles.h" - -+#ifdef RPI -+ #include "rpi_qpu.h" -+ #include "rpi_shader.h" -+ #include "rpi_shader_cmd.h" -+ #include "rpi_shader_template.h" -+ #include "rpi_zc.h" -+ #include "libavutil/rpi_sand_fns.h" -+ -+ // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory -+ #define RPI_CACHE_UNIF_MVS 1 -+ -+ #include "pthread.h" -+ #include "libavutil/atomic.h" -+#endif ++{ ++ MvField *tab_mvf = s->ref->tab_mvf; ++ int min_pu_width = s->ps.sps->min_pu_width; + -+#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards ++ RefPicList *refPicList = s->ref->refPicList; + -+#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff)) ++ if ((TAB_MVF(x, y).pred_flag) & (1 << pred_flag_index)) { ++ int currIsLongTerm = refPicList[ref_idx_curr].isLongTerm[ref_idx]; + -+#ifndef av_mod_uintp2 -+static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p) -+{ -+ return a & ((1 << p) - 1); ++ int colIsLongTerm = ++ refPicList[pred_flag_index].isLongTerm[(TAB_MVF(x, y).ref_idx[pred_flag_index])]; ++ ++ if (colIsLongTerm == currIsLongTerm) { ++ *mv = TAB_MVF(x, y).mv[pred_flag_index]; ++ if (!currIsLongTerm) ++ dist_scale(s, mv, min_pu_width, x, y, ++ pred_flag_index, ref_idx_curr, 
ref_idx); ++ return 1; ++ } ++ } ++ return 0; +} -+# define av_mod_uintp2 av_mod_uintp2_c -+#endif + - const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; - ++#define MP_MX(v, pred, mx) \ ++ mv_mp_mode_mx(s, \ ++ (x ## v) >> s->ps.sps->log2_min_pu_size, \ ++ (y ## v) >> s->ps.sps->log2_min_pu_size, \ ++ pred, &mx, ref_idx_curr, ref_idx) + -+#if RPI_INTER ++#define MP_MX_LT(v, pred, mx) \ ++ mv_mp_mode_mx_lt(s, \ ++ (x ## v) >> s->ps.sps->log2_min_pu_size, \ ++ (y ## v) >> s->ps.sps->log2_min_pu_size, \ ++ pred, &mx, ref_idx_curr, ref_idx) + -+static void rpi_begin(const HEVCContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first); ++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext *lc, int x0, int y0, int nPbW, ++ int nPbH, int log2_cb_size, int part_idx, ++ int merge_idx, MvField * const mv, ++ int mvp_lx_flag, int LX) ++{ ++ const MvField *tab_mvf = s->ref->tab_mvf; ++ int isScaledFlag_L0 = 0; ++ int availableFlagLXA0 = 1; ++ int availableFlagLXB0 = 1; ++ int numMVPCandLX = 0; ++ int min_pu_width = s->ps.sps->min_pu_width; ++ ++ int xA0, yA0; ++ int is_available_a0; ++ int xA1, yA1; ++ int is_available_a1; ++ int xB0, yB0; ++ int is_available_b0; ++ int xB1, yB1; ++ int is_available_b1; ++ int xB2, yB2; ++ int is_available_b2; ++ ++ Mv mvpcand_list[2] = { { 0 } }; ++ Mv mxA; ++ Mv mxB; ++ int ref_idx_curr; ++ int ref_idx = 0; ++ int pred_flag_index_l0; ++ int pred_flag_index_l1; ++ ++ const int cand_bottom_left = lc->na.cand_bottom_left; ++ const int cand_left = lc->na.cand_left; ++ const int cand_up_left = lc->na.cand_up_left; ++ const int cand_up = lc->na.cand_up; ++ const int cand_up_right = lc->na.cand_up_right_sap; ++ ref_idx_curr = LX; ++ ref_idx = mv->ref_idx[LX]; ++ pred_flag_index_l0 = LX; ++ pred_flag_index_l1 = !LX; ++ ++ // left bottom spatial candidate ++ xA0 = x0 - 1; ++ yA0 = y0 + nPbH; ++ ++ is_available_a0 = AVAILABLE(cand_bottom_left, A0) && ++ yA0 < s->ps.sps->height && ++ PRED_BLOCK_AVAILABLE(A0); ++ ++ //left spatial merge candidate ++ xA1 = x0 - 1; ++ yA1 = y0 + nPbH - 1; ++ ++ is_available_a1 = AVAILABLE(cand_left, A1); ++ if (is_available_a0 || is_available_a1) ++ isScaledFlag_L0 = 1; ++ ++ if (is_available_a0) { ++ if (MP_MX(A0, pred_flag_index_l0, mxA)) { ++ goto b_candidates; ++ } ++ if (MP_MX(A0, pred_flag_index_l1, mxA)) { ++ goto b_candidates; ++ } ++ } + -+#define MC_DUMMY_X (-32) -+#define MC_DUMMY_Y (-32) ++ if (is_available_a1) { ++ if (MP_MX(A1, pred_flag_index_l0, mxA)) { ++ goto b_candidates; ++ } ++ if (MP_MX(A1, pred_flag_index_l1, mxA)) { ++ goto b_candidates; ++ } ++ } + -+// UV & Y both have min 4x4 pred (no 2x2 chroma) -+// Allow for even spread +1 for setup, +1 for rounding -+// As we have load sharing this can (in theory) be exceeded so we have to -+// check after each CTU, but it is a good base size ++ if (is_available_a0) { ++ if (MP_MX_LT(A0, pred_flag_index_l0, mxA)) { ++ goto b_candidates; ++ } ++ if (MP_MX_LT(A0, pred_flag_index_l1, mxA)) { ++ goto b_candidates; ++ } ++ } + -+// Worst case (all 4x4) commands per CTU -+#define QPU_Y_CMD_PER_CTU_MAX (16 * 16) -+#define QPU_C_CMD_PER_CTU_MAX (8 * 8) ++ if (is_available_a1) { ++ if (MP_MX_LT(A1, pred_flag_index_l0, mxA)) { ++ goto b_candidates; ++ } ++ if (MP_MX_LT(A1, pred_flag_index_l1, mxA)) { ++ goto b_candidates; ++ } ++ } ++ availableFlagLXA0 = 0; ++ ++b_candidates: ++ // B candidates ++ // above right spatial merge candidate ++ xB0 = x0 + nPbW; 
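++ // Candidate positions relative to the current PU (see the assignments in
++ // this function): B2 (x0-1, y0-1), B1 (x0+nPbW-1, y0-1) and B0 (x0+nPbW,
++ // y0-1) along the top edge; A1 (x0-1, y0+nPbH-1) and A0 (x0-1, y0+nPbH)
++ // down the left edge.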
++ yB0 = y0 - 1; ++ ++ is_available_b0 = AVAILABLE(cand_up_right, B0) && ++ xB0 < s->ps.sps->width && ++ PRED_BLOCK_AVAILABLE(B0); ++ ++ // above spatial merge candidate ++ xB1 = x0 + nPbW - 1; ++ yB1 = y0 - 1; ++ is_available_b1 = AVAILABLE(cand_up, B1); ++ ++ // above left spatial merge candidate ++ xB2 = x0 - 1; ++ yB2 = y0 - 1; ++ is_available_b2 = AVAILABLE(cand_up_left, B2); ++ ++ // above right spatial merge candidate ++ if (is_available_b0) { ++ if (MP_MX(B0, pred_flag_index_l0, mxB)) { ++ goto scalef; ++ } ++ if (MP_MX(B0, pred_flag_index_l1, mxB)) { ++ goto scalef; ++ } ++ } + -+#define QPU_C_COMMANDS (((RPI_MAX_WIDTH * 64) / (4 * 4)) / 4 + 2 * QPU_N_MAX) -+#define QPU_Y_COMMANDS (((RPI_MAX_WIDTH * 64) / (4 * 4)) + 2 * QPU_N_MAX) ++ // above spatial merge candidate ++ if (is_available_b1) { ++ if (MP_MX(B1, pred_flag_index_l0, mxB)) { ++ goto scalef; ++ } ++ if (MP_MX(B1, pred_flag_index_l1, mxB)) { ++ goto scalef; ++ } ++ } + -+// The QPU code for UV blocks only works up to a block width of 8 -+#define RPI_CHROMA_BLOCK_WIDTH 8 ++ // above left spatial merge candidate ++ if (is_available_b2) { ++ if (MP_MX(B2, pred_flag_index_l0, mxB)) { ++ goto scalef; ++ } ++ if (MP_MX(B2, pred_flag_index_l1, mxB)) { ++ goto scalef; ++ } ++ } ++ availableFlagLXB0 = 0; + -+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) ++scalef: ++ if (!isScaledFlag_L0) { ++ if (availableFlagLXB0) { ++ availableFlagLXA0 = 1; ++ mxA = mxB; ++ } ++ availableFlagLXB0 = 0; + ++ // XB0 and L1 ++ if (is_available_b0) { ++ availableFlagLXB0 = MP_MX_LT(B0, pred_flag_index_l0, mxB); ++ if (!availableFlagLXB0) ++ availableFlagLXB0 = MP_MX_LT(B0, pred_flag_index_l1, mxB); ++ } + -+// Actual filter goes -ve, +ve, +ve, -ve using these values -+static const uint32_t rpi_filter_coefs[8] = { -+ ENCODE_COEFFS( 0, 64, 0, 0), -+ ENCODE_COEFFS( 2, 58, 10, 2), -+ ENCODE_COEFFS( 4, 54, 16, 2), -+ ENCODE_COEFFS( 6, 46, 28, 4), -+ ENCODE_COEFFS( 4, 36, 36, 4), -+ ENCODE_COEFFS( 4, 28, 46, 6), -+ ENCODE_COEFFS( 2, 16, 54, 4), -+ ENCODE_COEFFS( 2, 10, 58, 2) -+}; ++ if (is_available_b1 && !availableFlagLXB0) { ++ availableFlagLXB0 = MP_MX_LT(B1, pred_flag_index_l0, mxB); ++ if (!availableFlagLXB0) ++ availableFlagLXB0 = MP_MX_LT(B1, pred_flag_index_l1, mxB); ++ } + -+// Function arrays by QPU ++ if (is_available_b2 && !availableFlagLXB0) { ++ availableFlagLXB0 = MP_MX_LT(B2, pred_flag_index_l0, mxB); ++ if (!availableFlagLXB0) ++ availableFlagLXB0 = MP_MX_LT(B2, pred_flag_index_l1, mxB); ++ } ++ } + -+static const int * const inter_pred_setup_c_qpu[12] = { -+ mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, -+ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, -+ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn -+}; ++ if (availableFlagLXA0) ++ mvpcand_list[numMVPCandLX++] = mxA; + -+static const int * const inter_pred_setup_c10_qpu[12] = { -+ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, -+ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, -+ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn -+}; ++ if (availableFlagLXB0 && (!availableFlagLXA0 || mxA.x != mxB.x || mxA.y != mxB.y)) ++ mvpcand_list[numMVPCandLX++] = mxB; + -+static const int * const inter_pred_setup_y_qpu[12] = { -+ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, -+ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, -+ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn -+}; ++ //temporal 
motion vector prediction candidate ++ if (numMVPCandLX < 2 && s->sh.slice_temporal_mvp_enabled_flag && ++ mvp_lx_flag == numMVPCandLX) { ++ Mv mv_col; ++ int available_col = temporal_luma_motion_vector(s, lc, x0, y0, nPbW, ++ nPbH, ref_idx, ++ &mv_col, LX); ++ if (available_col) ++ mvpcand_list[numMVPCandLX++] = mv_col; ++ } + -+static const int * const inter_pred_setup_y10_qpu[12] = { -+ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, -+ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, -+ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn -+}; ++ mv->mv[LX] = mvpcand_list[mvp_lx_flag]; ++} +diff --git a/libavcodec/rpi_hevc_parse.c b/libavcodec/rpi_hevc_parse.c +new file mode 100644 +index 0000000000..04f9231acc +--- /dev/null ++++ b/libavcodec/rpi_hevc_parse.c +@@ -0,0 +1,142 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + -+static const int * const inter_pred_sync_qpu[12] = { -+ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3, -+ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7, -+ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11 -+}; ++#include "bytestream.h" ++#include "h2645_parse.h" ++#include "hevc.h" ++#include "rpi_hevc_parse.h" + -+static const int * const inter_pred_sync10_qpu[12] = { -+ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3, -+ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7, -+ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11 -+}; ++static int hevc_decode_nal_units(const uint8_t *buf, int buf_size, HEVCRpiParamSets *ps, ++ HEVCSEIContext *sei, int is_nalff, int nal_length_size, ++ int err_recognition, int apply_defdispwin, void *logctx) ++{ ++ int i; ++ int ret = 0; ++ H2645Packet pkt = { 0 }; + -+static const int * const inter_pred_exit_c_qpu[12] = { -+ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, -+ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, -+ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn -+}; ++ ret = ff_h2645_packet_split(&pkt, buf, buf_size, logctx, is_nalff, nal_length_size, AV_CODEC_ID_HEVC, 1); ++ if (ret < 0) { ++ goto done; ++ } + -+static const int * const inter_pred_exit_c10_qpu[12] = { -+ mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, -+ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, -+ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn -+}; ++ for (i = 0; i < pkt.nb_nals; i++) { ++ H2645NAL *nal = &pkt.nals[i]; + -+static const int * const inter_pred_exit_y_qpu[12] = { -+ mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, -+ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, -+ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn -+}; ++ /* ignore everything except parameter sets and VCL NALUs */ ++ switch (nal->type) { ++ case HEVC_NAL_VPS: ++ ret = 
ff_hevc_rpi_decode_nal_vps(&nal->gb, logctx, ps); ++ if (ret < 0) ++ goto done; ++ break; ++ case HEVC_NAL_SPS: ++ ret = ff_hevc_rpi_decode_nal_sps(&nal->gb, logctx, ps, apply_defdispwin); ++ if (ret < 0) ++ goto done; ++ break; ++ case HEVC_NAL_PPS: ++ ret = ff_hevc_rpi_decode_nal_pps(&nal->gb, logctx, ps); ++ if (ret < 0) ++ goto done; ++ break; ++ case HEVC_NAL_SEI_PREFIX: ++ case HEVC_NAL_SEI_SUFFIX: ++ ret = ff_hevc_rpi_decode_nal_sei(&nal->gb, logctx, sei, ps, nal->type); ++ if (ret < 0) ++ goto done; ++ break; ++ default: ++ av_log(logctx, AV_LOG_VERBOSE, "Ignoring NAL type %d in extradata\n", nal->type); ++ break; ++ } ++ } + -+static const int * const inter_pred_exit_y10_qpu[12] = { -+ mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, -+ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, -+ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn -+}; ++done: ++ ff_h2645_packet_uninit(&pkt); ++ if (err_recognition & AV_EF_EXPLODE) ++ return ret; + -+typedef struct ipe_chan_info_s ++ return 0; ++} ++ ++int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps, ++ HEVCSEIContext *sei, int *is_nalff, int *nal_length_size, ++ int err_recognition, int apply_defdispwin, void *logctx) +{ -+ const uint8_t bit_depth; -+ const uint8_t n; -+ const int * const * setup_fns; -+ const int * const * sync_fns; -+ const int * const * exit_fns; -+} ipe_chan_info_t; ++ int ret = 0; ++ GetByteContext gb; + -+typedef struct ipe_init_info_s -+{ -+ ipe_chan_info_t luma; -+ ipe_chan_info_t chroma; -+} ipe_init_info_t; ++ bytestream2_init(&gb, data, size); + -+static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16 -+ { // 8 -+ .luma = {8, QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu}, -+ .chroma = {8, QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu} -+ }, -+ { // 9 -+ .luma = {0}, -+ .chroma = {0} -+ }, -+ { // 10 -+ .luma = {10, QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu}, -+ .chroma = {10, QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu} -+ } ++ if (size > 3 && (data[0] || data[1] || data[2] > 1)) { ++ /* It seems the extradata is encoded as hvcC format. ++ * Temporarily, we support configurationVersion==0 until 14496-15 3rd ++ * is finalized. When finalized, configurationVersion will be 1 and we ++ * can recognize hvcC by checking if avctx->extradata[0]==1 or not. */ ++ int i, j, num_arrays, nal_len_size; + -+}; ++ *is_nalff = 1; + -+static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici) -+{ -+ const unsigned int n = ici->n; -+ const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3; // Round down to word ++ bytestream2_skip(&gb, 21); ++ nal_len_size = (bytestream2_get_byte(&gb) & 3) + 1; ++ num_arrays = bytestream2_get_byte(&gb); + -+ ipe->n = n; -+ ipe->max_fill = q1_size - ipe->min_gap; -+ for(unsigned int i = 0; i < n; i++) { -+ HEVCRpiInterPredQ * const q = ipe->q + i; -+ q->qpu_mc_curr = q->qpu_mc_base = -+ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size); -+ q->code_setup = qpu_fn(ici->setup_fns[i]); -+ q->code_sync = qpu_fn(ici->sync_fns[i]); -+ q->code_exit = qpu_fn(ici->exit_fns[i]); ++ /* nal units in the hvcC always have length coded with 2 bytes, ++ * so put a fake nal_length_size = 2 while parsing them */ ++ *nal_length_size = 2; ++ ++ /* Decode nal units from hvcC. 
*/
++ for (i = 0; i < num_arrays; i++) {
++ int type = bytestream2_get_byte(&gb) & 0x3f;
++ int cnt = bytestream2_get_be16(&gb);
++
++ for (j = 0; j < cnt; j++) {
++ // +2 for the nal size field
++ int nalsize = bytestream2_peek_be16(&gb) + 2;
++ if (bytestream2_get_bytes_left(&gb) < nalsize) {
++ av_log(logctx, AV_LOG_ERROR,
++ "Invalid NAL unit size in extradata.\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ ret = hevc_decode_nal_units(gb.buffer, nalsize, ps, sei, *is_nalff,
++ *nal_length_size, err_recognition, apply_defdispwin,
++ logctx);
++ if (ret < 0) {
++ av_log(logctx, AV_LOG_ERROR,
++ "Decoding nal unit %d %d from hvcC failed\n",
++ type, i);
++ return ret;
++ }
++ bytestream2_skip(&gb, nalsize);
++ }
++ }
++
++ /* Now store right nal length size, that will be used to parse
++ * all other nals */
++ *nal_length_size = nal_len_size;
++ } else {
++ *is_nalff = 0;
++ ret = hevc_decode_nal_units(data, size, ps, sei, *is_nalff, *nal_length_size,
++ err_recognition, apply_defdispwin, logctx);
++ if (ret < 0)
++ return ret;
+ }
++
++ return ret;
+}
+diff --git a/libavcodec/rpi_hevc_parse.h b/libavcodec/rpi_hevc_parse.h
+new file mode 100644
+index 0000000000..4b4d032a16
+--- /dev/null
++++ b/libavcodec/rpi_hevc_parse.h
+@@ -0,0 +1,36 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
+
-+static void rpi_hevc_qpu_set_fns(HEVCContext * const s, const unsigned int bit_depth)
-+{
-+ av_assert0(bit_depth >= 8 && bit_depth <= 16);
+
++/**
++ * @file
++ * H.265 parser code
++ */
+
-+ rpi_hevc_qpu_init_fn(&s->qpu, bit_depth);
-+}
++#ifndef AVCODEC_RPI_HEVC_PARSE_H
++#define AVCODEC_RPI_HEVC_PARSE_H
+
++#include <stdint.h>
+
-+#endif
++#include "rpi_hevc_ps.h"
++#include "rpi_hevc_sei.h"
+
++int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps,
++ HEVCSEIContext *sei, int *is_nalff, int *nal_length_size,
++ int err_recognition, int apply_defdispwin, void *logctx);
+
-+#ifdef RPI
++#endif /* AVCODEC_RPI_HEVC_PARSE_H */
+diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c
+new file mode 100644
+index 0000000000..f65efa1015
+--- /dev/null
++++ b/libavcodec/rpi_hevc_ps.c
+@@ -0,0 +1,1712 @@
++/*
++ * HEVC Parameter Set decoding
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2012 - 2013 Mickael Raulet
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2013 Vittorio Giovara
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + -+// If we only have one worker then we could just do this -+// But if we have multiple threads then they might start "at the same time" -+// and we need to share out the work carefully -+// -+// returns pq->job_n++ -+static uintptr_t pass_queue_inc_job_n(HEVCRpiPassQueue * const pq, const uintptr_t mod_n) -+{ -+#if 1 -+ // Single thread processing -+ uintptr_t x1; -+ void * const x2 = pq->job_n; -+ if ((x1 = (uintptr_t)x2 + 1) >= mod_n) -+ x1 = 0; -+ pq->job_n = (void *)x1; -+#else -+ void * x0; -+ uintptr_t x1; -+ void * x2 = pq->job_n; -+ do -+ { -+ if ((x1 = (uintptr_t)x2 + 1) >= mod_n) -+ x1 = 0; -+ x0 = x2; -+ } while ((x2 = avpriv_atomic_ptr_cas(&pq->job_n, x0, (void *)x1)) == x0); -+#endif -+ return (uintptr_t)x2; -+} ++#include "libavutil/imgutils.h" ++#include "golomb.h" ++#include "rpi_hevc_data.h" ++#include "rpi_hevc_ps.h" ++#include "rpi_hevcdec.h" ++ ++static const uint8_t default_scaling_list_intra[] = { ++ 16, 16, 16, 16, 17, 18, 21, 24, ++ 16, 16, 16, 16, 17, 19, 22, 25, ++ 16, 16, 17, 18, 20, 22, 25, 29, ++ 16, 16, 18, 21, 24, 27, 31, 36, ++ 17, 17, 20, 24, 30, 35, 41, 47, ++ 18, 19, 22, 27, 35, 44, 54, 65, ++ 21, 22, 25, 31, 41, 54, 70, 88, ++ 24, 25, 29, 36, 47, 65, 88, 115 ++}; + -+static void pass_queue_init(HEVCRpiPassQueue * const pq, HEVCContext * const s, HEVCRpiWorkerFn * const worker, sem_t * const psem_out, const int n) -+{ -+ pq->terminate = 0; -+ pq->job_n = (void *)0; -+ pq->context = s; -+ pq->worker = worker; -+ pq->psem_out = psem_out; -+ pq->pass_n = n; -+ pq->started = 0; -+ sem_init(&pq->sem_in, 0, 0); -+} ++static const uint8_t default_scaling_list_inter[] = { ++ 16, 16, 16, 16, 17, 18, 20, 24, ++ 16, 16, 16, 17, 18, 20, 24, 25, ++ 16, 16, 17, 18, 20, 24, 25, 28, ++ 16, 17, 18, 20, 24, 25, 28, 33, ++ 17, 18, 20, 24, 25, 28, 33, 41, ++ 18, 20, 24, 25, 28, 33, 41, 54, ++ 20, 24, 25, 28, 33, 41, 54, 71, ++ 24, 25, 28, 33, 41, 54, 71, 91 ++}; + -+static void pass_queue_kill(HEVCRpiPassQueue * const pq) -+{ -+ sem_destroy(&pq->sem_in); -+} ++static const AVRational vui_sar[] = { ++ { 0, 1 }, ++ { 1, 1 }, ++ { 12, 11 }, ++ { 10, 11 }, ++ { 16, 11 }, ++ { 40, 33 }, ++ { 24, 11 }, ++ { 20, 11 }, ++ { 32, 11 }, ++ { 80, 33 }, ++ { 18, 11 }, ++ { 15, 11 }, ++ { 64, 33 }, ++ { 160, 99 }, ++ { 4, 3 }, ++ { 3, 2 }, ++ { 2, 1 }, ++}; + -+static inline void rpi_sem_wait(sem_t * const sem) ++static void remove_pps(HEVCRpiParamSets *s, int id) +{ -+ while (sem_wait(sem) != 0) { -+ av_assert0(errno == EINTR); -+ } ++ if (s->pps_list[id] && s->pps == (const HEVCRpiPPS*)s->pps_list[id]->data) ++ s->pps = NULL; ++ av_buffer_unref(&s->pps_list[id]); +} + -+static void pass_queue_submit_job(HEVCRpiPassQueue * const pq) ++static void remove_sps(HEVCRpiParamSets *s, int id) +{ -+ sem_post(&pq->sem_in); -+} ++ int i; ++ if (s->sps_list[id]) { ++ if (s->sps == (const HEVCRpiSPS*)s->sps_list[id]->data) ++ s->sps = NULL; + -+// Unsigned Trivial MOD -+static inline unsigned int utmod(const unsigned int x, const unsigned int n) -+{ -+ return x >= n ? 
x - n : x; -+} ++ /* drop all PPS that depend on this SPS */ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->pps_list); i++) ++ if (s->pps_list[i] && ((HEVCRpiPPS*)s->pps_list[i]->data)->sps_id == id) ++ remove_pps(s, i); + -+static inline void pass_queue_do_all(HEVCContext * const s, HEVCRpiJob * const jb) -+{ -+ // Do the various passes - common with the worker code -+ for (unsigned int i = 0; i != RPI_PASSES; ++i) { -+ s->passq[i].worker(s, jb); ++ av_assert0(!(s->sps_list[id] && s->sps == (HEVCRpiSPS*)s->sps_list[id]->data)); + } ++ av_buffer_unref(&s->sps_list[id]); +} + -+ -+#if 0 -+static void dump_jbc(const HEVCRpiJobCtl *const jbc, const char * const func) ++static void remove_vps(HEVCRpiParamSets *s, int id) +{ -+ int x; -+ sem_getvalue((sem_t *)&jbc->sem_out, &x); -+ printf("%s: jbc: in=%d, out=%d, sum=%d\n", func, jbc->offload_in, jbc->offload_out, x); -+} -+#endif ++ int i; ++ if (s->vps_list[id]) { ++ if (s->vps == (const HEVCRpiVPS*)s->vps_list[id]->data) ++ s->vps = NULL; + ++ for (i = 0; i < FF_ARRAY_ELEMS(s->sps_list); i++) ++ if (s->sps_list[i] && ((HEVCRpiSPS*)s->sps_list[i]->data)->vps_id == id) ++ remove_sps(s, i); ++ } ++ av_buffer_unref(&s->vps_list[id]); ++} + -+static HEVCRpiJob * job_alloc(HEVCRpiJobCtl * const jbc, HEVCLocalContext * const lc) ++int ff_hevc_rpi_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx, ++ ShortTermRPS *rps, const HEVCRpiSPS *sps, int is_slice_header) +{ -+ HEVCRpiJob * jb; -+ HEVCRpiJobGlobal * const jbg = jbc->jbg; ++ uint8_t rps_predict = 0; ++ int delta_poc; ++ int k0 = 0; ++ int k1 = 0; ++ int k = 0; ++ int i; + -+ pthread_mutex_lock(&jbg->lock); -+ // Check local 1st -+ if ((jb = jbc->jb1) != NULL) -+ { -+ // Only 1 - very easy :-) -+ // ?? Can we do this with atomic_exch outside the global lock -+ jbc->jb1 = NULL; -+ } -+ else -+ { -+ // Now look for global free chain -+ if ((jb = jbg->free1) != NULL) -+ { -+ // Found one - unlink it -+ jbg->free1 = jb->next; -+ jb->next = NULL; ++ if (rps != sps->st_rps && sps->nb_st_rps) ++ rps_predict = get_bits1(gb); ++ ++ if (rps_predict) { ++ const ShortTermRPS *rps_ridx; ++ int delta_rps; ++ unsigned abs_delta_rps; ++ uint8_t use_delta_flag = 0; ++ uint8_t delta_rps_sign; ++ ++ if (is_slice_header) { ++ unsigned int delta_idx = get_ue_golomb_long(gb) + 1; ++ if (delta_idx > sps->nb_st_rps) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid value of delta_idx in slice header RPS: %d > %d.\n", ++ delta_idx, sps->nb_st_rps); ++ return AVERROR_INVALIDDATA; ++ } ++ rps_ridx = &sps->st_rps[sps->nb_st_rps - delta_idx]; ++ rps->rps_idx_num_delta_pocs = rps_ridx->num_delta_pocs; ++ } else ++ rps_ridx = &sps->st_rps[rps - sps->st_rps - 1]; ++ ++ delta_rps_sign = get_bits1(gb); ++ abs_delta_rps = get_ue_golomb_long(gb) + 1; ++ if (abs_delta_rps < 1 || abs_delta_rps > 32768) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid value of abs_delta_rps: %d\n", ++ abs_delta_rps); ++ return AVERROR_INVALIDDATA; + } -+ else -+ { -+ // Out of places to look - wait for one to become free - add to Qs ++ delta_rps = (1 - (delta_rps_sign << 1)) * abs_delta_rps; ++ for (i = 0; i <= rps_ridx->num_delta_pocs; i++) { ++ int used = rps->used[k] = get_bits1(gb); + -+ // Global -+ if (jbg->wait_tail == NULL || jbg->wait_tail->last_progress_good || !lc->last_progress_good) -+ { -+ // Add to end as we had to wait last time or wait Q empty -+ if ((lc->jw_prev = jbg->wait_tail) == NULL) -+ jbg->wait_head = lc; ++ if (!used) ++ use_delta_flag = get_bits1(gb); ++ ++ if (used || use_delta_flag) { ++ if (i < rps_ridx->num_delta_pocs) ++ 
delta_poc = delta_rps + rps_ridx->delta_poc[i]; + else -+ lc->jw_prev->jw_next = lc; -+ lc->jw_next = NULL; -+ jbg->wait_tail = lc; ++ delta_poc = delta_rps; ++ rps->delta_poc[k] = delta_poc; ++ if (delta_poc < 0) ++ k0++; ++ else ++ k1++; ++ k++; + } -+ else -+ { -+ // Skip over els which had good progress -+ // We know that the Q isn't empty and there is at least one !last_progess_good el in it -+ HEVCLocalContext *p = jbg->wait_head; ++ } + -+ if (!p->last_progress_good) -+ { -+ jbg->wait_head = lc; -+ lc->jw_prev = NULL; -+ } -+ else -+ { -+ do { -+ p = p->jw_next; -+ } while (p->last_progress_good); ++ if (k >= FF_ARRAY_ELEMS(rps->used)) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid num_delta_pocs: %d\n", k); ++ return AVERROR_INVALIDDATA; ++ } + -+ lc->jw_prev = p->jw_prev; -+ lc->jw_prev->jw_next = lc; ++ rps->num_delta_pocs = k; ++ rps->num_negative_pics = k0; ++ // sort in increasing order (smallest first) ++ if (rps->num_delta_pocs != 0) { ++ int used, tmp; ++ for (i = 1; i < rps->num_delta_pocs; i++) { ++ delta_poc = rps->delta_poc[i]; ++ used = rps->used[i]; ++ for (k = i - 1; k >= 0; k--) { ++ tmp = rps->delta_poc[k]; ++ if (delta_poc < tmp) { ++ rps->delta_poc[k + 1] = tmp; ++ rps->used[k + 1] = rps->used[k]; ++ rps->delta_poc[k] = delta_poc; ++ rps->used[k] = used; ++ } + } -+ -+ p->jw_prev = lc; -+ lc->jw_next = p; + } ++ } ++ if ((rps->num_negative_pics >> 1) != 0) { ++ int used; ++ k = rps->num_negative_pics - 1; ++ // flip the negative values to largest first ++ for (i = 0; i < rps->num_negative_pics >> 1; i++) { ++ delta_poc = rps->delta_poc[i]; ++ used = rps->used[i]; ++ rps->delta_poc[i] = rps->delta_poc[k]; ++ rps->used[i] = rps->used[k]; ++ rps->delta_poc[k] = delta_poc; ++ rps->used[k] = used; ++ k--; ++ } ++ } ++ } else { ++ unsigned int prev, nb_positive_pics; ++ rps->num_negative_pics = get_ue_golomb_long(gb); ++ nb_positive_pics = get_ue_golomb_long(gb); + -+ // Local -+ if ((lc->ljw_prev = jbc->lcw_tail) == NULL) -+ jbc->lcw_head = lc; -+ else -+ lc->ljw_prev->ljw_next = lc; -+ lc->ljw_next = NULL; -+ jbc->lcw_tail = lc; ++ if (rps->num_negative_pics >= HEVC_MAX_REFS || ++ nb_positive_pics >= HEVC_MAX_REFS) { ++ av_log(avctx, AV_LOG_ERROR, "Too many refs in a short term RPS.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ rps->num_delta_pocs = rps->num_negative_pics + nb_positive_pics; ++ if (rps->num_delta_pocs) { ++ prev = 0; ++ for (i = 0; i < rps->num_negative_pics; i++) { ++ delta_poc = get_ue_golomb_long(gb) + 1; ++ if (delta_poc < 1 || delta_poc > 32768) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid value of delta_poc: %d\n", ++ delta_poc); ++ return AVERROR_INVALIDDATA; ++ } ++ prev -= delta_poc; ++ rps->delta_poc[i] = prev; ++ rps->used[i] = get_bits1(gb); ++ } ++ prev = 0; ++ for (i = 0; i < nb_positive_pics; i++) { ++ delta_poc = get_ue_golomb_long(gb) + 1; ++ if (delta_poc < 1 || delta_poc > 32768) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid value of delta_poc: %d\n", ++ delta_poc); ++ return AVERROR_INVALIDDATA; ++ } ++ prev += delta_poc; ++ rps->delta_poc[rps->num_negative_pics + i] = prev; ++ rps->used[rps->num_negative_pics + i] = get_bits1(gb); ++ } + } + } ++ return 0; ++} + -+ pthread_mutex_unlock(&jbg->lock); + -+ if (jb == NULL) // Need to wait -+ { -+ rpi_sem_wait(&lc->jw_sem); -+ jb = lc->jw_job; // Set by free code ++static int decode_profile_tier_level(GetBitContext *gb, AVCodecContext *avctx, ++ PTLCommon *ptl) ++{ ++ int i; ++ ++ if (get_bits_left(gb) < 2+1+5 + 32 + 4 + 16 + 16 + 12) ++ return -1; ++ ++ ptl->profile_space = 
get_bits(gb, 2); ++ ptl->tier_flag = get_bits1(gb); ++ ptl->profile_idc = get_bits(gb, 5); ++ if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN) ++ av_log(avctx, AV_LOG_DEBUG, "Main profile bitstream\n"); ++ else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_10) ++ av_log(avctx, AV_LOG_DEBUG, "Main 10 profile bitstream\n"); ++ else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_STILL_PICTURE) ++ av_log(avctx, AV_LOG_DEBUG, "Main Still Picture profile bitstream\n"); ++ else if (ptl->profile_idc == FF_PROFILE_HEVC_REXT) ++ av_log(avctx, AV_LOG_DEBUG, "Range Extension profile bitstream\n"); ++ else ++ av_log(avctx, AV_LOG_WARNING, "Unknown HEVC profile: %d\n", ptl->profile_idc); ++ ++ for (i = 0; i < 32; i++) { ++ ptl->profile_compatibility_flag[i] = get_bits1(gb); ++ ++ if (ptl->profile_idc == 0 && i > 0 && ptl->profile_compatibility_flag[i]) ++ ptl->profile_idc = i; + } ++ ptl->progressive_source_flag = get_bits1(gb); ++ ptl->interlaced_source_flag = get_bits1(gb); ++ ptl->non_packed_constraint_flag = get_bits1(gb); ++ ptl->frame_only_constraint_flag = get_bits1(gb); + -+ return jb; -+} ++ skip_bits(gb, 16); // XXX_reserved_zero_44bits[0..15] ++ skip_bits(gb, 16); // XXX_reserved_zero_44bits[16..31] ++ skip_bits(gb, 12); // XXX_reserved_zero_44bits[32..43] + ++ return 0; ++} + -+static void job_free(HEVCRpiJobCtl * const jbc0, HEVCRpiJob * const jb) ++static int parse_ptl(GetBitContext *gb, AVCodecContext *avctx, ++ PTL *ptl, int max_num_sub_layers) +{ -+ HEVCRpiJobGlobal * const jbg = jbc0->jbg; // This jbc only used to find jbg so we can get the lock -+ HEVCRpiJobCtl * jbc = jb->jbc_local; -+ HEVCLocalContext * lc = NULL; ++ int i; ++ if (decode_profile_tier_level(gb, avctx, &ptl->general_ptl) < 0 || ++ get_bits_left(gb) < 8 + (8*2 * (max_num_sub_layers - 1 > 0))) { ++ av_log(avctx, AV_LOG_ERROR, "PTL information too short\n"); ++ return -1; ++ } + -+ pthread_mutex_lock(&jbg->lock); ++ ptl->general_ptl.level_idc = get_bits(gb, 8); + -+ if (jbc != NULL) -+ { -+ av_assert1(jbc->jb1 == NULL); ++ for (i = 0; i < max_num_sub_layers - 1; i++) { ++ ptl->sub_layer_profile_present_flag[i] = get_bits1(gb); ++ ptl->sub_layer_level_present_flag[i] = get_bits1(gb); ++ } + -+ // Release to Local if nothing waiting there -+ if ((lc = jbc->lcw_head) == NULL) -+ { -+ jbc->jb1 = jb; ++ if (max_num_sub_layers - 1> 0) ++ for (i = max_num_sub_layers - 1; i < 8; i++) ++ skip_bits(gb, 2); // reserved_zero_2bits[i] ++ for (i = 0; i < max_num_sub_layers - 1; i++) { ++ if (ptl->sub_layer_profile_present_flag[i] && ++ decode_profile_tier_level(gb, avctx, &ptl->sub_layer_ptl[i]) < 0) { ++ av_log(avctx, AV_LOG_ERROR, ++ "PTL information for sublayer %i too short\n", i); ++ return -1; + } -+ } -+ else -+ { -+ // Release to global if nothing waiting there -+ if ((lc = jbg->wait_head) == NULL) -+ { -+ jb->next = jbg->free1; -+ jbg->free1 = jb; ++ if (ptl->sub_layer_level_present_flag[i]) { ++ if (get_bits_left(gb) < 8) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Not enough data for sublayer %i level_idc\n", i); ++ return -1; ++ } else ++ ptl->sub_layer_ptl[i].level_idc = get_bits(gb, 8); + } -+ else -+ { -+ // ? seems somehow mildy ugly... 
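
Aside: the job-pool code in this region hands a freed job straight to a waiting thread. job_free() below unlinks the first waiter from both the global and the per-context wait queues, stores the job in the waiter's slot, and posts its semaphore, rather than returning the job to the free list for waiters to race over. A minimal sketch of that direct-handoff pattern (illustrative names only, not part of the patch, reduced to one free list and one waiter queue):

    #include <pthread.h>
    #include <semaphore.h>
    #include <stddef.h>

    typedef struct job { struct job *next; } job_t;

    typedef struct waiter {
        struct waiter *next;
        sem_t sem;    /* posted by the freeing thread once ->job is set */
        job_t *job;
    } waiter_t;

    static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
    static job_t *free_list;    /* jobs nobody is waiting for        */
    static waiter_t *wait_head; /* threads blocked inside pool_get() */

    static job_t *pool_get(void)
    {
        job_t *jb;
        waiter_t self;

        sem_init(&self.sem, 0, 0);
        pthread_mutex_lock(&pool_lock);
        if ((jb = free_list) != NULL)
            free_list = jb->next;  /* fast path: a free job exists */
        else {
            self.job = NULL;       /* slow path: queue as a waiter */
            self.next = wait_head;
            wait_head = &self;
        }
        pthread_mutex_unlock(&pool_lock);

        if (jb == NULL) {
            while (sem_wait(&self.sem) != 0)
                ;                  /* retry on EINTR, like rpi_sem_wait() */
            jb = self.job;
        }
        sem_destroy(&self.sem);
        return jb;
    }

    static void pool_put(job_t *jb)
    {
        waiter_t *w;

        pthread_mutex_lock(&pool_lock);
        if ((w = wait_head) != NULL)
            wait_head = w->next;   /* hand the job straight to a waiter */
        else {
            jb->next = free_list;  /* nobody waiting: back on free list */
            free_list = jb;
        }
        pthread_mutex_unlock(&pool_lock);

        if (w != NULL) {
            w->job = jb;           /* publish the job, then wake */
            sem_post(&w->sem);
        }
    }

The real queues refine this in one way the sketch omits: job_alloc() above orders the global wait queue by the last_progress_good flag, queueing contexts whose previous job completed without stalling ahead of those that had to wait, so the next free job goes to a thread likely to use it immediately.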
-+ jbc = lc->context->jbc; ++ } ++ ++ return 0; ++} ++ ++static void decode_sublayer_hrd(GetBitContext *gb, unsigned int nb_cpb, ++ int subpic_params_present) ++{ ++ int i; ++ ++ for (i = 0; i < nb_cpb; i++) { ++ get_ue_golomb_long(gb); // bit_rate_value_minus1 ++ get_ue_golomb_long(gb); // cpb_size_value_minus1 ++ ++ if (subpic_params_present) { ++ get_ue_golomb_long(gb); // cpb_size_du_value_minus1 ++ get_ue_golomb_long(gb); // bit_rate_du_value_minus1 + } ++ skip_bits1(gb); // cbr_flag + } ++} + -+ if (lc != NULL) -+ { -+ // Something was waiting ++static int decode_hrd(GetBitContext *gb, int common_inf_present, ++ int max_sublayers) ++{ ++ int nal_params_present = 0, vcl_params_present = 0; ++ int subpic_params_present = 0; ++ int i; + -+ // Unlink -+ // Global -+ if (lc->jw_next == NULL) -+ jbg->wait_tail = lc->jw_prev; -+ else -+ lc->jw_next->jw_prev = lc->jw_prev; ++ if (common_inf_present) { ++ nal_params_present = get_bits1(gb); ++ vcl_params_present = get_bits1(gb); + -+ if (lc->jw_prev == NULL) -+ jbg->wait_head = lc->jw_next; -+ else -+ lc->jw_prev->jw_next = lc->jw_next; ++ if (nal_params_present || vcl_params_present) { ++ subpic_params_present = get_bits1(gb); + -+ // Local -+ if (lc->ljw_next == NULL) -+ jbc->lcw_tail = lc->ljw_prev; -+ else -+ lc->ljw_next->ljw_prev = lc->ljw_prev; ++ if (subpic_params_present) { ++ skip_bits(gb, 8); // tick_divisor_minus2 ++ skip_bits(gb, 5); // du_cpb_removal_delay_increment_length_minus1 ++ skip_bits(gb, 1); // sub_pic_cpb_params_in_pic_timing_sei_flag ++ skip_bits(gb, 5); // dpb_output_delay_du_length_minus1 ++ } + -+ if (lc->ljw_prev == NULL) -+ jbc->lcw_head = lc->ljw_next; -+ else -+ lc->ljw_prev->ljw_next = lc->ljw_next; ++ skip_bits(gb, 4); // bit_rate_scale ++ skip_bits(gb, 4); // cpb_size_scale + -+ // Prod -+ lc->jw_job = jb; -+ sem_post(&lc->jw_sem); ++ if (subpic_params_present) ++ skip_bits(gb, 4); // cpb_size_du_scale ++ ++ skip_bits(gb, 5); // initial_cpb_removal_delay_length_minus1 ++ skip_bits(gb, 5); // au_cpb_removal_delay_length_minus1 ++ skip_bits(gb, 5); // dpb_output_delay_length_minus1 ++ } + } + -+ pthread_mutex_unlock(&jbg->lock); -+} ++ for (i = 0; i < max_sublayers; i++) { ++ int low_delay = 0; ++ unsigned int nb_cpb = 1; ++ int fixed_rate = get_bits1(gb); + -+static void job_lc_kill(HEVCLocalContext * const lc) -+{ -+ sem_destroy(&lc->jw_sem); -+} ++ if (!fixed_rate) ++ fixed_rate = get_bits1(gb); + -+static void job_lc_init(HEVCLocalContext * const lc) -+{ -+ lc->jw_next = NULL; -+ lc->jw_prev = NULL; -+ lc->ljw_next = NULL; -+ lc->ljw_prev = NULL; -+ lc->jw_job = NULL; -+ sem_init(&lc->jw_sem, 0, 0); -+} ++ if (fixed_rate) ++ get_ue_golomb_long(gb); // elemental_duration_in_tc_minus1 ++ else ++ low_delay = get_bits1(gb); + -+static int progress_good(const HEVCContext *const s, const HEVCRpiJob * const jb) -+{ -+ if (jb->waited) -+ return 0; -+ for (unsigned int i = 0; i != FF_ARRAY_ELEMS(jb->progress); ++i) -+ { -+ if (jb->progress[i] >= 0 && s->DPB[i].tf.progress != NULL && -+ ((volatile int *)(s->DPB[i].tf.progress->data))[0] < jb->progress[i]) -+ return 0; ++ if (!low_delay) { ++ nb_cpb = get_ue_golomb_long(gb) + 1; ++ if (nb_cpb < 1 || nb_cpb > 32) { ++ av_log(NULL, AV_LOG_ERROR, "nb_cpb %d invalid\n", nb_cpb); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ ++ if (nal_params_present) ++ decode_sublayer_hrd(gb, nb_cpb, subpic_params_present); ++ if (vcl_params_present) ++ decode_sublayer_hrd(gb, nb_cpb, subpic_params_present); + } -+ return 1; ++ return 0; +} + -+// Submit job if it is full (indicated by 
having ctu_ts_last set >= 0) -+static inline void worker_submit_job(HEVCContext *const s, HEVCLocalContext * const lc) ++int ff_hevc_rpi_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx, ++ HEVCRpiParamSets *ps) +{ -+ HEVCRpiJobCtl *const jbc = s->jbc; -+ HEVCRpiJob * const jb = lc->jb0; ++ int i,j; ++ int vps_id = 0; ++ ptrdiff_t nal_size; ++ HEVCRpiVPS *vps; ++ AVBufferRef *vps_buf = av_buffer_allocz(sizeof(*vps)); + -+ av_assert1(jb != NULL); -+ -+ if (jb->ctu_ts_last < 0) { -+ return; -+ } ++ if (!vps_buf) ++ return AVERROR(ENOMEM); ++ vps = (HEVCRpiVPS*)vps_buf->data; + -+ lc->last_progress_good = progress_good(s, jb); -+ lc->jb0 = NULL; ++ av_log(avctx, AV_LOG_DEBUG, "Decoding VPS\n"); + -+ if (s->offload_recon) -+ { -+ pthread_mutex_lock(&jbc->in_lock); -+ jbc->offloadq[jbc->offload_in] = jb; -+ jbc->offload_in = utmod(jbc->offload_in + 1, RPI_MAX_JOBS); -+ pthread_mutex_unlock(&jbc->in_lock); ++ nal_size = gb->buffer_end - gb->buffer; ++ if (nal_size > sizeof(vps->data)) { ++ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized VPS " ++ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", ++ nal_size, sizeof(vps->data)); ++ vps->data_size = sizeof(vps->data); ++ } else { ++ vps->data_size = nal_size; ++ } ++ memcpy(vps->data, gb->buffer, vps->data_size); + -+ pass_queue_submit_job(s->passq + 0); // Consumes job eventually ++ vps_id = get_bits(gb, 4); ++ if (vps_id >= HEVC_MAX_VPS_COUNT) { ++ av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", vps_id); ++ goto err; + } -+ else -+ { -+ pass_queue_do_all(s, jb); // Consumes job before return ++ ++ if (get_bits(gb, 2) != 3) { // vps_reserved_three_2bits ++ av_log(avctx, AV_LOG_ERROR, "vps_reserved_three_2bits is not three\n"); ++ goto err; + } -+} + ++ vps->vps_max_layers = get_bits(gb, 6) + 1; ++ vps->vps_max_sub_layers = get_bits(gb, 3) + 1; ++ vps->vps_temporal_id_nesting_flag = get_bits1(gb); + -+// Call worker_pass0_ready to wait until the s->pass0_job slot becomes -+// available to receive the next job. -+// -+// Now safe against multiple callers - needed for tiles -+// "normal" and WPP will only call here one at a time -+// -+// Preferably max jobs > bit_threads + passes but this is a minimum for -+// the non-offload logic to work -+#if RPI_MAX_JOBS < RPI_BIT_THREADS -+#error Max Jobs must be >= Bit threads -+#endif -+static inline void worker_pass0_ready(const HEVCContext * const s, HEVCLocalContext * const lc) -+{ -+ HEVCRpiJobCtl * const jbc = s->jbc; ++ if (get_bits(gb, 16) != 0xffff) { // vps_reserved_ffff_16bits ++ av_log(avctx, AV_LOG_ERROR, "vps_reserved_ffff_16bits is not 0xffff\n"); ++ goto err; ++ } + -+ if (lc->jb0 != NULL) { -+ return; ++ if (vps->vps_max_sub_layers > HEVC_MAX_SUB_LAYERS) { ++ av_log(avctx, AV_LOG_ERROR, "vps_max_sub_layers out of range: %d\n", ++ vps->vps_max_sub_layers); ++ goto err; + } + ++ if (parse_ptl(gb, avctx, &vps->ptl, vps->vps_max_sub_layers) < 0) ++ goto err; + -+ if (s->offload_recon) -+ { -+ rpi_sem_wait(&jbc->sem_out); // This sem will stop this frame grabbing too much -+ lc->jb0 = job_alloc(jbc, lc); ++ vps->vps_sub_layer_ordering_info_present_flag = get_bits1(gb); ++ ++ i = vps->vps_sub_layer_ordering_info_present_flag ? 
0 : vps->vps_max_sub_layers - 1; ++ for (; i < vps->vps_max_sub_layers; i++) { ++ vps->vps_max_dec_pic_buffering[i] = get_ue_golomb_long(gb) + 1; ++ vps->vps_num_reorder_pics[i] = get_ue_golomb_long(gb); ++ vps->vps_max_latency_increase[i] = get_ue_golomb_long(gb) - 1; ++ ++ if (vps->vps_max_dec_pic_buffering[i] > HEVC_MAX_DPB_SIZE || !vps->vps_max_dec_pic_buffering[i]) { ++ av_log(avctx, AV_LOG_ERROR, "vps_max_dec_pic_buffering_minus1 out of range: %d\n", ++ vps->vps_max_dec_pic_buffering[i] - 1); ++ goto err; ++ } ++ if (vps->vps_num_reorder_pics[i] > vps->vps_max_dec_pic_buffering[i] - 1) { ++ av_log(avctx, AV_LOG_WARNING, "vps_max_num_reorder_pics out of range: %d\n", ++ vps->vps_num_reorder_pics[i]); ++ if (avctx->err_recognition & AV_EF_EXPLODE) ++ goto err; ++ } + } -+ else -+ { -+ lc->jb0 = job_alloc(jbc, lc); ++ ++ vps->vps_max_layer_id = get_bits(gb, 6); ++ vps->vps_num_layer_sets = get_ue_golomb_long(gb) + 1; ++ if (vps->vps_num_layer_sets < 1 || vps->vps_num_layer_sets > 1024 || ++ (vps->vps_num_layer_sets - 1LL) * (vps->vps_max_layer_id + 1LL) > get_bits_left(gb)) { ++ av_log(avctx, AV_LOG_ERROR, "too many layer_id_included_flags\n"); ++ goto err; + } + -+ rpi_begin(s, lc->jb0, lc->ts); -+} ++ for (i = 1; i < vps->vps_num_layer_sets; i++) ++ for (j = 0; j <= vps->vps_max_layer_id; j++) ++ skip_bits(gb, 1); // layer_id_included_flag[i][j] ++ ++ vps->vps_timing_info_present_flag = get_bits1(gb); ++ if (vps->vps_timing_info_present_flag) { ++ vps->vps_num_units_in_tick = get_bits_long(gb, 32); ++ vps->vps_time_scale = get_bits_long(gb, 32); ++ vps->vps_poc_proportional_to_timing_flag = get_bits1(gb); ++ if (vps->vps_poc_proportional_to_timing_flag) ++ vps->vps_num_ticks_poc_diff_one = get_ue_golomb_long(gb) + 1; ++ vps->vps_num_hrd_parameters = get_ue_golomb_long(gb); ++ if (vps->vps_num_hrd_parameters > (unsigned)vps->vps_num_layer_sets) { ++ av_log(avctx, AV_LOG_ERROR, ++ "vps_num_hrd_parameters %d is invalid\n", vps->vps_num_hrd_parameters); ++ goto err; ++ } ++ for (i = 0; i < vps->vps_num_hrd_parameters; i++) { ++ int common_inf_present = 1; + -+// Free up a job without submission -+static void worker_free(const HEVCContext * const s, HEVCLocalContext * const lc) -+{ -+ HEVCRpiJobCtl * const jbc = s->jbc; -+ HEVCRpiJob * const jb = lc->jb0; ++ get_ue_golomb_long(gb); // hrd_layer_set_idx ++ if (i) ++ common_inf_present = get_bits1(gb); ++ decode_hrd(gb, common_inf_present, vps->vps_max_sub_layers); ++ } ++ } ++ get_bits1(gb); /* vps_extension_flag */ + -+ if (jb == NULL) { -+ return; ++ if (get_bits_left(gb) < 0) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Overread VPS by %d bits\n", -get_bits_left(gb)); ++ if (ps->vps_list[vps_id]) ++ goto err; + } + -+ lc->jb0 = NULL; ++ if (ps->vps_list[vps_id] && ++ !memcmp(ps->vps_list[vps_id]->data, vps_buf->data, vps_buf->size)) { ++ av_buffer_unref(&vps_buf); ++ } else { ++ remove_vps(ps, vps_id); ++ ps->vps_list[vps_id] = vps_buf; ++ } + -+ job_free(jbc, jb); ++ return 0; + -+ // If offload then poke sem_out too -+ if (s->offload_recon) { -+ sem_post(&jbc->sem_out); -+ } ++err: ++ av_buffer_unref(&vps_buf); ++ return AVERROR_INVALIDDATA; +} + -+ -+// Call this to wait for all jobs to have completed at the end of a frame -+// Slightly icky as there is no clean way to wait for a sem to count up -+// Not reentrant - call on main thread only -+static void worker_wait(const HEVCContext * const s, HEVCLocalContext * const lc) ++static void decode_vui(GetBitContext *gb, AVCodecContext *avctx, ++ int apply_defdispwin, HEVCRpiSPS *sps) +{ -+ 
HEVCRpiJobCtl * const jbc = s->jbc; -+ int i = 0; ++ VUI backup_vui, *vui = &sps->vui; ++ GetBitContext backup; ++ int sar_present, alt = 0; + -+ // We shouldn't reach here with an unsubmitted job -+ av_assert1(lc->jb0 == NULL); ++ av_log(avctx, AV_LOG_DEBUG, "Decoding VUI\n"); + -+ // If no offload then there can't be anything to wait for -+ if (!s->offload_recon) { -+ return; ++ sar_present = get_bits1(gb); ++ if (sar_present) { ++ uint8_t sar_idx = get_bits(gb, 8); ++ if (sar_idx < FF_ARRAY_ELEMS(vui_sar)) ++ vui->sar = vui_sar[sar_idx]; ++ else if (sar_idx == 255) { ++ vui->sar.num = get_bits(gb, 16); ++ vui->sar.den = get_bits(gb, 16); ++ } else ++ av_log(avctx, AV_LOG_WARNING, ++ "Unknown SAR index: %u.\n", sar_idx); + } + -+ if (sem_getvalue(&jbc->sem_out, &i) == 0 && i < RPI_MAX_JOBS) -+ { -+ for (i = 0; i != RPI_MAX_JOBS; ++i) { -+ rpi_sem_wait(&jbc->sem_out); -+ } -+ for (i = 0; i != RPI_MAX_JOBS; ++i) { -+ sem_post(&jbc->sem_out); ++ vui->overscan_info_present_flag = get_bits1(gb); ++ if (vui->overscan_info_present_flag) ++ vui->overscan_appropriate_flag = get_bits1(gb); ++ ++ vui->video_signal_type_present_flag = get_bits1(gb); ++ if (vui->video_signal_type_present_flag) { ++ vui->video_format = get_bits(gb, 3); ++ vui->video_full_range_flag = get_bits1(gb); ++ vui->colour_description_present_flag = get_bits1(gb); ++ if (vui->video_full_range_flag && sps->pix_fmt == AV_PIX_FMT_YUV420P) ++ sps->pix_fmt = AV_PIX_FMT_YUVJ420P; ++ if (vui->colour_description_present_flag) { ++ vui->colour_primaries = get_bits(gb, 8); ++ vui->transfer_characteristic = get_bits(gb, 8); ++ vui->matrix_coeffs = get_bits(gb, 8); ++ ++ // Set invalid values to "unspecified" ++ if (!av_color_primaries_name(vui->colour_primaries)) ++ vui->colour_primaries = AVCOL_PRI_UNSPECIFIED; ++ if (!av_color_transfer_name(vui->transfer_characteristic)) ++ vui->transfer_characteristic = AVCOL_TRC_UNSPECIFIED; ++ if (!av_color_space_name(vui->matrix_coeffs)) ++ vui->matrix_coeffs = AVCOL_SPC_UNSPECIFIED; ++ if (vui->matrix_coeffs == AVCOL_SPC_RGB) { ++ switch (sps->pix_fmt) { ++ case AV_PIX_FMT_YUV444P: ++ sps->pix_fmt = AV_PIX_FMT_GBRP; ++ break; ++ case AV_PIX_FMT_YUV444P10: ++ sps->pix_fmt = AV_PIX_FMT_GBRP10; ++ break; ++ case AV_PIX_FMT_YUV444P12: ++ sps->pix_fmt = AV_PIX_FMT_GBRP12; ++ break; ++ } ++ } + } + } + -+ // As the Q is the same length as the number of buffers out = in means all returned -+ // ?????? 
probably not in the new world -+ jbc->offload_out = jbc->offload_in; -+} -+ -+static void * pass_worker(void *arg) -+{ -+ HEVCRpiPassQueue *const pq = (HEVCRpiPassQueue *)arg; -+ HEVCContext *const s = pq->context; ++ vui->chroma_loc_info_present_flag = get_bits1(gb); ++ if (vui->chroma_loc_info_present_flag) { ++ vui->chroma_sample_loc_type_top_field = get_ue_golomb_long(gb); ++ vui->chroma_sample_loc_type_bottom_field = get_ue_golomb_long(gb); ++ } + -+ for (;;) -+ { -+ rpi_sem_wait(&pq->sem_in); ++ vui->neutra_chroma_indication_flag = get_bits1(gb); ++ vui->field_seq_flag = get_bits1(gb); ++ vui->frame_field_info_present_flag = get_bits1(gb); ++ ++ // Backup context in case an alternate header is detected ++ memcpy(&backup, gb, sizeof(backup)); ++ memcpy(&backup_vui, vui, sizeof(backup_vui)); ++ if (get_bits_left(gb) >= 68 && show_bits_long(gb, 21) == 0x100000) { ++ vui->default_display_window_flag = 0; ++ av_log(avctx, AV_LOG_WARNING, "Invalid default display window\n"); ++ } else ++ vui->default_display_window_flag = get_bits1(gb); ++ ++ if (vui->default_display_window_flag) { ++ int vert_mult = 1 + (sps->chroma_format_idc < 2); ++ int horiz_mult = 1 + (sps->chroma_format_idc < 3); ++ vui->def_disp_win.left_offset = get_ue_golomb_long(gb) * horiz_mult; ++ vui->def_disp_win.right_offset = get_ue_golomb_long(gb) * horiz_mult; ++ vui->def_disp_win.top_offset = get_ue_golomb_long(gb) * vert_mult; ++ vui->def_disp_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult; ++ ++ if (apply_defdispwin && ++ avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) { ++ av_log(avctx, AV_LOG_DEBUG, ++ "discarding vui default display window, " ++ "original values are l:%u r:%u t:%u b:%u\n", ++ vui->def_disp_win.left_offset, ++ vui->def_disp_win.right_offset, ++ vui->def_disp_win.top_offset, ++ vui->def_disp_win.bottom_offset); ++ ++ vui->def_disp_win.left_offset = ++ vui->def_disp_win.right_offset = ++ vui->def_disp_win.top_offset = ++ vui->def_disp_win.bottom_offset = 0; ++ } ++ } + -+ if (pq->terminate) -+ break; ++timing_info: ++ vui->vui_timing_info_present_flag = get_bits1(gb); ++ ++ if (vui->vui_timing_info_present_flag) { ++ if( get_bits_left(gb) < 66 && !alt) { ++ // The alternate syntax seem to have timing info located ++ // at where def_disp_win is normally located ++ av_log(avctx, AV_LOG_WARNING, ++ "Strange VUI timing information, retrying...\n"); ++ memcpy(vui, &backup_vui, sizeof(backup_vui)); ++ memcpy(gb, &backup, sizeof(backup)); ++ alt = 1; ++ goto timing_info; ++ } ++ vui->vui_num_units_in_tick = get_bits_long(gb, 32); ++ vui->vui_time_scale = get_bits_long(gb, 32); ++ if (alt) { ++ av_log(avctx, AV_LOG_INFO, "Retry got %"PRIu32"/%"PRIu32" fps\n", ++ vui->vui_time_scale, vui->vui_num_units_in_tick); ++ } ++ vui->vui_poc_proportional_to_timing_flag = get_bits1(gb); ++ if (vui->vui_poc_proportional_to_timing_flag) ++ vui->vui_num_ticks_poc_diff_one_minus1 = get_ue_golomb_long(gb); ++ vui->vui_hrd_parameters_present_flag = get_bits1(gb); ++ if (vui->vui_hrd_parameters_present_flag) ++ decode_hrd(gb, 1, sps->max_sub_layers); ++ } + -+ pq->worker(s, s->jbc->offloadq[pass_queue_inc_job_n(pq, RPI_MAX_JOBS)]); -+ // * should really set jb->passes_done here ++ vui->bitstream_restriction_flag = get_bits1(gb); ++ if (vui->bitstream_restriction_flag) { ++ if (get_bits_left(gb) < 8 && !alt) { ++ av_log(avctx, AV_LOG_WARNING, ++ "Strange VUI bitstream restriction information, retrying" ++ " from timing information...\n"); ++ memcpy(vui, &backup_vui, sizeof(backup_vui)); ++ memcpy(gb, &backup, 
sizeof(backup)); ++ alt = 1; ++ goto timing_info; ++ } ++ vui->tiles_fixed_structure_flag = get_bits1(gb); ++ vui->motion_vectors_over_pic_boundaries_flag = get_bits1(gb); ++ vui->restricted_ref_pic_lists_flag = get_bits1(gb); ++ vui->min_spatial_segmentation_idc = get_ue_golomb_long(gb); ++ vui->max_bytes_per_pic_denom = get_ue_golomb_long(gb); ++ vui->max_bits_per_min_cu_denom = get_ue_golomb_long(gb); ++ vui->log2_max_mv_length_horizontal = get_ue_golomb_long(gb); ++ vui->log2_max_mv_length_vertical = get_ue_golomb_long(gb); ++ } + -+ sem_post(pq->psem_out); ++ if (get_bits_left(gb) < 1 && !alt) { ++ // XXX: Alternate syntax when sps_range_extension_flag != 0? ++ av_log(avctx, AV_LOG_WARNING, ++ "Overread in VUI, retrying from timing information...\n"); ++ memcpy(vui, &backup_vui, sizeof(backup_vui)); ++ memcpy(gb, &backup, sizeof(backup)); ++ alt = 1; ++ goto timing_info; + } -+ return NULL; +} + -+static void pass_queues_start_all(HEVCContext *const s) ++static void set_default_scaling_list_data(ScalingList *sl) +{ -+ unsigned int i; -+ HEVCRpiPassQueue * const pqs = s->passq; ++ int matrixId; + -+ for (i = 0; i != RPI_PASSES; ++i) -+ { -+ av_assert0(pthread_create(&pqs[i].thread, NULL, pass_worker, pqs + i) == 0); -+ pqs[i].started = 1; ++ for (matrixId = 0; matrixId < 6; matrixId++) { ++ // 4x4 default is 16 ++ memset(sl->sl[0][matrixId], 16, 16); ++ sl->sl_dc[0][matrixId] = 16; // default for 16x16 ++ sl->sl_dc[1][matrixId] = 16; // default for 32x32 + } -+} ++ memcpy(sl->sl[1][0], default_scaling_list_intra, 64); ++ memcpy(sl->sl[1][1], default_scaling_list_intra, 64); ++ memcpy(sl->sl[1][2], default_scaling_list_intra, 64); ++ memcpy(sl->sl[1][3], default_scaling_list_inter, 64); ++ memcpy(sl->sl[1][4], default_scaling_list_inter, 64); ++ memcpy(sl->sl[1][5], default_scaling_list_inter, 64); ++ memcpy(sl->sl[2][0], default_scaling_list_intra, 64); ++ memcpy(sl->sl[2][1], default_scaling_list_intra, 64); ++ memcpy(sl->sl[2][2], default_scaling_list_intra, 64); ++ memcpy(sl->sl[2][3], default_scaling_list_inter, 64); ++ memcpy(sl->sl[2][4], default_scaling_list_inter, 64); ++ memcpy(sl->sl[2][5], default_scaling_list_inter, 64); ++ memcpy(sl->sl[3][0], default_scaling_list_intra, 64); ++ memcpy(sl->sl[3][1], default_scaling_list_intra, 64); ++ memcpy(sl->sl[3][2], default_scaling_list_intra, 64); ++ memcpy(sl->sl[3][3], default_scaling_list_inter, 64); ++ memcpy(sl->sl[3][4], default_scaling_list_inter, 64); ++ memcpy(sl->sl[3][5], default_scaling_list_inter, 64); ++} ++ ++static int scaling_list_data(GetBitContext *gb, AVCodecContext *avctx, ScalingList *sl, HEVCRpiSPS *sps) ++{ ++ uint8_t scaling_list_pred_mode_flag; ++ int32_t scaling_list_dc_coef[2][6]; ++ int size_id, matrix_id, pos; ++ int i; + -+static void pass_queues_term_all(HEVCContext *const s) -+{ -+ unsigned int i; -+ HEVCRpiPassQueue * const pqs = s->passq; ++ for (size_id = 0; size_id < 4; size_id++) ++ for (matrix_id = 0; matrix_id < 6; matrix_id += ((size_id == 3) ? 3 : 1)) { ++ scaling_list_pred_mode_flag = get_bits1(gb); ++ if (!scaling_list_pred_mode_flag) { ++ unsigned int delta = get_ue_golomb_long(gb); ++ /* Only need to handle non-zero delta. Zero means default, ++ * which should already be in the arrays. */ ++ if (delta) { ++ // Copy from previous array. ++ delta *= (size_id == 3) ? 
3 : 1; ++ if (matrix_id < delta) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid delta in scaling list data: %d.\n", delta); ++ return AVERROR_INVALIDDATA; ++ } + -+ for (i = 0; i != RPI_PASSES; ++i) -+ pqs[i].terminate = 1; -+ for (i = 0; i != RPI_PASSES; ++i) -+ { -+ if (pqs[i].started) -+ sem_post(&pqs[i].sem_in); -+ } -+ for (i = 0; i != RPI_PASSES; ++i) -+ { -+ if (pqs[i].started) { -+ pthread_join(pqs[i].thread, NULL); -+ pqs[i].started = 0; ++ memcpy(sl->sl[size_id][matrix_id], ++ sl->sl[size_id][matrix_id - delta], ++ size_id > 0 ? 64 : 16); ++ if (size_id > 1) ++ sl->sl_dc[size_id - 2][matrix_id] = sl->sl_dc[size_id - 2][matrix_id - delta]; ++ } ++ } else { ++ int next_coef, coef_num; ++ int32_t scaling_list_delta_coef; ++ ++ next_coef = 8; ++ coef_num = FFMIN(64, 1 << (4 + (size_id << 1))); ++ if (size_id > 1) { ++ scaling_list_dc_coef[size_id - 2][matrix_id] = get_se_golomb(gb) + 8; ++ next_coef = scaling_list_dc_coef[size_id - 2][matrix_id]; ++ sl->sl_dc[size_id - 2][matrix_id] = next_coef; ++ } ++ for (i = 0; i < coef_num; i++) { ++ if (size_id == 0) ++ pos = 4 * ff_hevc_rpi_diag_scan4x4_y[i] + ++ ff_hevc_rpi_diag_scan4x4_x[i]; ++ else ++ pos = 8 * ff_hevc_rpi_diag_scan8x8_y[i] + ++ ff_hevc_rpi_diag_scan8x8_x[i]; ++ ++ scaling_list_delta_coef = get_se_golomb(gb); ++ next_coef = (next_coef + 256U + scaling_list_delta_coef) % 256; ++ sl->sl[size_id][matrix_id][pos] = next_coef; ++ } ++ } + } ++ ++ if (sps->chroma_format_idc == 3) { ++ for (i = 0; i < 64; i++) { ++ sl->sl[3][1][i] = sl->sl[2][1][i]; ++ sl->sl[3][2][i] = sl->sl[2][2][i]; ++ sl->sl[3][4][i] = sl->sl[2][4][i]; ++ sl->sl[3][5][i] = sl->sl[2][5][i]; ++ } ++ sl->sl_dc[1][1] = sl->sl_dc[0][1]; ++ sl->sl_dc[1][2] = sl->sl_dc[0][2]; ++ sl->sl_dc[1][4] = sl->sl_dc[0][4]; ++ sl->sl_dc[1][5] = sl->sl_dc[0][5]; + } -+} + -+static void pass_queues_kill_all(HEVCContext *const s) -+{ -+ unsigned int i; -+ HEVCRpiPassQueue * const pqs = s->passq; + -+ for (i = 0; i != RPI_PASSES; ++i) -+ pass_queue_kill(pqs + i); ++ return 0; +} + -+ -+static void worker_pic_free_one(HEVCRpiJob * const jb) ++static int map_pixel_format(HEVCRpiSPS * const sps) +{ -+ // Free coeff stuff - allocation not the same for all buffers -+ HEVCRpiCoeffsEnv * const cf = &jb->coeffs; ++ const int cfmt = sps->chroma_format_idc; + -+ if (cf->s[0].buf != NULL) -+ av_freep(&cf->mptr); -+ if (cf->s[2].buf != NULL) -+ gpu_free(&cf->gptr); -+ memset(cf, 0, sizeof(*cf)); -+} ++ sps->pix_fmt = AV_PIX_FMT_NONE; ++ switch (sps->bit_depth) { ++ case 8: ++ if (cfmt == 1) ++ sps->pix_fmt = AV_PIX_FMT_SAND128; ++ break; ++ case 10: ++ if (cfmt == 1) ++ sps->pix_fmt = AV_PIX_FMT_SAND64_10; ++ break; ++ default: ++ break; ++ } + -+static int worker_pic_alloc_one(HEVCRpiJob * const jb, const unsigned int coeff_count) -+{ -+ HEVCRpiCoeffsEnv * const cf = &jb->coeffs; ++ sps->hshift[0] = sps->vshift[0] = 0; ++ sps->hshift[2] = sps->hshift[1] = cfmt > 2 ? 0 : 1; // 1 unless 4:4:4 ++ sps->vshift[2] = sps->vshift[1] = cfmt > 1 ? 0 : 1; // 1 unless 4:4:4 or 4:2:2 + -+ if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0) -+ goto fail; -+ cf->s[2].buf = (int16_t *)cf->gptr.arm; -+ cf->s[3].buf = cf->s[2].buf + coeff_count; ++ sps->pixel_shift = sps->bit_depth > 8 ? 
1 : 0; + -+ // Must be 64 byte aligned for our zero zapping code so over-allocate & -+ // round -+ if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL) -+ goto fail; -+ cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63); + return 0; -+ -+fail: -+ av_log(NULL, AV_LOG_ERROR, "%s: Allocation failed\n", __func__); -+ worker_pic_free_one(jb); -+ return -1; +} + -+static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf) ++int ff_hevc_rpi_parse_sps(HEVCRpiSPS *sps, GetBitContext *gb, unsigned int *sps_id, ++ int apply_defdispwin, AVBufferRef **vps_list, AVCodecContext *avctx) +{ -+ unsigned int i; -+ for (i = 0; i != 4; ++i) { -+ cf->s[i].n = 0; -+ } -+} -+#endif ++ HEVCWindow *ow; ++ int ret = 0; ++ int log2_diff_max_min_transform_block_size; ++ int bit_depth_chroma, start, vui_present, sublayer_ordering_info; ++ int i; + ++ // Coded parameters + - /** - * NOTE: Each function hls_foo correspond to the function foo in the - * specification (HLS stands for High Level Syntax). -@@ -57,6 +730,19 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 - /* free everything allocated by pic_arrays_init() */ - static void pic_arrays_free(HEVCContext *s) - { -+#ifdef RPI_DEBLOCK_VPU -+ { -+ int i; -+ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) { -+ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i; ++ sps->vps_id = get_bits(gb, 4); ++ if (sps->vps_id >= HEVC_MAX_VPS_COUNT) { ++ av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", sps->vps_id); ++ return AVERROR_INVALIDDATA; ++ } + -+ if (dvq->vpu_cmds_arm) { -+ gpu_free(&dvq->deblock_vpu_gmem); -+ dvq->vpu_cmds_arm = 0; ++ if (vps_list && !vps_list[sps->vps_id]) { ++ av_log(avctx, AV_LOG_ERROR, "VPS %d does not exist\n", ++ sps->vps_id); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sps->max_sub_layers = get_bits(gb, 3) + 1; ++ if (sps->max_sub_layers > HEVC_MAX_SUB_LAYERS) { ++ av_log(avctx, AV_LOG_ERROR, "sps_max_sub_layers out of range: %d\n", ++ sps->max_sub_layers); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sps->temporal_id_nesting_flag = get_bits(gb, 1); ++ ++ if ((ret = parse_ptl(gb, avctx, &sps->ptl, sps->max_sub_layers)) < 0) ++ return ret; ++ ++ *sps_id = get_ue_golomb_long(gb); ++ if (*sps_id >= HEVC_MAX_SPS_COUNT) { ++ av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", *sps_id); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sps->chroma_format_idc = get_ue_golomb_long(gb); ++ if (sps->chroma_format_idc > 3U) { ++ av_log(avctx, AV_LOG_ERROR, "chroma_format_idc %d is invalid\n", sps->chroma_format_idc); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (sps->chroma_format_idc == 3) ++ sps->separate_colour_plane_flag = get_bits1(gb); ++ ++ if (sps->separate_colour_plane_flag) ++ sps->chroma_format_idc = 0; ++ ++ sps->width = get_ue_golomb_long(gb); ++ sps->height = get_ue_golomb_long(gb); ++ if ((ret = av_image_check_size(sps->width, ++ sps->height, 0, avctx)) < 0) ++ return ret; ++ ++ if (get_bits1(gb)) { // pic_conformance_flag ++ int vert_mult = 1 + (sps->chroma_format_idc < 2); ++ int horiz_mult = 1 + (sps->chroma_format_idc < 3); ++ sps->pic_conf_win.left_offset = get_ue_golomb_long(gb) * horiz_mult; ++ sps->pic_conf_win.right_offset = get_ue_golomb_long(gb) * horiz_mult; ++ sps->pic_conf_win.top_offset = get_ue_golomb_long(gb) * vert_mult; ++ sps->pic_conf_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult; ++ ++ if (avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) { ++ av_log(avctx, AV_LOG_DEBUG, ++ "discarding sps conformance window, " ++ "original values are l:%u r:%u t:%u 
b:%u\n", ++ sps->pic_conf_win.left_offset, ++ sps->pic_conf_win.right_offset, ++ sps->pic_conf_win.top_offset, ++ sps->pic_conf_win.bottom_offset); ++ ++ sps->pic_conf_win.left_offset = ++ sps->pic_conf_win.right_offset = ++ sps->pic_conf_win.top_offset = ++ sps->pic_conf_win.bottom_offset = 0; ++ } ++ sps->output_window = sps->pic_conf_win; ++ } ++ ++ sps->bit_depth = get_ue_golomb_long(gb) + 8; ++ bit_depth_chroma = get_ue_golomb_long(gb) + 8; ++ if (sps->chroma_format_idc && bit_depth_chroma != sps->bit_depth) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Luma bit depth (%d) is different from chroma bit depth (%d), " ++ "this is unsupported.\n", ++ sps->bit_depth, bit_depth_chroma); ++ return AVERROR_INVALIDDATA; ++ } ++ sps->bit_depth_chroma = bit_depth_chroma; ++ ++ ret = map_pixel_format(sps); ++ if (ret < 0) ++ return ret; ++ ++ sps->log2_max_poc_lsb = get_ue_golomb_long(gb) + 4; ++ if (sps->log2_max_poc_lsb > 16) { ++ av_log(avctx, AV_LOG_ERROR, "log2_max_pic_order_cnt_lsb_minus4 out range: %d\n", ++ sps->log2_max_poc_lsb - 4); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sublayer_ordering_info = get_bits1(gb); ++ start = sublayer_ordering_info ? 0 : sps->max_sub_layers - 1; ++ for (i = start; i < sps->max_sub_layers; i++) { ++ sps->temporal_layer[i].max_dec_pic_buffering = get_ue_golomb_long(gb) + 1; ++ sps->temporal_layer[i].num_reorder_pics = get_ue_golomb_long(gb); ++ sps->temporal_layer[i].max_latency_increase = get_ue_golomb_long(gb) - 1; ++ if (sps->temporal_layer[i].max_dec_pic_buffering > (unsigned)HEVC_MAX_DPB_SIZE) { ++ av_log(avctx, AV_LOG_ERROR, "sps_max_dec_pic_buffering_minus1 out of range: %d\n", ++ sps->temporal_layer[i].max_dec_pic_buffering - 1U); ++ return AVERROR_INVALIDDATA; ++ } ++ if (sps->temporal_layer[i].num_reorder_pics > sps->temporal_layer[i].max_dec_pic_buffering - 1) { ++ av_log(avctx, AV_LOG_WARNING, "sps_max_num_reorder_pics out of range: %d\n", ++ sps->temporal_layer[i].num_reorder_pics); ++ if (avctx->err_recognition & AV_EF_EXPLODE || ++ sps->temporal_layer[i].num_reorder_pics > HEVC_MAX_DPB_SIZE - 1) { ++ return AVERROR_INVALIDDATA; + } ++ sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[i].num_reorder_pics + 1; + } + } -+#endif - av_freep(&s->sao); - av_freep(&s->deblock); - -@@ -93,6 +779,64 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) - int ctb_count = sps->ctb_width * sps->ctb_height; - int min_pu_size = sps->min_pu_width * sps->min_pu_height; - -+#ifdef RPI -+ const int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size); -+ const int coefs_per_luma = HEVC_MAX_CTB_SIZE * RPI_MAX_WIDTH; + -+ av_assert0(sps); -+ s->max_ctu_count = coefs_per_luma / coefs_in_ctb; -+#endif -+#ifdef RPI_DEBLOCK_VPU -+ { -+ int i; -+ s->enable_rpi_deblock = !sps->sao_enabled; -+ s->setup_width = (sps->width+15) / 16; -+ s->setup_height = (sps->height+15) / 16; -+ s->uv_setup_width = ( (sps->width >> sps->hshift[1]) + 15) / 16; -+ s->uv_setup_height = ( (sps->height >> sps->vshift[1]) + 15) / 16; ++ if (!sublayer_ordering_info) { ++ for (i = 0; i < start; i++) { ++ sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[start].max_dec_pic_buffering; ++ sps->temporal_layer[i].num_reorder_pics = sps->temporal_layer[start].num_reorder_pics; ++ sps->temporal_layer[i].max_latency_increase = sps->temporal_layer[start].max_latency_increase; ++ } ++ } + -+ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) -+ { -+ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i; -+ const unsigned int cmd_size = 
(sizeof(*dvq->vpu_cmds_arm) * 3 + 15) & ~15;
+ const unsigned int y_size = (sizeof(*dvq->y_setup_arm) * s->setup_width * s->setup_height + 15) & ~15;
+ const unsigned int uv_size = (sizeof(*dvq->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height + 15) & ~15;
+ const unsigned int total_size = cmd_size + y_size + uv_size;
+ int p_vc;
+ uint8_t * p_arm;
+ #if RPI_VPU_DEBLOCK_CACHED
+ gpu_malloc_cached(total_size, &dvq->deblock_vpu_gmem);
+ #else
+ gpu_malloc_uncached(total_size, &dvq->deblock_vpu_gmem);
+ #endif
+ p_vc = dvq->deblock_vpu_gmem.vc;
+ p_arm = dvq->deblock_vpu_gmem.arm;
+
+ // Zap all
+ memset(p_arm, 0, dvq->deblock_vpu_gmem.numbytes);
+
+ // Subdivide
+ dvq->vpu_cmds_arm = (void*)p_arm;
+ dvq->vpu_cmds_vc = p_vc;
+
+ p_arm += cmd_size;
+ p_vc += cmd_size;
+
+ dvq->y_setup_arm = (void*)p_arm;
+ dvq->y_setup_vc = (void*)p_vc;
+
+ p_arm += y_size;
+ p_vc += y_size;
+
+ dvq->uv_setup_arm = (void*)p_arm;
+ dvq->uv_setup_vc = (void*)p_vc;
+ }
+
+ s->dvq_n = 0;
+ s->dvq = s->dvq_ents + s->dvq_n;
+ }
+#endif
+
 s->bs_width = (width >> 2) + 1;
 s->bs_height = (height >> 2) + 1;
 
@@ -139,6 +883,29 @@ fail: 
- return AVERROR(ENOMEM); - } - -+static void default_pred_weight_table(HEVCContext * const s) -+{ -+ unsigned int i; -+ s->sh.luma_log2_weight_denom = 0; -+ s->sh.chroma_log2_weight_denom = 0; -+ for (i = 0; i < s->sh.nb_refs[L0]; i++) { -+ s->sh.luma_weight_l0[i] = 1; -+ s->sh.luma_offset_l0[i] = 0; -+ s->sh.chroma_weight_l0[i][0] = 1; -+ s->sh.chroma_offset_l0[i][0] = 0; -+ s->sh.chroma_weight_l0[i][1] = 1; -+ s->sh.chroma_offset_l0[i][1] = 0; -+ } -+ for (i = 0; i < s->sh.nb_refs[L1]; i++) { -+ s->sh.luma_weight_l1[i] = 1; -+ s->sh.luma_offset_l1[i] = 0; -+ s->sh.chroma_weight_l1[i][0] = 1; -+ s->sh.chroma_offset_l1[i][0] = 0; -+ s->sh.chroma_weight_l1[i][1] = 1; -+ s->sh.chroma_offset_l1[i][1] = 0; -+ } -+} ++ sps->nb_st_rps = get_ue_golomb_long(gb); ++ if (sps->nb_st_rps > HEVC_MAX_SHORT_TERM_RPS_COUNT) { ++ av_log(avctx, AV_LOG_ERROR, "Too many short term RPS: %d.\n", ++ sps->nb_st_rps); ++ return AVERROR_INVALIDDATA; ++ } ++ for (i = 0; i < sps->nb_st_rps; i++) { ++ if ((ret = ff_hevc_rpi_decode_short_term_rps(gb, avctx, &sps->st_rps[i], ++ sps, 0)) < 0) ++ return ret; ++ } + - static int pred_weight_table(HEVCContext *s, GetBitContext *gb) - { - int i = 0; -@@ -357,11 +1124,17 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) - CONFIG_HEVC_VAAPI_HWACCEL + \ - CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \ - CONFIG_HEVC_VDPAU_HWACCEL) -- enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; -+ enum AVPixelFormat pix_fmts[HWACCEL_MAX + 4], *fmt = pix_fmts; - - switch (sps->pix_fmt) { - case AV_PIX_FMT_YUV420P: - case AV_PIX_FMT_YUVJ420P: -+#if RPI_HEVC_SAND -+ // Currently geometry calc is stuffed for big sizes -+ if (sps->width < 2048 && sps->height <= 1088) { -+ *fmt++ = AV_PIX_FMT_SAND128; ++ sps->long_term_ref_pics_present_flag = get_bits1(gb); ++ if (sps->long_term_ref_pics_present_flag) { ++ sps->num_long_term_ref_pics_sps = get_ue_golomb_long(gb); ++ if (sps->num_long_term_ref_pics_sps > 31U) { ++ av_log(avctx, AV_LOG_ERROR, "num_long_term_ref_pics_sps %d is out of range.\n", ++ sps->num_long_term_ref_pics_sps); ++ return AVERROR_INVALIDDATA; + } -+#endif - #if CONFIG_HEVC_DXVA2_HWACCEL - *fmt++ = AV_PIX_FMT_DXVA2_VLD; - #endif -@@ -380,6 +1153,12 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) - #endif - break; - case AV_PIX_FMT_YUV420P10: -+#if RPI_HEVC_SAND -+ // Currently geometry calc is stuffed for big sizes -+ if (sps->width < 2048 && sps->height <= 1088) { -+ *fmt++ = AV_PIX_FMT_SAND64_10; ++ for (i = 0; i < sps->num_long_term_ref_pics_sps; i++) { ++ sps->lt_ref_pic_poc_lsb_sps[i] = get_bits(gb, sps->log2_max_poc_lsb); ++ sps->used_by_curr_pic_lt_sps_flag[i] = get_bits1(gb); + } -+#endif - #if CONFIG_HEVC_DXVA2_HWACCEL - *fmt++ = AV_PIX_FMT_DXVA2_VLD; - #endif -@@ -405,7 +1184,7 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) - static int set_sps(HEVCContext *s, const HEVCSPS *sps, - enum AVPixelFormat pix_fmt) - { -- int ret, i; -+ int ret; - - pic_arrays_free(s); - s->ps.sps = NULL; -@@ -425,26 +1204,36 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, - ff_hevc_pred_init(&s->hpc, sps->bit_depth); - ff_hevc_dsp_init (&s->hevcdsp, sps->bit_depth); - ff_videodsp_init (&s->vdsp, sps->bit_depth); -+#ifdef RPI -+ rpi_hevc_qpu_set_fns(s, sps->bit_depth); -+#endif - -- for (i = 0; i < 3; i++) { -- av_freep(&s->sao_pixel_buffer_h[i]); -- av_freep(&s->sao_pixel_buffer_v[i]); -- } -+ av_freep(&s->sao_pixel_buffer_h[0]); -+ av_freep(&s->sao_pixel_buffer_v[0]); - - if (sps->sao_enabled && 
!s->avctx->hwaccel) {
-- int c_count = (sps->chroma_format_idc != 0) ? 3 : 1;
-- int c_idx;
-+ const unsigned int c_count = (sps->chroma_format_idc != 0) ? 3 : 1;
-+ unsigned int c_idx;
-+ size_t vsize[3] = {0};
-+ size_t hsize[3] = {0};
-
- for(c_idx = 0; c_idx < c_count; c_idx++) {
- int w = sps->width >> sps->hshift[c_idx];
- int h = sps->height >> sps->vshift[c_idx];
-- s->sao_pixel_buffer_h[c_idx] =
-- av_malloc((w * 2 * sps->ctb_height) <<
-- sps->pixel_shift);
-- s->sao_pixel_buffer_v[c_idx] =
-- av_malloc((h * 2 * sps->ctb_width) <<
-- sps->pixel_shift);
-+ // ctb height & width are a min of 8 so this must be a multiple of 16
-+ // so no point rounding up!
-+ hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift;
-+ vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift;
- }
++ }
+
-+ // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2]
-+ // when we have plaited chroma
-+ s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]);
-+ s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]);
-+ s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0];
-+ s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1];
-+ s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0];
-+ s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1];
- }
-
- s->ps.sps = sps;
-@@ -719,6 +1508,11 @@ static int hls_slice_header(HEVCContext *s)
- if (ret < 0)
- return ret;
- }
-+ else
-+ {
-+ // Give us unit weights
-+ default_pred_weight_table(s);
-+ }
-
- sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
- if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
-@@ -867,55 +1661,52 @@
- return AVERROR_INVALIDDATA;
- }
-
-- s->HEVClc->first_qp_group = !s->sh.dependent_slice_segment_flag;
-+ // ??? overridden by get neighbour ??? 
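/* Layout note (an illustration, not part of the patch): the SAO pixel
 * buffers above become one av_malloc() lump per direction,
 *     h[0] | h[1] | h[2]    and    v[0] | v[1] | v[2]
 * with h[1]/v[1] deliberately allowed to run on into h[2]/v[2], so that
 * interleaved ("plaited") U/V chroma can be written through a single
 * pointer instead of two separate per-plane allocations. */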
-+// s->HEVClc->first_qp_group = !s->sh.dependent_slice_segment_flag; -+// if (!s->sh.dependent_slice_segment_flag) { -+// s->HEVClc->qPy_pred = s->sh.slice_qp; -+// } - -- if (!s->ps.pps->cu_qp_delta_enabled_flag) -- s->HEVClc->qp_y = s->sh.slice_qp; -+// if (!s->ps.pps->cu_qp_delta_enabled_flag) -+// s->HEVClc->qp_y = s->sh.slice_qp; - - s->slice_initialized = 1; -- s->HEVClc->tu.cu_qp_offset_cb = 0; -- s->HEVClc->tu.cu_qp_offset_cr = 0; -+// s->HEVClc->tu.cu_qp_offset_cb = 0; -+// s->HEVClc->tu.cu_qp_offset_cr = 0; - - return 0; - } - --#define CTB(tab, x, y) ((tab)[(y) * s->ps.sps->ctb_width + (x)]) -- --#define SET_SAO(elem, value) \ --do { \ -- if (!sao_merge_up_flag && !sao_merge_left_flag) \ -- sao->elem = value; \ -- else if (sao_merge_left_flag) \ -- sao->elem = CTB(s->sao, rx-1, ry).elem; \ -- else if (sao_merge_up_flag) \ -- sao->elem = CTB(s->sao, rx, ry-1).elem; \ -- else \ -- sao->elem = 0; \ --} while (0) -- --static void hls_sao_param(HEVCContext *s, int rx, int ry) -+static void hls_sao_param(const HEVCContext *s, HEVCLocalContext * const lc, const int rx, const int ry) - { -- HEVCLocalContext *lc = s->HEVClc; -- int sao_merge_left_flag = 0; -- int sao_merge_up_flag = 0; -- SAOParams *sao = &CTB(s->sao, rx, ry); -+ SAOParams * const sao = s->sao + rx + ry * s->ps.sps->ctb_width; - int c_idx, i; - - if (s->sh.slice_sample_adaptive_offset_flag[0] || - s->sh.slice_sample_adaptive_offset_flag[1]) { -- if (rx > 0) { -- if (lc->ctb_left_flag) -- sao_merge_left_flag = ff_hevc_sao_merge_flag_decode(s); -+ if (lc->ctb_left_flag) -+ { -+ const int sao_merge_left_flag = ff_hevc_sao_merge_flag_decode(lc); -+ if (sao_merge_left_flag) { -+ *sao = sao[-1]; -+ return; -+ } - } -- if (ry > 0 && !sao_merge_left_flag) { -- if (lc->ctb_up_flag) -- sao_merge_up_flag = ff_hevc_sao_merge_flag_decode(s); -+ if (lc->ctb_up_flag) -+ { -+ const int sao_merge_up_flag = ff_hevc_sao_merge_flag_decode(lc); -+ if (sao_merge_up_flag) { -+ *sao = sao[-(int)s->ps.sps->ctb_width]; -+ return; -+ } - } - } - - for (c_idx = 0; c_idx < (s->ps.sps->chroma_format_idc ? 3 : 1); c_idx++) { -- int log2_sao_offset_scale = c_idx == 0 ? s->ps.pps->log2_sao_offset_scale_luma : -+ const unsigned int log2_sao_offset_scale = c_idx == 0 ? 
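/* Note (an observation, not part of the patch): the hls_sao_param()
 * rewrite above drops the SET_SAO()/CTB() macros; a left- or up-merge now
 * copies the whole SAOParams in one struct assignment (sao[-1] for the
 * CTB to the left, sao[-(int)ctb_width] for the row above) and returns
 * early, which is equivalent to merging every field individually. */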
s->ps.pps->log2_sao_offset_scale_luma : - s->ps.pps->log2_sao_offset_scale_chroma; -+ int offset_abs[4]; -+ char offset_sign[4] = {0}; - - if (!s->sh.slice_sample_adaptive_offset_flag[c_idx]) { - sao->type_idx[c_idx] = SAO_NOT_APPLIED; -@@ -926,53 +1717,47 @@ static void hls_sao_param(HEVCContext *s, int rx, int ry) - sao->type_idx[2] = sao->type_idx[1]; - sao->eo_class[2] = sao->eo_class[1]; - } else { -- SET_SAO(type_idx[c_idx], ff_hevc_sao_type_idx_decode(s)); -+ sao->type_idx[c_idx] = ff_hevc_sao_type_idx_decode(lc); - } - -+ // *** Could use BY22 here quite plausibly - this is all bypass stuff ++ sps->sps_temporal_mvp_enabled_flag = get_bits1(gb); ++ sps->sps_strong_intra_smoothing_enable_flag = get_bits1(gb); ++ sps->vui.sar = (AVRational){0, 1}; ++ vui_present = get_bits1(gb); ++ if (vui_present) ++ decode_vui(gb, avctx, apply_defdispwin, sps); ++ ++ if (get_bits1(gb)) { // sps_extension_flag ++ int sps_extension_flag[1]; ++ for (i = 0; i < 1; i++) ++ sps_extension_flag[i] = get_bits1(gb); ++ skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7); ++ if (sps_extension_flag[0]) { ++ int extended_precision_processing_flag; ++ int cabac_bypass_alignment_enabled_flag; ++ ++ sps->transform_skip_rotation_enabled_flag = get_bits1(gb); ++ sps->transform_skip_context_enabled_flag = get_bits1(gb); ++ sps->implicit_rdpcm_enabled_flag = get_bits1(gb); ++ ++ sps->explicit_rdpcm_enabled_flag = get_bits1(gb); ++ ++ extended_precision_processing_flag = get_bits1(gb); ++ if (extended_precision_processing_flag) ++ av_log(avctx, AV_LOG_WARNING, ++ "extended_precision_processing_flag not yet implemented\n"); ++ ++ sps->intra_smoothing_disabled_flag = get_bits1(gb); ++ sps->high_precision_offsets_enabled_flag = get_bits1(gb); ++ if (sps->high_precision_offsets_enabled_flag) ++ av_log(avctx, AV_LOG_WARNING, ++ "high_precision_offsets_enabled_flag not fully implemented\n"); + - if (sao->type_idx[c_idx] == SAO_NOT_APPLIED) - continue; - - for (i = 0; i < 4; i++) -- SET_SAO(offset_abs[c_idx][i], ff_hevc_sao_offset_abs_decode(s)); -+ offset_abs[i] = ff_hevc_sao_offset_abs_decode(s, lc); - - if (sao->type_idx[c_idx] == SAO_BAND) { - for (i = 0; i < 4; i++) { -- if (sao->offset_abs[c_idx][i]) { -- SET_SAO(offset_sign[c_idx][i], -- ff_hevc_sao_offset_sign_decode(s)); -- } else { -- sao->offset_sign[c_idx][i] = 0; -- } -+ if (offset_abs[i] != 0) -+ offset_sign[i] = ff_hevc_sao_offset_sign_decode(lc); - } -- SET_SAO(band_position[c_idx], ff_hevc_sao_band_position_decode(s)); -+ sao->band_position[c_idx] = ff_hevc_sao_band_position_decode(lc); - } else if (c_idx != 2) { -- SET_SAO(eo_class[c_idx], ff_hevc_sao_eo_class_decode(s)); -+ sao->eo_class[c_idx] = ff_hevc_sao_eo_class_decode(lc); - } - - // Inferred parameters - sao->offset_val[c_idx][0] = 0; - for (i = 0; i < 4; i++) { -- sao->offset_val[c_idx][i + 1] = sao->offset_abs[c_idx][i]; -+ sao->offset_val[c_idx][i + 1] = offset_abs[i] << log2_sao_offset_scale; - if (sao->type_idx[c_idx] == SAO_EDGE) { - if (i > 1) - sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1]; -- } else if (sao->offset_sign[c_idx][i]) { -+ } else if (offset_sign[i]) { - sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1]; - } -- sao->offset_val[c_idx][i + 1] *= 1 << log2_sao_offset_scale; - } - } - } - --#undef SET_SAO --#undef CTB - --static int hls_cross_component_pred(HEVCContext *s, int idx) { -- HEVCLocalContext *lc = s->HEVClc; -- int log2_res_scale_abs_plus1 = ff_hevc_log2_res_scale_abs(s, idx); -+static int hls_cross_component_pred(HEVCLocalContext * 
const lc, const int idx) { -+ int log2_res_scale_abs_plus1 = ff_hevc_log2_res_scale_abs(lc, idx); - - if (log2_res_scale_abs_plus1 != 0) { -- int res_scale_sign_flag = ff_hevc_res_scale_sign_flag(s, idx); -+ int res_scale_sign_flag = ff_hevc_res_scale_sign_flag(lc, idx); - lc->tu.res_scale_val = (1 << (log2_res_scale_abs_plus1 - 1)) * - (1 - 2 * res_scale_sign_flag); - } else { -@@ -983,20 +1768,54 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { - return 0; - } - --static int hls_transform_unit(HEVCContext *s, int x0, int y0, -+#ifdef RPI -+static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCRpiJob * const jb) -+{ -+ return jb->intra.cmds + jb->intra.n++; -+} ++ sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb); + -+static void rpi_intra_pred(const HEVCContext * const s, HEVCLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx) -+{ -+ // U & V done on U call in the case of sliced frames -+ if (av_rpi_is_sand_frame(s->frame) && c_idx > 1) -+ return; ++ cabac_bypass_alignment_enabled_flag = get_bits1(gb); ++ if (cabac_bypass_alignment_enabled_flag) ++ av_log(avctx, AV_LOG_WARNING, ++ "cabac_bypass_alignment_enabled_flag not yet implemented\n"); ++ } ++ } ++ if (apply_defdispwin) { ++ sps->output_window.left_offset += sps->vui.def_disp_win.left_offset; ++ sps->output_window.right_offset += sps->vui.def_disp_win.right_offset; ++ sps->output_window.top_offset += sps->vui.def_disp_win.top_offset; ++ sps->output_window.bottom_offset += sps->vui.def_disp_win.bottom_offset; ++ } + -+ if (s->enable_rpi) { -+ HEVCPredCmd *cmd = rpi_new_intra_cmd(lc->jb0); -+ cmd->type = RPI_PRED_INTRA; -+ cmd->size = log2_trafo_size; -+ cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right; -+ cmd->c_idx = c_idx; -+ cmd->i_pred.x = x0; -+ cmd->i_pred.y = y0; -+ cmd->i_pred.mode = c_idx ? 
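/* Worked example (an illustration, not part of the patch): cmd->na above
 * packs the five neighbour-availability flags into one byte,
 *     bit 4 = bottom-left, 3 = left, 2 = up-left, 1 = up, 0 = up-right,
 * so all neighbours available gives na = 0x1f, and only left + up gives
 * (1 << 3) | (1 << 1) = 0x0a. */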
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; ++ ow = &sps->output_window; ++ if (ow->left_offset >= INT_MAX - ow->right_offset || ++ ow->top_offset >= INT_MAX - ow->bottom_offset || ++ ow->left_offset + ow->right_offset >= sps->width || ++ ow->top_offset + ow->bottom_offset >= sps->height) { ++ av_log(avctx, AV_LOG_WARNING, "Invalid cropping offsets: %u/%u/%u/%u\n", ++ ow->left_offset, ow->right_offset, ow->top_offset, ow->bottom_offset); ++ if (avctx->err_recognition & AV_EF_EXPLODE) { ++ return AVERROR_INVALIDDATA; ++ } ++ av_log(avctx, AV_LOG_WARNING, ++ "Displaying the whole video surface.\n"); ++ memset(ow, 0, sizeof(*ow)); ++ memset(&sps->pic_conf_win, 0, sizeof(sps->pic_conf_win)); + } -+ else if (av_rpi_is_sand_frame(s->frame) && c_idx != 0) { -+ s->hpc.intra_pred_c[log2_trafo_size - 2](s, lc, x0, y0, c_idx); ++ ++ // Inferred parameters ++ sps->log2_ctb_size = sps->log2_min_cb_size + ++ sps->log2_diff_max_min_coding_block_size; ++ sps->log2_min_pu_size = sps->log2_min_cb_size - 1; ++ ++ if (sps->log2_ctb_size > HEVC_MAX_LOG2_CTB_SIZE) { ++ av_log(avctx, AV_LOG_ERROR, "CTB size out of range: 2^%d\n", sps->log2_ctb_size); ++ return AVERROR_INVALIDDATA; + } -+ else { -+ s->hpc.intra_pred[log2_trafo_size - 2](s, lc, x0, y0, c_idx); ++ if (sps->log2_ctb_size < 4) { ++ av_log(avctx, ++ AV_LOG_ERROR, ++ "log2_ctb_size %d differs from the bounds of any known profile\n", ++ sps->log2_ctb_size); ++ avpriv_request_sample(avctx, "log2_ctb_size %d", sps->log2_ctb_size); ++ return AVERROR_INVALIDDATA; + } + -+} -+#endif ++ sps->ctb_width = (sps->width + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size; ++ sps->ctb_height = (sps->height + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size; ++ sps->ctb_size = sps->ctb_width * sps->ctb_height; + -+static int hls_transform_unit(const HEVCContext * const s, HEVCLocalContext * const lc, int x0, int y0, - int xBase, int yBase, int cb_xBase, int cb_yBase, - int log2_cb_size, int log2_trafo_size, - int blk_idx, int cbf_luma, int *cbf_cb, int *cbf_cr) - { -- HEVCLocalContext *lc = s->HEVClc; - const int log2_trafo_size_c = log2_trafo_size - s->ps.sps->hshift[1]; - int i; - - if (lc->cu.pred_mode == MODE_INTRA) { - int trafo_size = 1 << log2_trafo_size; -- ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size); -- -- s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0); -+ ff_hevc_set_neighbour_available(s, lc, x0, y0, trafo_size, trafo_size); -+#ifdef RPI -+ rpi_intra_pred(s, lc, log2_trafo_size, x0, y0, 0); -+#else -+ s->hpc.intra_pred[log2_trafo_size - 2](s, lc, x0, y0, 0); -+#endif - } - - if (cbf_luma || cbf_cb[0] || cbf_cr[0] || -@@ -1008,9 +1827,9 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, - (cbf_cb[1] || cbf_cr[1])); - - if (s->ps.pps->cu_qp_delta_enabled_flag && !lc->tu.is_cu_qp_delta_coded) { -- lc->tu.cu_qp_delta = ff_hevc_cu_qp_delta_abs(s); -+ lc->tu.cu_qp_delta = ff_hevc_cu_qp_delta_abs(lc); - if (lc->tu.cu_qp_delta != 0) -- if (ff_hevc_cu_qp_delta_sign_flag(s) == 1) -+ if (ff_hevc_cu_qp_delta_sign_flag(lc) == 1) - lc->tu.cu_qp_delta = -lc->tu.cu_qp_delta; - lc->tu.is_cu_qp_delta_coded = 1; - -@@ -1025,24 +1844,24 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, - return AVERROR_INVALIDDATA; - } - -- ff_hevc_set_qPy(s, cb_xBase, cb_yBase, log2_cb_size); -+ ff_hevc_set_qPy(s, lc, cb_xBase, cb_yBase, log2_cb_size); - } - -- if (s->sh.cu_chroma_qp_offset_enabled_flag && cbf_chroma && -- !lc->cu.cu_transquant_bypass_flag && !lc->tu.is_cu_chroma_qp_offset_coded) { -- int 
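/* Worked example (an illustration, not part of the patch): the
 * inferred-parameter block in rpi_hevc_ps.c above rounds the picture
 * size up to whole CTBs,
 *     ctb_width = (width + (1 << log2_ctb_size) - 1) >> log2_ctb_size,
 * so 1920x1080 with 64x64 CTBs gives ctb_width = 30,
 * ctb_height = (1080 + 63) >> 6 = 17, and ctb_size = 30 * 17 = 510
 * CTBs per picture. */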
cu_chroma_qp_offset_flag = ff_hevc_cu_chroma_qp_offset_flag(s); -+ if (!lc->tu.is_cu_chroma_qp_offset_coded && cbf_chroma && -+ !lc->cu.cu_transquant_bypass_flag) { -+ int cu_chroma_qp_offset_flag = ff_hevc_cu_chroma_qp_offset_flag(lc); - if (cu_chroma_qp_offset_flag) { - int cu_chroma_qp_offset_idx = 0; - if (s->ps.pps->chroma_qp_offset_list_len_minus1 > 0) { -- cu_chroma_qp_offset_idx = ff_hevc_cu_chroma_qp_offset_idx(s); -+ cu_chroma_qp_offset_idx = ff_hevc_cu_chroma_qp_offset_idx(s, lc); - av_log(s->avctx, AV_LOG_ERROR, - "cu_chroma_qp_offset_idx not yet tested.\n"); - } - lc->tu.cu_qp_offset_cb = s->ps.pps->cb_qp_offset_list[cu_chroma_qp_offset_idx]; - lc->tu.cu_qp_offset_cr = s->ps.pps->cr_qp_offset_list[cu_chroma_qp_offset_idx]; - } else { -- lc->tu.cu_qp_offset_cb = 0; -- lc->tu.cu_qp_offset_cr = 0; -+// lc->tu.cu_qp_offset_cb = 0; -+// lc->tu.cu_qp_offset_cr = 0; - } - lc->tu.is_cu_chroma_qp_offset_coded = 1; - } -@@ -1068,7 +1887,7 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, - lc->tu.cross_pf = 0; - - if (cbf_luma) -- ff_hevc_hls_residual_coding(s, x0, y0, log2_trafo_size, scan_idx, 0); -+ ff_hevc_hls_residual_coding(s, lc, x0, y0, log2_trafo_size, scan_idx, 0); - if (s->ps.sps->chroma_format_idc && (log2_trafo_size > 2 || s->ps.sps->chroma_format_idc == 3)) { - int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]); - int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]); -@@ -1077,15 +1896,19 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, - (lc->tu.chroma_mode_c == 4))); - - if (lc->tu.cross_pf) { -- hls_cross_component_pred(s, 0); -+ hls_cross_component_pred(lc, 0); - } - for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { - if (lc->cu.pred_mode == MODE_INTRA) { -- ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); -- s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1); -+ ff_hevc_set_neighbour_available(s, lc, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); -+#ifdef RPI -+ rpi_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1); -+#else -+ s->hpc.intra_pred[log2_trafo_size_c - 2](s, lc, x0, y0 + (i << log2_trafo_size_c), 1); -+#endif - } - if (cbf_cb[i]) -- ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), -+ ff_hevc_hls_residual_coding(s, lc, x0, y0 + (i << log2_trafo_size_c), - log2_trafo_size_c, scan_idx_c, 1); - else - if (lc->tu.cross_pf) { -@@ -1106,15 +1929,19 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, - } - - if (lc->tu.cross_pf) { -- hls_cross_component_pred(s, 1); -+ hls_cross_component_pred(lc, 1); - } - for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 
2 : 1); i++) { - if (lc->cu.pred_mode == MODE_INTRA) { -- ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); -- s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2); -+ ff_hevc_set_neighbour_available(s, lc, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); -+#ifdef RPI -+ rpi_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2); -+#else -+ s->hpc.intra_pred[log2_trafo_size_c - 2](s, lc, x0, y0 + (i << log2_trafo_size_c), 2); -+#endif - } - if (cbf_cr[i]) -- ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), -+ ff_hevc_hls_residual_coding(s, lc, x0, y0 + (i << log2_trafo_size_c), - log2_trafo_size_c, scan_idx_c, 2); - else - if (lc->tu.cross_pf) { -@@ -1138,22 +1965,30 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, - int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]); - for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { - if (lc->cu.pred_mode == MODE_INTRA) { -- ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), -+ ff_hevc_set_neighbour_available(s, lc, xBase, yBase + (i << log2_trafo_size), - trafo_size_h, trafo_size_v); -- s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1); -+#ifdef RPI -+ rpi_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1); -+#else -+ s->hpc.intra_pred[log2_trafo_size - 2](s, lc, xBase, yBase + (i << log2_trafo_size), 1); -+#endif - } - if (cbf_cb[i]) -- ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), -+ ff_hevc_hls_residual_coding(s, lc, xBase, yBase + (i << log2_trafo_size), - log2_trafo_size, scan_idx_c, 1); - } - for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 
2 : 1); i++) { - if (lc->cu.pred_mode == MODE_INTRA) { -- ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), -+ ff_hevc_set_neighbour_available(s, lc, xBase, yBase + (i << log2_trafo_size), - trafo_size_h, trafo_size_v); -- s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2); -+#ifdef RPI -+ rpi_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2); -+#else -+ s->hpc.intra_pred[log2_trafo_size - 2](s, lc, xBase, yBase + (i << log2_trafo_size), 2); -+#endif - } - if (cbf_cr[i]) -- ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), -+ ff_hevc_hls_residual_coding(s, lc, xBase, yBase + (i << log2_trafo_size), - log2_trafo_size, scan_idx_c, 2); - } - } -@@ -1161,27 +1996,47 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, - if (log2_trafo_size > 2 || s->ps.sps->chroma_format_idc == 3) { - int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]); - int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]); -- ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v); -- s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1); -- s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2); -+ ff_hevc_set_neighbour_available(s, lc, x0, y0, trafo_size_h, trafo_size_v); -+#ifdef RPI -+ rpi_intra_pred(s, lc, log2_trafo_size_c, x0, y0, 1); -+ rpi_intra_pred(s, lc, log2_trafo_size_c, x0, y0, 2); -+#else -+ s->hpc.intra_pred[log2_trafo_size_c - 2](s, lc, x0, y0, 1); -+ s->hpc.intra_pred[log2_trafo_size_c - 2](s, lc, x0, y0, 2); -+#endif - if (s->ps.sps->chroma_format_idc == 2) { -- ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c), -+ ff_hevc_set_neighbour_available(s, lc, x0, y0 + (1 << log2_trafo_size_c), - trafo_size_h, trafo_size_v); -- s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1); -- s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2); -+#ifdef RPI -+ rpi_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1); -+ rpi_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2); -+#else -+ s->hpc.intra_pred[log2_trafo_size_c - 2](s, lc, x0, y0 + (1 << log2_trafo_size_c), 1); -+ s->hpc.intra_pred[log2_trafo_size_c - 2](s, lc, x0, y0 + (1 << log2_trafo_size_c), 2); -+#endif - } - } else if (blk_idx == 3) { - int trafo_size_h = 1 << (log2_trafo_size + 1); - int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]); -- ff_hevc_set_neighbour_available(s, xBase, yBase, -+ ff_hevc_set_neighbour_available(s, lc, xBase, yBase, - trafo_size_h, trafo_size_v); -- s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1); -- s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2); -+#ifdef RPI -+ rpi_intra_pred(s, lc, log2_trafo_size, xBase, yBase, 1); -+ rpi_intra_pred(s, lc, log2_trafo_size, xBase, yBase, 2); -+#else -+ s->hpc.intra_pred[log2_trafo_size - 2](s, lc, xBase, yBase, 1); -+ s->hpc.intra_pred[log2_trafo_size - 2](s, lc, xBase, yBase, 2); -+#endif - if (s->ps.sps->chroma_format_idc == 2) { -- ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)), -+ ff_hevc_set_neighbour_available(s, lc, xBase, yBase + (1 << (log2_trafo_size)), - trafo_size_h, trafo_size_v); -- s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1); -- s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2); -+#ifdef RPI -+ rpi_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (1 << 
(log2_trafo_size)), 1); -+ rpi_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2); -+#else -+ s->hpc.intra_pred[log2_trafo_size - 2](s, lc, xBase, yBase + (1 << (log2_trafo_size)), 1); -+ s->hpc.intra_pred[log2_trafo_size - 2](s, lc, xBase, yBase + (1 << (log2_trafo_size)), 2); -+#endif - } - } - } -@@ -1189,7 +2044,7 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, - return 0; - } - --static void set_deblocking_bypass(HEVCContext *s, int x0, int y0, int log2_cb_size) -+static void set_deblocking_bypass(const HEVCContext * const s, const int x0, const int y0, const int log2_cb_size) - { - int cb_size = 1 << log2_cb_size; - int log2_min_pu_size = s->ps.sps->log2_min_pu_size; -@@ -1204,13 +2059,12 @@ static void set_deblocking_bypass(HEVCContext *s, int x0, int y0, int log2_cb_si - s->is_pcm[i + j * min_pu_width] = 2; - } - --static int hls_transform_tree(HEVCContext *s, int x0, int y0, -+static int hls_transform_tree(const HEVCContext * const s, HEVCLocalContext * const lc, int x0, int y0, - int xBase, int yBase, int cb_xBase, int cb_yBase, - int log2_cb_size, int log2_trafo_size, - int trafo_depth, int blk_idx, - const int *base_cbf_cb, const int *base_cbf_cr) - { -- HEVCLocalContext *lc = s->HEVClc; - uint8_t split_transform_flag; - int cbf_cb[2]; - int cbf_cr[2]; -@@ -1242,7 +2096,7 @@ static int hls_transform_tree(HEVCContext *s, int x0, int y0, - log2_trafo_size > s->ps.sps->log2_min_tb_size && - trafo_depth < lc->cu.max_trafo_depth && - !(lc->cu.intra_split_flag && trafo_depth == 0)) { -- split_transform_flag = ff_hevc_split_transform_flag_decode(s, log2_trafo_size); -+ split_transform_flag = ff_hevc_split_transform_flag_decode(lc, log2_trafo_size); - } else { - int inter_split = s->ps.sps->max_transform_hierarchy_depth_inter == 0 && - lc->cu.pred_mode == MODE_INTER && -@@ -1256,16 +2110,16 @@ static int hls_transform_tree(HEVCContext *s, int x0, int y0, - - if (s->ps.sps->chroma_format_idc && (log2_trafo_size > 2 || s->ps.sps->chroma_format_idc == 3)) { - if (trafo_depth == 0 || cbf_cb[0]) { -- cbf_cb[0] = ff_hevc_cbf_cb_cr_decode(s, trafo_depth); -+ cbf_cb[0] = ff_hevc_cbf_cb_cr_decode(lc, trafo_depth); - if (s->ps.sps->chroma_format_idc == 2 && (!split_transform_flag || log2_trafo_size == 3)) { -- cbf_cb[1] = ff_hevc_cbf_cb_cr_decode(s, trafo_depth); -+ cbf_cb[1] = ff_hevc_cbf_cb_cr_decode(lc, trafo_depth); - } - } - - if (trafo_depth == 0 || cbf_cr[0]) { -- cbf_cr[0] = ff_hevc_cbf_cb_cr_decode(s, trafo_depth); -+ cbf_cr[0] = ff_hevc_cbf_cb_cr_decode(lc, trafo_depth); - if (s->ps.sps->chroma_format_idc == 2 && (!split_transform_flag || log2_trafo_size == 3)) { -- cbf_cr[1] = ff_hevc_cbf_cb_cr_decode(s, trafo_depth); -+ cbf_cr[1] = ff_hevc_cbf_cb_cr_decode(lc, trafo_depth); - } - } - } -@@ -1277,7 +2131,7 @@ static int hls_transform_tree(HEVCContext *s, int x0, int y0, - - #define SUBDIVIDE(x, y, idx) \ - do { \ -- ret = hls_transform_tree(s, x, y, x0, y0, cb_xBase, cb_yBase, log2_cb_size, \ -+ ret = hls_transform_tree(s, lc, x, y, x0, y0, cb_xBase, cb_yBase, log2_cb_size, \ - log2_trafo_size - 1, trafo_depth + 1, idx, \ - cbf_cb, cbf_cr); \ - if (ret < 0) \ -@@ -1299,10 +2153,10 @@ do { - if (lc->cu.pred_mode == MODE_INTRA || trafo_depth != 0 || - cbf_cb[0] || cbf_cr[0] || - (s->ps.sps->chroma_format_idc == 2 && (cbf_cb[1] || cbf_cr[1]))) { -- cbf_luma = ff_hevc_cbf_luma_decode(s, trafo_depth); -+ cbf_luma = ff_hevc_cbf_luma_decode(lc, trafo_depth); - } - -- ret = hls_transform_unit(s, x0, y0, xBase, yBase, cb_xBase, cb_yBase, -+ ret = 
hls_transform_unit(s, lc, x0, y0, xBase, yBase, cb_xBase, cb_yBase, - log2_cb_size, log2_trafo_size, - blk_idx, cbf_luma, cbf_cb, cbf_cr); - if (ret < 0) -@@ -1318,7 +2172,7 @@ do { - } - } - if (!s->sh.disable_deblocking_filter_flag) { -- ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_trafo_size); -+ ff_hevc_deblocking_boundary_strengths(s, lc, x0, y0, log2_trafo_size); - if (s->ps.pps->transquant_bypass_enable_flag && - lc->cu.cu_transquant_bypass_flag) - set_deblocking_bypass(s, x0, y0, log2_trafo_size); -@@ -1327,47 +2181,119 @@ do { - return 0; - } - --static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) ++ sps->min_cb_width = sps->width >> sps->log2_min_cb_size; ++ sps->min_cb_height = sps->height >> sps->log2_min_cb_size; ++ sps->min_tb_width = sps->width >> sps->log2_min_tb_size; ++ sps->min_tb_height = sps->height >> sps->log2_min_tb_size; ++ sps->min_pu_width = sps->width >> sps->log2_min_pu_size; ++ sps->min_pu_height = sps->height >> sps->log2_min_pu_size; ++ sps->tb_mask = (1 << (sps->log2_ctb_size - sps->log2_min_tb_size)) - 1; + -+static int pcm_extract(const HEVCContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size) - { -- HEVCLocalContext *lc = s->HEVClc; - GetBitContext gb; -- int cb_size = 1 << log2_cb_size; -- ptrdiff_t stride0 = s->frame->linesize[0]; -- ptrdiff_t stride1 = s->frame->linesize[1]; -- ptrdiff_t stride2 = s->frame->linesize[2]; -- uint8_t *dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)]; -- uint8_t *dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; -- uint8_t *dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)]; -- -- int length = cb_size * cb_size * s->ps.sps->pcm.bit_depth + -- (((cb_size >> s->ps.sps->hshift[1]) * (cb_size >> s->ps.sps->vshift[1])) + -- ((cb_size >> s->ps.sps->hshift[2]) * (cb_size >> s->ps.sps->vshift[2]))) * -- s->ps.sps->pcm.bit_depth_chroma; -- const uint8_t *pcm = skip_bytes(&lc->cc, (length + 7) >> 3); - int ret; - -- if (!s->sh.disable_deblocking_filter_flag) -- ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size); -- - ret = init_get_bits(&gb, pcm, length); - if (ret < 0) - return ret; - -- s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); -- if (s->ps.sps->chroma_format_idc) { -- s->hevcdsp.put_pcm(dst1, stride1, -+#if RPI_HEVC_SAND -+ if (av_rpi_is_sand_frame(s->frame)) { -+ s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0), -+ s->frame->linesize[0], -+ cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); -+ -+ s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> s->ps.sps->vshift[1]), -+ s->frame->linesize[1], - cb_size >> s->ps.sps->hshift[1], - cb_size >> s->ps.sps->vshift[1], - &gb, s->ps.sps->pcm.bit_depth_chroma); -- s->hevcdsp.put_pcm(dst2, stride2, -- cb_size >> s->ps.sps->hshift[2], -- cb_size >> s->ps.sps->vshift[2], -- &gb, s->ps.sps->pcm.bit_depth_chroma); - } -+ else -+#endif -+ { -+ const int stride0 = s->frame->linesize[0]; -+ uint8_t * const dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)]; -+ const int stride1 = s->frame->linesize[1]; -+ uint8_t * const dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; -+ const int stride2 = s->frame->linesize[2]; -+ uint8_t * const dst2 = 
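/* Note (an observation, not part of the patch): in the SAND branch of
 * pcm_extract() above, pixel addresses come from the
 * av_rpi_sand_frame_pos_y()/_c() helpers because SAND frames address
 * pixels through fixed-width column stripes rather than one full-row
 * stride, and chroma is stored U/V-interleaved — which is why a single
 * put_pcm_c() call writes both chroma components. */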
&s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)]; -+ -+ s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); -+ if (s->ps.sps->chroma_format_idc) { -+ s->hevcdsp.put_pcm(dst1, stride1, -+ cb_size >> s->ps.sps->hshift[1], -+ cb_size >> s->ps.sps->vshift[1], -+ &gb, s->ps.sps->pcm.bit_depth_chroma); -+ s->hevcdsp.put_pcm(dst2, stride2, -+ cb_size >> s->ps.sps->hshift[2], -+ cb_size >> s->ps.sps->vshift[2], -+ &gb, s->ps.sps->pcm.bit_depth_chroma); -+ } - ++ sps->qp_bd_offset = 6 * (sps->bit_depth - 8); ++ ++ if (av_mod_uintp2(sps->width, sps->log2_min_cb_size) || ++ av_mod_uintp2(sps->height, sps->log2_min_cb_size)) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid coded frame dimensions.\n"); ++ return AVERROR_INVALIDDATA; + } - return 0; - } - -+#ifdef RPI -+int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n) -+{ -+ HEVCRpiCoeffEnv *const cfe = jb->coeffs.s + buf_no; -+ int16_t * const coeffs = (buf_no != 3) ? cfe->buf + cfe->n : cfe->buf - (cfe->n + n); -+ cfe->n += n; -+ return coeffs; -+} -+#endif + -+// x * 2^(y*2) -+static inline unsigned int xyexp2(const unsigned int x, const unsigned int y) -+{ -+ return x << (y * 2); ++ if (sps->max_transform_hierarchy_depth_inter > sps->log2_ctb_size - sps->log2_min_tb_size) { ++ av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_inter out of range: %d\n", ++ sps->max_transform_hierarchy_depth_inter); ++ return AVERROR_INVALIDDATA; ++ } ++ if (sps->max_transform_hierarchy_depth_intra > sps->log2_ctb_size - sps->log2_min_tb_size) { ++ av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_intra out of range: %d\n", ++ sps->max_transform_hierarchy_depth_intra); ++ return AVERROR_INVALIDDATA; ++ } ++ if (sps->log2_max_trafo_size > FFMIN(sps->log2_ctb_size, 5)) { ++ av_log(avctx, AV_LOG_ERROR, ++ "max transform block size out of range: %d\n", ++ sps->log2_max_trafo_size); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (get_bits_left(gb) < 0) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Overread SPS by %d bits\n", -get_bits_left(gb)); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ return 0; +} + -+static int hls_pcm_sample(const HEVCContext * const s, HEVCLocalContext * const lc, const int x0, const int y0, unsigned int log2_cb_size) ++int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx, ++ HEVCRpiParamSets *ps, int apply_defdispwin) +{ -+ // Length in bits -+ const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) + -+ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[1]) + -+ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[2]); ++ HEVCRpiSPS *sps; ++ AVBufferRef *sps_buf = av_buffer_allocz(sizeof(*sps)); ++ unsigned int sps_id; ++ int ret; ++ ptrdiff_t nal_size; + -+ const uint8_t * const pcm = skip_bytes(&lc->cc, (length + 7) >> 3); ++ if (!sps_buf) ++ return AVERROR(ENOMEM); ++ sps = (HEVCRpiSPS*)sps_buf->data; + -+ if (!s->sh.disable_deblocking_filter_flag) -+ ff_hevc_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size); ++ av_log(avctx, AV_LOG_DEBUG, "Decoding SPS\n"); + -+#ifdef RPI -+ if (s->enable_rpi) { -+ // Copy coeffs -+ const int blen = (length + 7) >> 3; -+ // Round allocated bytes up to nearest 32 to avoid alignment confusion -+ // Allocation is in int16_t s -+ // As we are only using 1 byte per sample and the coeff buffer allows 2 per -+ // sample this rounding doesn't affect the total size we need to allocate for -+ // the 
coeff buffer -+ int16_t * const coeffs = rpi_alloc_coeff_buf(lc->jb0, 0, ((blen + 31) & ~31) >> 1); -+ memcpy(coeffs, pcm, blen); ++ nal_size = gb->buffer_end - gb->buffer; ++ if (nal_size > sizeof(sps->data)) { ++ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized SPS " ++ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", ++ nal_size, sizeof(sps->data)); ++ sps->data_size = sizeof(sps->data); ++ } else { ++ sps->data_size = nal_size; ++ } ++ memcpy(sps->data, gb->buffer, sps->data_size); + -+ // Our coeff stash assumes that any partially allocated 64byte lump -+ // is zeroed so make that true. -+ { -+ uint8_t * const eopcm = (uint8_t *)coeffs + blen; -+ if ((-(intptr_t)eopcm & 63) != 0) -+ memset(eopcm, 0, -(intptr_t)eopcm & 63); -+ } ++ ret = ff_hevc_rpi_parse_sps(sps, gb, &sps_id, ++ apply_defdispwin, ++ ps->vps_list, avctx); ++ if (ret < 0) { ++ av_buffer_unref(&sps_buf); ++ return ret; ++ } + -+ // Add command -+ { -+ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0); -+ cmd->type = RPI_PRED_I_PCM; -+ cmd->size = log2_cb_size; -+ cmd->i_pcm.src = coeffs; -+ cmd->i_pcm.x = x0; -+ cmd->i_pcm.y = y0; -+ cmd->i_pcm.src_len = length; -+ } -+ return 0; ++ if (avctx->debug & FF_DEBUG_BITSTREAM) { ++ av_log(avctx, AV_LOG_DEBUG, ++ "Parsed SPS: id %d; coded wxh: %dx%d; " ++ "cropped wxh: %dx%d; pix_fmt: %s.\n", ++ sps_id, sps->width, sps->height, ++ sps->width - (sps->output_window.left_offset + sps->output_window.right_offset), ++ sps->height - (sps->output_window.top_offset + sps->output_window.bottom_offset), ++ av_get_pix_fmt_name(sps->pix_fmt)); + } -+#endif + -+ return pcm_extract(s, pcm, length, x0, y0, 1 << log2_cb_size); ++ /* check if this is a repeat of an already parsed SPS, then keep the ++ * original one. ++ * otherwise drop all PPSes that depend on it */ ++ if (ps->sps_list[sps_id] && ++ !memcmp(ps->sps_list[sps_id]->data, sps_buf->data, sps_buf->size)) { ++ av_buffer_unref(&sps_buf); ++ } else { ++ remove_sps(ps, sps_id); ++ ps->sps_list[sps_id] = sps_buf; ++ } ++ ++ return 0; +} + - /** - * 8.5.3.2.2.1 Luma sample unidirectional interpolation process - * -@@ -1384,11 +2310,11 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) - * @param luma_offset additive offset applied to the luma prediction value - */ - --static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, -+static void luma_mc_uni(const HEVCContext * const s, HEVCLocalContext * const lc, -+ uint8_t *dst, ptrdiff_t dststride, - AVFrame *ref, const Mv *mv, int x_off, int y_off, - int block_w, int block_h, int luma_weight, int luma_offset) - { -- HEVCLocalContext *lc = s->HEVClc; - uint8_t *src = ref->data[0]; - ptrdiff_t srcstride = ref->linesize[0]; - int pic_width = s->ps.sps->width; -@@ -1399,6 +2325,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, - (s->sh.slice_type == HEVC_SLICE_B && s->ps.pps->weighted_bipred_flag); - int idx = ff_hevc_pel_weight[block_w]; - -+#ifdef DISABLE_MC -+ return; -+#endif ++static void hevc_pps_free(void *opaque, uint8_t *data) ++{ ++ HEVCRpiPPS *pps = (HEVCRpiPPS*)data; + - x_off += mv->x >> 2; - y_off += mv->y >> 2; - src += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); -@@ -1445,11 +2375,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, - * @param mv1 motion vector1 (relative to block position) to get pixel data from - * @param current_mv current motion vector structure - */ -- static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, 
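/* A minimal standalone sketch of the IPCM bit-length arithmetic above
 * (an illustration, not part of the patch; pcm_bits is a hypothetical
 * helper and both chroma planes are assumed to share one shift, as in
 * 4:2:0 or 4:4:4). xyexp2(x, y) = x << (y * 2) = x * 4^y, i.e. a bit
 * depth times the sample count of a 2^y x 2^y block: */
static unsigned int pcm_bits(unsigned int bit_depth,
                             unsigned int bit_depth_chroma,
                             unsigned int log2_cb_size, unsigned int vshift)
{
    /* luma: 4^log2_cb_size samples; chroma: two planes of 4^(log2_cb_size - vshift) */
    return (bit_depth << (log2_cb_size * 2)) +
           2 * (bit_depth_chroma << ((log2_cb_size - vshift) * 2));
}
/* e.g. 8-bit 4:2:0 with log2_cb_size = 3 (an 8x8 CB):
 * 8*64 + 2*(8*16) = 768 bits, so (768 + 7) >> 3 = 96 bytes are taken
 * from the bypass stream — exactly what hls_pcm_sample() consumes. */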
-+static void luma_mc_bi(const HEVCContext * const s, HEVCLocalContext * const lc, uint8_t *dst, ptrdiff_t dststride, - AVFrame *ref0, const Mv *mv0, int x_off, int y_off, - int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv) - { -- HEVCLocalContext *lc = s->HEVClc; - ptrdiff_t src0stride = ref0->linesize[0]; - ptrdiff_t src1stride = ref1->linesize[0]; - int pic_width = s->ps.sps->width; -@@ -1469,6 +2398,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, - uint8_t *src0 = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift); - uint8_t *src1 = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift); - -+#ifdef DISABLE_MC -+ return; -+#endif ++ av_freep(&pps->column_width); ++ av_freep(&pps->row_height); ++ av_freep(&pps->col_bd); ++ av_freep(&pps->row_bd); ++ av_freep(&pps->col_idxX); ++ av_freep(&pps->ctb_addr_rs_to_ts); ++ av_freep(&pps->ctb_addr_ts_to_rs); ++ av_freep(&pps->tile_pos_rs); ++ av_freep(&pps->tile_size); ++ av_freep(&pps->tile_id); ++ av_freep(&pps->min_tb_addr_zs_tab); + - if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER || - x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER || - y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) { -@@ -1536,11 +2469,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, - * @param chroma_offset additive offset applied to the chroma prediction value - */ - --static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, -+static void chroma_mc_uni(const HEVCContext * const s, HEVCLocalContext * const lc, uint8_t *dst0, - ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist, - int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset) - { -- HEVCLocalContext *lc = s->HEVClc; - int pic_width = s->ps.sps->width >> s->ps.sps->hshift[1]; - int pic_height = s->ps.sps->height >> s->ps.sps->vshift[1]; - const Mv *mv = ¤t_mv->mv[reflist]; -@@ -1554,6 +2486,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, - intptr_t _mx = mx << (1 - hshift); - intptr_t _my = my << (1 - vshift); - -+#ifdef DISABLE_MC -+ return; -+#endif ++ av_freep(&pps); ++} + - x_off += mv->x >> (2 + hshift); - y_off += mv->y >> (2 + vshift); - src0 += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); -@@ -1601,10 +2537,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, - * @param current_mv current motion vector structure - * @param cidx chroma component(cb, cr) - */ --static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1, -+static void chroma_mc_bi(const HEVCContext * const s, HEVCLocalContext * const lc, -+ uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1, - int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx) - { -- HEVCLocalContext *lc = s->HEVClc; - uint8_t *src1 = ref0->data[cidx+1]; - uint8_t *src2 = ref1->data[cidx+1]; - ptrdiff_t src1stride = ref0->linesize[cidx+1]; -@@ -1618,6 +2554,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF - int hshift = s->ps.sps->hshift[1]; - int vshift = s->ps.sps->vshift[1]; - -+#ifdef DISABLE_MC -+ return; -+#endif ++static int pps_range_extensions(GetBitContext *gb, AVCodecContext *avctx, ++ HEVCRpiPPS *pps, HEVCRpiSPS *sps) { ++ int i; + - intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift); - intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift); - 
intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift); -@@ -1691,37 +2631,136 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF - _mx1, _my1, block_w); - } - --static void hevc_await_progress(HEVCContext *s, HEVCFrame *ref, -- const Mv *mv, int y0, int height) -+#ifdef RPI -+void ff_hevc_rpi_progress_wait_field(const HEVCContext * const s, HEVCRpiJob * const jb, -+ const HEVCFrame * const ref, const int val, const int field) ++ if (pps->transform_skip_enabled_flag) { ++ pps->log2_max_transform_skip_block_size = get_ue_golomb_long(gb) + 2; ++ } ++ pps->cross_component_prediction_enabled_flag = get_bits1(gb); ++ pps->chroma_qp_offset_list_enabled_flag = get_bits1(gb); ++ if (pps->chroma_qp_offset_list_enabled_flag) { ++ pps->diff_cu_chroma_qp_offset_depth = get_ue_golomb_long(gb); ++ pps->chroma_qp_offset_list_len_minus1 = get_ue_golomb_long(gb); ++ if (pps->chroma_qp_offset_list_len_minus1 > 5) { ++ av_log(avctx, AV_LOG_ERROR, ++ "chroma_qp_offset_list_len_minus1 shall be in the range [0, 5].\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ for (i = 0; i <= pps->chroma_qp_offset_list_len_minus1; i++) { ++ pps->cb_qp_offset_list[i] = get_se_golomb_long(gb); ++ if (pps->cb_qp_offset_list[i]) { ++ av_log(avctx, AV_LOG_WARNING, ++ "cb_qp_offset_list not tested yet.\n"); ++ } ++ pps->cr_qp_offset_list[i] = get_se_golomb_long(gb); ++ if (pps->cr_qp_offset_list[i]) { ++ av_log(avctx, AV_LOG_WARNING, ++ "cb_qp_offset_list not tested yet.\n"); ++ } ++ } ++ } ++ pps->log2_sao_offset_scale_luma = get_ue_golomb_long(gb); ++ pps->log2_sao_offset_scale_chroma = get_ue_golomb_long(gb); ++ ++ return(0); ++} ++ ++static inline int setup_pps(AVCodecContext *avctx, GetBitContext *gb, ++ HEVCRpiPPS *pps, HEVCRpiSPS *sps) +{ -+ if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) { -+ HEVCContext *const fs = ref->tf.owner[field]->priv_data; -+ HEVCRPiFrameProgressState * const pstate = fs->progress_states + field; -+ sem_t * sem = NULL; ++ int log2_diff; ++ int pic_area_in_ctbs; ++ int i, j, x, y, ctb_addr_rs, tile_id; + -+ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); -+ if (((volatile int *)ref->tf.progress->data)[field] < val) { -+ HEVCRPiFrameProgressWait * const pwait = &jb->progress_wait; ++ // Inferred parameters ++ pps->col_bd = av_malloc_array(pps->num_tile_columns + 1, sizeof(*pps->col_bd)); ++ pps->row_bd = av_malloc_array(pps->num_tile_rows + 1, sizeof(*pps->row_bd)); ++ pps->col_idxX = av_malloc_array(sps->ctb_width, sizeof(*pps->col_idxX)); ++ if (!pps->col_bd || !pps->row_bd || !pps->col_idxX) ++ return AVERROR(ENOMEM); + -+ av_assert1(pwait->req == -1 && pwait->next == NULL); -+ jb->waited = 1; // Remember that we had to wait for later scheduling ++ if (pps->uniform_spacing_flag) { ++ if (!pps->column_width) { ++ pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width)); ++ pps->row_height = av_malloc_array(pps->num_tile_rows, sizeof(*pps->row_height)); ++ } ++ if (!pps->column_width || !pps->row_height) ++ return AVERROR(ENOMEM); + -+ pwait->req = val; -+ pwait->next = NULL; -+ if (pstate->first == NULL) -+ pstate->first = pwait; -+ else -+ pstate->last->next = pwait; -+ pstate->last = pwait; -+ sem = &pwait->sem; ++ for (i = 0; i < pps->num_tile_columns; i++) { ++ pps->column_width[i] = ((i + 1) * sps->ctb_width) / pps->num_tile_columns - ++ (i * sps->ctb_width) / pps->num_tile_columns; + } -+ pthread_mutex_unlock(&pstate->lock); + -+ if (sem != NULL) { -+ rpi_sem_wait(sem); ++ for (i = 0; i < 
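/* Worked example (an illustration, not part of the patch): the uniform
 * tile spacing above distributes CTB columns as
 *     column_width[i] = ((i + 1) * W) / N - (i * W) / N,
 * so W = 30 CTB columns split into N = 4 tiles gives widths 7, 8, 7, 8 —
 * the remainder is spread across the later tiles instead of all landing
 * in the last one. The row heights below are derived the same way. */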
pps->num_tile_rows; i++) { ++ pps->row_height[i] = ((i + 1) * sps->ctb_height) / pps->num_tile_rows - ++ (i * sps->ctb_height) / pps->num_tile_rows; + } + } -+} + -+void ff_hevc_rpi_progress_signal_field(HEVCContext * const s, const int val, const int field) -+{ -+ HEVCRPiFrameProgressState *const pstate = s->progress_states + field; ++ pps->col_bd[0] = 0; ++ for (i = 0; i < pps->num_tile_columns; i++) ++ pps->col_bd[i + 1] = pps->col_bd[i] + pps->column_width[i]; + -+ ((int *)s->ref->tf.progress->data)[field] = val; ++ pps->row_bd[0] = 0; ++ for (i = 0; i < pps->num_tile_rows; i++) ++ pps->row_bd[i + 1] = pps->row_bd[i] + pps->row_height[i]; + -+ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); -+ { -+ HEVCRPiFrameProgressWait ** ppwait = &pstate->first; -+ HEVCRPiFrameProgressWait * pwait; ++ for (i = 0, j = 0; i < sps->ctb_width; i++) { ++ if (i >= pps->col_bd[j + 1]) ++ j++; ++ pps->col_idxX[i] = j; ++ } + -+ while ((pwait = *ppwait) != NULL) { -+ if (pwait->req > val) -+ { -+ ppwait = &pwait->next; -+ pstate->last = pwait; ++ /** ++ * 6.5 ++ */ ++ pic_area_in_ctbs = sps->ctb_width * sps->ctb_height; ++ ++ pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_rs_to_ts)); ++ pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_ts_to_rs)); ++ pps->tile_id = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->tile_id)); ++ pps->tile_size = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->tile_size)); ++ pps->min_tb_addr_zs_tab = av_malloc_array((sps->tb_mask+2) * (sps->tb_mask+2), sizeof(*pps->min_tb_addr_zs_tab)); ++ if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs || ++ !pps->tile_id || !pps->min_tb_addr_zs_tab) { ++ return AVERROR(ENOMEM); ++ } ++ ++ for (ctb_addr_rs = 0; ctb_addr_rs < pic_area_in_ctbs; ctb_addr_rs++) { ++ int tb_x = ctb_addr_rs % sps->ctb_width; ++ int tb_y = ctb_addr_rs / sps->ctb_width; ++ int tile_x = 0; ++ int tile_y = 0; ++ int val = 0; ++ ++ for (i = 0; i < pps->num_tile_columns; i++) { ++ if (tb_x < pps->col_bd[i + 1]) { ++ tile_x = i; ++ break; + } -+ else -+ { -+ *ppwait = pwait->next; -+ pwait->req = -1; -+ pwait->next = NULL; -+ sem_post(&pwait->sem); ++ } ++ ++ for (i = 0; i < pps->num_tile_rows; i++) { ++ if (tb_y < pps->row_bd[i + 1]) { ++ tile_y = i; ++ break; + } + } ++ ++ for (i = 0; i < tile_x; i++) ++ val += pps->row_height[tile_y] * pps->column_width[i]; ++ for (i = 0; i < tile_y; i++) ++ val += sps->ctb_width * pps->row_height[i]; ++ ++ val += (tb_y - pps->row_bd[tile_y]) * pps->column_width[tile_x] + ++ tb_x - pps->col_bd[tile_x]; ++ ++ pps->ctb_addr_rs_to_ts[ctb_addr_rs] = val; ++ pps->ctb_addr_ts_to_rs[val] = ctb_addr_rs; + } -+ pthread_mutex_unlock(&pstate->lock); -+} + -+static void ff_hevc_rpi_progress_init_state(HEVCRPiFrameProgressState * const pstate) -+{ -+ pstate->first = NULL; -+ pstate->last = NULL; -+ pthread_mutex_init(&pstate->lock, NULL); -+} ++ for (j = 0, tile_id = 0; j < pps->num_tile_rows; j++) ++ for (i = 0; i < pps->num_tile_columns; i++, tile_id++) ++ for (y = pps->row_bd[j]; y < pps->row_bd[j + 1]; y++) ++ for (x = pps->col_bd[i]; x < pps->col_bd[i + 1]; x++) ++ pps->tile_id[pps->ctb_addr_rs_to_ts[y * sps->ctb_width + x]] = tile_id; + -+static void ff_hevc_rpi_progress_init_wait(HEVCRPiFrameProgressWait * const pwait) -+{ -+ pwait->req = -1; -+ pwait->next = NULL; -+ sem_init(&pwait->sem, 0, 0); -+} -+ -+static void ff_hevc_rpi_progress_kill_state(HEVCRPiFrameProgressState * const pstate) -+{ -+ av_assert1(pstate->first == NULL); -+ 
pthread_mutex_destroy(&pstate->lock); -+} ++ pps->tile_pos_rs = av_malloc_array(tile_id, sizeof(*pps->tile_pos_rs)); ++ if (!pps->tile_pos_rs) ++ return AVERROR(ENOMEM); + -+static void ff_hevc_rpi_progress_kill_wait(HEVCRPiFrameProgressWait * const pwait) - { -- if (s->threads_type == FF_THREAD_FRAME ) { -- int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9); -+ sem_destroy(&pwait->sem); -+} -+#endif - -- ff_thread_await_progress(&ref->tf, y, 0); -+static void hevc_await_progress(const HEVCContext * const s, HEVCLocalContext * const lc, const HEVCFrame * const ref, -+ const Mv * const mv, const int y0, const int height) -+{ -+ if (s->threads_type == FF_THREAD_FRAME) { -+ const int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9); ++ for (j = 0; j < pps->num_tile_rows; j++) ++ for (i = 0; i < pps->num_tile_columns; i++) ++ { ++ pps->tile_size[j * pps->num_tile_columns + i] = ++ pps->column_width[i] * pps->row_height[j]; ++ pps->tile_pos_rs[j * pps->num_tile_columns + i] = ++ pps->row_bd[j] * sps->ctb_width + pps->col_bd[i]; ++ } + -+#ifdef RPI -+ if (s->enable_rpi) { -+ // *** Move progress to lc -+ int16_t *const pr = lc->jb0->progress + ref->dpb_no; -+ if (*pr < y) { -+ *pr = y; ++ log2_diff = sps->log2_ctb_size - sps->log2_min_tb_size; ++ pps->min_tb_addr_zs = &pps->min_tb_addr_zs_tab[1*(sps->tb_mask+2)+1]; ++ for (y = 0; y < sps->tb_mask+2; y++) { ++ pps->min_tb_addr_zs_tab[y*(sps->tb_mask+2)] = -1; ++ pps->min_tb_addr_zs_tab[y] = -1; ++ } ++ for (y = 0; y < sps->tb_mask+1; y++) { ++ for (x = 0; x < sps->tb_mask+1; x++) { ++ int tb_x = x >> log2_diff; ++ int tb_y = y >> log2_diff; ++ int rs = sps->ctb_width * tb_y + tb_x; ++ int val = pps->ctb_addr_rs_to_ts[rs] << (log2_diff * 2); ++ for (i = 0; i < log2_diff; i++) { ++ int m = 1 << i; ++ val += (m & x ? m * m : 0) + (m & y ? 
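/* Worked example (an illustration, not part of the patch): after the
 * CTB base offset, the loop here adds a z-scan (Morton) offset by
 * interleaving the low log2_diff bits of x and y — bit i of x adds 4^i,
 * bit i of y adds 2*4^i. With log2_diff = 2 and (x, y) = (3, 1), x
 * contributes 1 + 4 and y contributes 2, so val gains 7: the z-order
 * index of that transform block within its CTB. */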
2 * m * m : 0); + } ++ pps->min_tb_addr_zs[y * (sps->tb_mask+2) + x] = val; + } -+ else -+#endif -+ // It is a const ThreadFrame but the prototype isn't -+ ff_hevc_progress_wait_mv(s, lc->jb0, ref, y); - } - } - --static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, -- int nPbH, int log2_cb_size, int part_idx, -- int merge_idx, MvField *mv) -+static void hevc_luma_mv_mvp_mode(const HEVCContext * const s, HEVCLocalContext * const lc, -+ const int x0, const int y0, const int nPbW, -+ const int nPbH, const int log2_cb_size, const int part_idx, -+ const int merge_idx, MvField * const mv) - { -- HEVCLocalContext *lc = s->HEVClc; - enum InterPredIdc inter_pred_idc = PRED_L0; - int mvp_flag; - -- ff_hevc_set_neighbour_available(s, x0, y0, nPbW, nPbH); -+ ff_hevc_set_neighbour_available(s, lc, x0, y0, nPbW, nPbH); - mv->pred_flag = 0; - if (s->sh.slice_type == HEVC_SLICE_B) -- inter_pred_idc = ff_hevc_inter_pred_idc_decode(s, nPbW, nPbH); -+ inter_pred_idc = ff_hevc_inter_pred_idc_decode(lc, nPbW, nPbH); - - if (inter_pred_idc != PRED_L1) { - if (s->sh.nb_refs[L0]) -- mv->ref_idx[0]= ff_hevc_ref_idx_lx_decode(s, s->sh.nb_refs[L0]); -+ mv->ref_idx[0]= ff_hevc_ref_idx_lx_decode(lc, s->sh.nb_refs[L0]); - - mv->pred_flag = PF_L0; -- ff_hevc_hls_mvd_coding(s, x0, y0, 0); -- mvp_flag = ff_hevc_mvp_lx_flag_decode(s); -- ff_hevc_luma_mv_mvp_mode(s, x0, y0, nPbW, nPbH, log2_cb_size, -+ ff_hevc_hls_mvd_coding(lc); -+ mvp_flag = ff_hevc_mvp_lx_flag_decode(lc); -+ ff_hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, - part_idx, merge_idx, mv, mvp_flag, 0); - mv->mv[0].x += lc->pu.mvd.x; - mv->mv[0].y += lc->pu.mvd.y; -@@ -1729,39 +2768,577 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, - - if (inter_pred_idc != PRED_L0) { - if (s->sh.nb_refs[L1]) -- mv->ref_idx[1]= ff_hevc_ref_idx_lx_decode(s, s->sh.nb_refs[L1]); -+ mv->ref_idx[1]= ff_hevc_ref_idx_lx_decode(lc, s->sh.nb_refs[L1]); - - if (s->sh.mvd_l1_zero_flag == 1 && inter_pred_idc == PRED_BI) { - AV_ZERO32(&lc->pu.mvd); - } else { -- ff_hevc_hls_mvd_coding(s, x0, y0, 1); -+ ff_hevc_hls_mvd_coding(lc); - } - - mv->pred_flag += PF_L1; -- mvp_flag = ff_hevc_mvp_lx_flag_decode(s); -- ff_hevc_luma_mv_mvp_mode(s, x0, y0, nPbW, nPbH, log2_cb_size, -+ mvp_flag = ff_hevc_mvp_lx_flag_decode(lc); -+ ff_hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, - part_idx, merge_idx, mv, mvp_flag, 1); - mv->mv[1].x += lc->pu.mvd.x; - mv->mv[1].y += lc->pu.mvd.y; - } - } - --static void hls_prediction_unit(HEVCContext *s, int x0, int y0, -- int nPbW, int nPbH, -- int log2_cb_size, int partIdx, int idx) ++ } + -+#if RPI_INTER ++ return 0; ++} + -+static HEVCRpiInterPredQ * -+rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn) ++int ff_hevc_rpi_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx, ++ HEVCRpiParamSets *ps) +{ -+ HEVCRpiInterPredQ * yp = ipe->q + ipe->curr; -+ HEVCRpiInterPredQ * ypt = yp + 1; -+ for (unsigned int i = 1; i != ipe->n_grp; ++i, ++ypt) { -+ if (ypt->load < yp->load) -+ yp = ypt; ++ HEVCRpiSPS *sps = NULL; ++ int i, ret = 0; ++ unsigned int pps_id = 0; ++ ptrdiff_t nal_size; ++ unsigned log2_parallel_merge_level_minus2; ++ ++ AVBufferRef *pps_buf; ++ HEVCRpiPPS *pps = av_mallocz(sizeof(*pps)); ++ ++ if (!pps) ++ return AVERROR(ENOMEM); ++ ++ pps_buf = av_buffer_create((uint8_t *)pps, sizeof(*pps), ++ hevc_pps_free, NULL, 0); ++ if (!pps_buf) { ++ av_freep(&pps); ++ return AVERROR(ENOMEM); + } + -+ yp->load += load_val; -+ 
ipe->used_grp = 1; -+ yp->qpu_mc_curr->data[-1] = fn; // Link is always last el of previous cmd ++ av_log(avctx, AV_LOG_DEBUG, "Decoding PPS\n"); + -+ return yp; -+} ++ nal_size = gb->buffer_end - gb->buffer; ++ if (nal_size > sizeof(pps->data)) { ++ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized PPS " ++ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", ++ nal_size, sizeof(pps->data)); ++ pps->data_size = sizeof(pps->data); ++ } else { ++ pps->data_size = nal_size; ++ } ++ memcpy(pps->data, gb->buffer, pps->data_size); ++ ++ // Default values ++ pps->loop_filter_across_tiles_enabled_flag = 1; ++ pps->num_tile_columns = 1; ++ pps->num_tile_rows = 1; ++ pps->uniform_spacing_flag = 1; ++ pps->disable_dbf = 0; ++ pps->beta_offset = 0; ++ pps->tc_offset = 0; ++ pps->log2_max_transform_skip_block_size = 2; ++ ++ // Coded parameters ++ pps_id = get_ue_golomb_long(gb); ++ if (pps_id >= HEVC_MAX_PPS_COUNT) { ++ av_log(avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", pps_id); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->sps_id = get_ue_golomb_long(gb); ++ if (pps->sps_id >= HEVC_MAX_SPS_COUNT) { ++ av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", pps->sps_id); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ if (!ps->sps_list[pps->sps_id]) { ++ av_log(avctx, AV_LOG_ERROR, "SPS %u does not exist.\n", pps->sps_id); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ sps = (HEVCRpiSPS *)ps->sps_list[pps->sps_id]->data; + ++ pps->dependent_slice_segments_enabled_flag = get_bits1(gb); ++ pps->output_flag_present_flag = get_bits1(gb); ++ pps->num_extra_slice_header_bits = get_bits(gb, 3); + -+static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe) -+{ -+ for (unsigned int i = 0; i != ipe->n; ++i) { -+ HEVCRpiInterPredQ * const q = ipe->q + i; -+ const unsigned int qfill = (char *)q->qpu_mc_curr - (char *)q->qpu_mc_base; ++ pps->sign_data_hiding_flag = get_bits1(gb); + -+ q->qpu_mc_curr->data[-1] = q->code_sync; -+ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(q->qpu_mc_curr->data + 1); -+ q->load = (qfill >> 7); // Have a mild preference for emptier Qs to balance memory usage ++ pps->cabac_init_present_flag = get_bits1(gb); ++ ++ pps->num_ref_idx_l0_default_active = get_ue_golomb_long(gb) + 1; ++ pps->num_ref_idx_l1_default_active = get_ue_golomb_long(gb) + 1; ++ ++ pps->pic_init_qp_minus26 = get_se_golomb(gb); ++ ++ pps->constrained_intra_pred_flag = get_bits1(gb); ++ pps->transform_skip_enabled_flag = get_bits1(gb); ++ ++ pps->cu_qp_delta_enabled_flag = get_bits1(gb); ++ pps->diff_cu_qp_delta_depth = 0; ++ if (pps->cu_qp_delta_enabled_flag) ++ pps->diff_cu_qp_delta_depth = get_ue_golomb_long(gb); ++ ++ if (pps->diff_cu_qp_delta_depth < 0 || ++ pps->diff_cu_qp_delta_depth > sps->log2_diff_max_min_coding_block_size) { ++ av_log(avctx, AV_LOG_ERROR, "diff_cu_qp_delta_depth %d is invalid\n", ++ pps->diff_cu_qp_delta_depth); ++ ret = AVERROR_INVALIDDATA; ++ goto err; + } -+} + -+// Returns 0 on success, -1 if Q is dangerously full -+static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe) -+{ -+ if (!ipe->used_grp) -+ return 0; ++ pps->cb_qp_offset = get_se_golomb(gb); ++ if (pps->cb_qp_offset < -12 || pps->cb_qp_offset > 12) { ++ av_log(avctx, AV_LOG_ERROR, "pps_cb_qp_offset out of range: %d\n", ++ pps->cb_qp_offset); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->cr_qp_offset = get_se_golomb(gb); ++ if (pps->cr_qp_offset < -12 || pps->cr_qp_offset > 12) { ++ av_log(avctx, AV_LOG_ERROR, "pps_cr_qp_offset out of range: %d\n", ++ pps->cr_qp_offset); ++ ret 
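/* Note (an observation, not part of the patch): rpi_nxt_pred() above is
 * a least-loaded scheduler — it scans the n_grp queues of the current
 * group, takes the one with the smallest accumulated load, charges it
 * load_val (here the block height), and writes the QPU function pointer
 * into data[-1], the link word that terminates the previous command in
 * that queue, so commands chain in GPU memory without a dispatch list. */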
= AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->pic_slice_level_chroma_qp_offsets_present_flag = get_bits1(gb); ++ ++ pps->weighted_pred_flag = get_bits1(gb); ++ pps->weighted_bipred_flag = get_bits1(gb); ++ ++ pps->transquant_bypass_enable_flag = get_bits1(gb); ++ pps->tiles_enabled_flag = get_bits1(gb); ++ pps->entropy_coding_sync_enabled_flag = get_bits1(gb); ++ ++ if (pps->tiles_enabled_flag) { ++ pps->num_tile_columns = get_ue_golomb_long(gb) + 1; ++ pps->num_tile_rows = get_ue_golomb_long(gb) + 1; ++ if (pps->num_tile_columns <= 0 || ++ pps->num_tile_columns >= sps->width) { ++ av_log(avctx, AV_LOG_ERROR, "num_tile_columns_minus1 out of range: %d\n", ++ pps->num_tile_columns - 1); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ if (pps->num_tile_rows <= 0 || ++ pps->num_tile_rows >= sps->height) { ++ av_log(avctx, AV_LOG_ERROR, "num_tile_rows_minus1 out of range: %d\n", ++ pps->num_tile_rows - 1); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } + -+ if ((ipe->curr += ipe->n_grp) >= ipe->n) -+ { -+ ipe->curr = 0; -+ rpi_inter_pred_sync(ipe); ++ pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width)); ++ pps->row_height = av_malloc_array(pps->num_tile_rows, sizeof(*pps->row_height)); ++ if (!pps->column_width || !pps->row_height) { ++ ret = AVERROR(ENOMEM); ++ goto err; ++ } ++ ++ pps->uniform_spacing_flag = get_bits1(gb); ++ if (!pps->uniform_spacing_flag) { ++ uint64_t sum = 0; ++ for (i = 0; i < pps->num_tile_columns - 1; i++) { ++ pps->column_width[i] = get_ue_golomb_long(gb) + 1; ++ sum += pps->column_width[i]; ++ } ++ if (sum >= sps->ctb_width) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid tile widths.\n"); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->column_width[pps->num_tile_columns - 1] = sps->ctb_width - sum; ++ ++ sum = 0; ++ for (i = 0; i < pps->num_tile_rows - 1; i++) { ++ pps->row_height[i] = get_ue_golomb_long(gb) + 1; ++ sum += pps->row_height[i]; ++ } ++ if (sum >= sps->ctb_height) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid tile heights.\n"); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->row_height[pps->num_tile_rows - 1] = sps->ctb_height - sum; ++ } ++ pps->loop_filter_across_tiles_enabled_flag = get_bits1(gb); + } -+ ipe->used = 1; -+ ipe->used_grp = 0; + -+ for (unsigned int i = 0; i != ipe->n_grp; ++i) { -+ HEVCRpiInterPredQ * const q = ipe->q + i + ipe->curr; -+ if ((char *)q->qpu_mc_curr - (char *)q->qpu_mc_base > ipe->max_fill) { -+ return -1; ++ pps->seq_loop_filter_across_slices_enabled_flag = get_bits1(gb); ++ ++ pps->deblocking_filter_control_present_flag = get_bits1(gb); ++ if (pps->deblocking_filter_control_present_flag) { ++ pps->deblocking_filter_override_enabled_flag = get_bits1(gb); ++ pps->disable_dbf = get_bits1(gb); ++ if (!pps->disable_dbf) { ++ int beta_offset_div2 = get_se_golomb(gb); ++ int tc_offset_div2 = get_se_golomb(gb) ; ++ if (beta_offset_div2 < -6 || beta_offset_div2 > 6) { ++ av_log(avctx, AV_LOG_ERROR, "pps_beta_offset_div2 out of range: %d\n", ++ beta_offset_div2); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ if (tc_offset_div2 < -6 || tc_offset_div2 > 6) { ++ av_log(avctx, AV_LOG_ERROR, "pps_tc_offset_div2 out of range: %d\n", ++ tc_offset_div2); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->beta_offset = 2 * beta_offset_div2; ++ pps->tc_offset = 2 * tc_offset_div2; + } + } -+ return 0; -+} + -+static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe) -+{ -+ unsigned int i; ++ pps->scaling_list_data_present_flag = get_bits1(gb); ++ if 
(pps->scaling_list_data_present_flag) { ++ set_default_scaling_list_data(&pps->scaling_list); ++ ret = scaling_list_data(gb, avctx, &pps->scaling_list, sps); ++ if (ret < 0) ++ goto err; ++ } ++ pps->lists_modification_present_flag = get_bits1(gb); ++ log2_parallel_merge_level_minus2 = get_ue_golomb_long(gb); ++ if (log2_parallel_merge_level_minus2 > sps->log2_ctb_size) { ++ av_log(avctx, AV_LOG_ERROR, "log2_parallel_merge_level_minus2 out of range: %d\n", ++ log2_parallel_merge_level_minus2); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->log2_parallel_merge_level = log2_parallel_merge_level_minus2 + 2; + -+ ipe->curr = 0; -+ ipe->used = 0; -+ ipe->used_grp = 0; -+ for (i = 0; i != ipe->n; ++i) { -+ HEVCRpiInterPredQ * const q = ipe->q + i; -+ q->qpu_mc_curr = q->qpu_mc_base; -+ q->load = 0; -+ q->last_l0 = NULL; -+ q->last_l1 = NULL; ++ pps->slice_header_extension_present_flag = get_bits1(gb); ++ ++ if (get_bits1(gb)) { // pps_extension_present_flag ++ int pps_range_extensions_flag = get_bits1(gb); ++ /* int pps_extension_7bits = */ get_bits(gb, 7); ++ if (sps->ptl.general_ptl.profile_idc == FF_PROFILE_HEVC_REXT && pps_range_extensions_flag) { ++ if ((ret = pps_range_extensions(gb, avctx, pps, sps)) < 0) ++ goto err; ++ } + } -+} + -+static void rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe, -+ const unsigned int n_max, const unsigned int n_grp, -+ const unsigned int total_size, const unsigned int min_gap) -+{ -+ memset(ipe, 0, sizeof(*ipe)); -+ av_assert0((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) != NULL); -+ ipe->n_grp = n_grp; -+ ipe->min_gap = min_gap; ++ ret = setup_pps(avctx, gb, pps, sps); ++ if (ret < 0) ++ goto err; + -+#if RPI_CACHE_UNIF_MVS -+ gpu_malloc_cached(total_size, &ipe->gptr); -+#else -+ gpu_malloc_uncached(total_size, &ipe->gptr); -+#endif -+} ++ if (get_bits_left(gb) < 0) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Overread PPS by %d bits\n", -get_bits_left(gb)); ++ goto err; ++ } + ++ remove_pps(ps, pps_id); ++ ps->pps_list[pps_id] = pps_buf; + -+#if RPI_QPU_EMU_Y -+#define get_mc_address_y(f) ((f)->data[0]) -+#else -+#define get_mc_address_y(f) get_vc_address_y(f) -+#endif -+#if RPI_QPU_EMU_C -+#define get_mc_address_u(f) ((f)->data[1]) -+#else -+#define get_mc_address_u(f) get_vc_address_u(f) -+#endif ++ return 0; + -+static inline int offset_depth_adj(const HEVCContext *const s, const int wt) -+{ -+ return s->ps.sps->high_precision_offsets_enabled_flag ? 
wt : -+ wt << (s->ps.sps->bit_depth - 8); ++err: ++ av_buffer_unref(&pps_buf); ++ return ret; +} + -+static void -+rpi_pred_y(const HEVCContext *const s, HEVCRpiJob * const jb, -+ const int x0, const int y0, -+ const int nPbW, const int nPbH, -+ const Mv *const mv, -+ const int weight_mul, -+ const int weight_offset, -+ AVFrame *const src_frame) ++int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type) +{ -+ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); -+ const unsigned int mx = mv->x & 3; -+ const unsigned int my = mv->y & 3; -+ const unsigned int my_mx = (my << 8) | mx; -+ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; -+ const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame); -+ qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off; -+ const uint32_t wo = PACK2(offset_depth_adj(s, weight_offset) * 2 + 1, weight_mul); -+ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip; -+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); ++ int max_poc_lsb = 1 << sps->log2_max_poc_lsb; ++ int prev_poc_lsb = pocTid0 % max_poc_lsb; ++ int prev_poc_msb = pocTid0 - prev_poc_lsb; ++ int poc_msb; + -+ if (my_mx == 0) -+ { -+ const int x1 = x0 + (mv->x >> 2); -+ const int y1 = y0 + (mv->y >> 2); -+ const int bh = nPbH; ++ if (poc_lsb < prev_poc_lsb && prev_poc_lsb - poc_lsb >= max_poc_lsb / 2) ++ poc_msb = prev_poc_msb + max_poc_lsb; ++ else if (poc_lsb > prev_poc_lsb && poc_lsb - prev_poc_lsb > max_poc_lsb / 2) ++ poc_msb = prev_poc_msb - max_poc_lsb; ++ else ++ poc_msb = prev_poc_msb; + -+ for (int start_x = 0; start_x < nPbW; start_x += 16) -+ { -+ const int bw = FFMIN(nPbW - start_x, 16); -+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00; ++ // For BLA picture types, POCmsb is set to 0. ++ if (nal_unit_type == HEVC_NAL_BLA_W_LP || ++ nal_unit_type == HEVC_NAL_BLA_W_RADL || ++ nal_unit_type == HEVC_NAL_BLA_N_LP) ++ poc_msb = 0; + -+#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = &s->tstats; -+ ++ts->y_pred1_x0y0; ++ return poc_msb + poc_lsb; ++} +diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h +new file mode 100644 +index 0000000000..1600076a69 +--- /dev/null ++++ b/libavcodec/rpi_hevc_ps.h +@@ -0,0 +1,437 @@ ++/* ++ * HEVC parameter set parsing ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
+
-+ if (nPbW > 8)
-+ ++ts->y_pred1_wgt8;
-+ else
-+ ++ts->y_pred1_wle8;
++#ifndef AVCODEC_RPI_HEVC_PS_H
++#define AVCODEC_RPI_HEVC_PS_H
+
-+ if (nPbH > 16)
-+ ++ts->y_pred1_hgt16;
-+ else
-+ ++ts->y_pred1_hle16;
-+ }
-+#endif
++#include <stdint.h>
+
-+ src1->x = x1 + start_x;
-+ src1->y = y1;
-+ src1->base = src_vc_address_y;
-+ cmd_y->w = bw;
-+ cmd_y->h = bh;
-+ cmd_y->wo1 = wo;
-+ cmd_y->dst_addr = dst_addr + (start_x << xshl);
-+ yp->last_l0 = &cmd_y->next_src1;
-+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
-+ }
-+ }
-+ else
-+ {
-+ const int x1_m3 = x0 + (mv->x >> 2) - 3;
-+ const int y1_m3 = y0 + (mv->y >> 2) - 3;
-+ const unsigned int bh = nPbH;
-+ int start_x = 0;
+
++#include "libavutil/buffer.h"
++#include "libavutil/pixfmt.h"
++#include "libavutil/rational.h"
+
++#include "avcodec.h"
++#include "get_bits.h"
++#include "hevc.h"
+
-+#if 1
-+ // As Y-pred operates on two independent 8-wide src blocks we can merge
-+ // this pred with the previous one if the previous one is 8 pel wide,
-+ // the same height as the current block, immediately to the left of our
-+ // current dest block and mono-pred.
++typedef struct ShortTermRPS {
++ unsigned int num_negative_pics;
++ int num_delta_pocs;
++ int rps_idx_num_delta_pocs;
++ int32_t delta_poc[32];
++ uint8_t used[32];
++} ShortTermRPS;
+
-+ qpu_mc_pred_y_p_t *const last_y8_p = jb->last_y8_p;
-+ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr)
-+ {
-+ const int bw = FFMIN(nPbW, 8);
-+ qpu_mc_src_t *const last_y8_src2 = jb->last_y8_l1;
++typedef struct LongTermRPS {
++ int poc[32];
++ uint8_t used[32];
++ uint8_t nb_refs;
++} LongTermRPS;
+
-+ last_y8_src2->x = x1_m3;
-+ last_y8_src2->y = y1_m3;
-+ last_y8_src2->base = src_vc_address_y;
-+ last_y8_p->w += bw;
-+ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21);
-+ last_y8_p->wo2 = wo;
++typedef struct SliceHeader {
++ unsigned int pps_id;
+
-+ jb->last_y8_p = NULL;
-+ jb->last_y8_l1 = NULL;
-+ start_x = bw;
-+#if RPI_TSTATS
-+ ++s->tstats.y_pred1_y8_merge;
-+#endif
-+ }
-+#endif
++ ///< address (in raster order) of the first block in the current slice segment
++ unsigned int slice_segment_addr;
++ ///< address (in raster order) of the first block in the current slice
++ unsigned int slice_addr;
+
++ enum HEVCSliceType slice_type;
+
-+ for (; start_x < nPbW; start_x += 16)
-+ {
-+ const int bw = FFMIN(nPbW - start_x, 16);
-+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx);
-+ qpu_mc_src_t *const src1 = yp->last_l0;
-+ qpu_mc_src_t *const src2 = yp->last_l1;
-+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
-+#if RPI_TSTATS
-+ {
-+ HEVCRpiStats *const ts = &s->tstats;
-+ if (mx == 0 && my == 0)
-+ ++ts->y_pred1_x0y0;
-+ else if (mx == 0)
-+ ++ts->y_pred1_x0;
-+ else if (my == 0)
-+ ++ts->y_pred1_y0;
-+ else
-+ ++ts->y_pred1_xy;
++ int pic_order_cnt_lsb;
+
-+ if (nPbW > 8)
-+ ++ts->y_pred1_wgt8;
-+ else
-+ ++ts->y_pred1_wle8;
++ uint8_t first_slice_in_pic_flag;
++ uint8_t dependent_slice_segment_flag;
++ uint8_t pic_output_flag;
++ uint8_t colour_plane_id;
+
-+ if (nPbH > 16)
-+ ++ts->y_pred1_hgt16;
-+ else
-+ ++ts->y_pred1_hle16;
-+ }
-+#endif
-+ src1->x = x1_m3 + start_x;
-+ src1->y = y1_m3;
-+ src1->base = src_vc_address_y;
-+ if (bw <= 8)
-+ {
-+ src2->x = MC_DUMMY_X;
-+ src2->y = MC_DUMMY_Y;
-+#if RPI_QPU_EMU_Y
-+ src2->base = s->qpu_dummy_frame_emu;
-+#else
-+ src2->base = s->qpu_dummy_frame_qpu;
-+#endif
-+ }
-+ else
-+ {
-+ src2->x = 
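
/*
 * Worked example for ff_hevc_rpi_compute_poc() a little earlier: with
 * log2_max_poc_lsb = 4 (so max_poc_lsb = 16) and pocTid0 = 14, the split is
 * prev_poc_lsb = 14 and prev_poc_msb = 0. A new slice with poc_lsb = 1 gives
 * prev_poc_lsb - poc_lsb = 13 >= 16/2, so the LSBs are deemed to have
 * wrapped: poc_msb becomes 0 + 16 and the returned POC is 16 + 1 = 17.
 */
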
x1_m3 + start_x + 8; -+ src2->y = y1_m3; -+ src2->base = src_vc_address_y; -+ } -+ cmd_y->w = bw; -+ cmd_y->h = bh; -+ cmd_y->mymx21 = my2_mx2_my_mx; -+ cmd_y->wo1 = wo; -+ cmd_y->wo2 = wo; -+ cmd_y->dst_addr = dst_addr + (start_x << xshl); -+ yp->last_l0 = &cmd_y->next_src1; -+ yp->last_l1 = &cmd_y->next_src2; -+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ int pic_order_cnt_lsb; + -+ if (bw == 8) { -+ jb->last_y8_l1 = src2; -+ jb->last_y8_p = cmd_y; -+ } -+ } -+ } -+} ++ uint8_t first_slice_in_pic_flag; ++ uint8_t dependent_slice_segment_flag; ++ uint8_t pic_output_flag; ++ uint8_t colour_plane_id; + -+static void -+rpi_pred_y_b(const HEVCContext * const s, HEVCRpiJob * const jb, -+ const int x0, const int y0, -+ const int nPbW, const int nPbH, -+ const struct MvField *const mv_field, -+ const AVFrame *const src_frame, -+ const AVFrame *const src_frame2) -+{ -+ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); -+ const Mv * const mv = mv_field->mv + 0; -+ const Mv * const mv2 = mv_field->mv + 1; ++ ///< RPS coded in the slice header itself is stored here ++ int short_term_ref_pic_set_sps_flag; ++ int short_term_ref_pic_set_size; ++ ShortTermRPS slice_rps; ++ const ShortTermRPS *short_term_rps; ++ int long_term_ref_pic_set_size; ++ LongTermRPS long_term_rps; ++ unsigned int list_entry_lx[2][32]; ++ ++ uint8_t rpl_modification_flag[2]; ++ uint8_t no_output_of_prior_pics_flag; ++ uint8_t slice_temporal_mvp_enabled_flag; ++ ++ unsigned int nb_refs[2]; ++ ++ uint8_t slice_sample_adaptive_offset_flag[3]; ++ uint8_t mvd_l1_zero_flag; ++ ++ uint8_t cabac_init_flag; ++ uint8_t disable_deblocking_filter_flag; ///< slice_header_disable_deblocking_filter_flag ++ uint8_t slice_loop_filter_across_slices_enabled_flag; ++ uint8_t collocated_list; ++ ++ unsigned int collocated_ref_idx; ++ ++ int slice_qp_delta; ++ int slice_cb_qp_offset; ++ int slice_cr_qp_offset; ++ ++ uint8_t cu_chroma_qp_offset_enabled_flag; ++ ++ int beta_offset; ///< beta_offset_div2 * 2 ++ int tc_offset; ///< tc_offset_div2 * 2 ++ ++ unsigned int max_num_merge_cand; ///< 5 - 5_minus_max_num_merge_cand ++ ++ unsigned *entry_point_offset; ++ int * offset; ++ int * size; ++ int num_entry_point_offsets; ++ ++ int8_t slice_qp; ++ ++ uint8_t luma_log2_weight_denom; ++ int16_t chroma_log2_weight_denom; ++ ++ int16_t luma_weight_l0[16]; ++ int16_t chroma_weight_l0[16][2]; ++ int16_t chroma_weight_l1[16][2]; ++ int16_t luma_weight_l1[16]; ++ ++ int16_t luma_offset_l0[16]; ++ int16_t chroma_offset_l0[16][2]; ++ ++ int16_t luma_offset_l1[16]; ++ int16_t chroma_offset_l1[16][2]; ++ ++ int slice_ctb_addr_rs; ++} SliceHeader; ++ ++typedef struct HEVCWindow { ++ unsigned int left_offset; ++ unsigned int right_offset; ++ unsigned int top_offset; ++ unsigned int bottom_offset; ++} HEVCWindow; ++ ++typedef struct VUI { ++ AVRational sar; ++ ++ int overscan_info_present_flag; ++ int overscan_appropriate_flag; ++ ++ int video_signal_type_present_flag; ++ int video_format; ++ int video_full_range_flag; ++ int colour_description_present_flag; ++ uint8_t colour_primaries; ++ uint8_t transfer_characteristic; ++ uint8_t matrix_coeffs; ++ ++ int chroma_loc_info_present_flag; ++ int chroma_sample_loc_type_top_field; ++ int chroma_sample_loc_type_bottom_field; ++ int neutra_chroma_indication_flag; ++ ++ int field_seq_flag; ++ int frame_field_info_present_flag; ++ ++ int default_display_window_flag; ++ HEVCWindow def_disp_win; ++ ++ int vui_timing_info_present_flag; ++ uint32_t vui_num_units_in_tick; ++ uint32_t vui_time_scale; ++ 
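
/*
 * Sketch of the halfword packing behind the wo1/wo2 and mymx21 words built
 * in rpi_pred_y()/rpi_pred_y_b() above. The patch does not show PACK2's
 * definition in this hunk, so the convention below (high halfword first,
 * two 16-bit fields in one 32-bit QPU argument) is an assumption inferred
 * from how the values are consumed:
 */
#include <stdint.h>

static inline uint32_t sk_pack2(uint32_t hi, uint32_t lo)
{
    return (hi << 16) | (lo & 0xffffu);
}
/* e.g. sk_pack2(2 * offset + 1, weight) mirrors the wo words above. */
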
int vui_poc_proportional_to_timing_flag; ++ int vui_num_ticks_poc_diff_one_minus1; ++ int vui_hrd_parameters_present_flag; ++ ++ int bitstream_restriction_flag; ++ int tiles_fixed_structure_flag; ++ int motion_vectors_over_pic_boundaries_flag; ++ int restricted_ref_pic_lists_flag; ++ int min_spatial_segmentation_idc; ++ int max_bytes_per_pic_denom; ++ int max_bits_per_min_cu_denom; ++ int log2_max_mv_length_horizontal; ++ int log2_max_mv_length_vertical; ++} VUI; ++ ++typedef struct PTLCommon { ++ uint8_t profile_space; ++ uint8_t tier_flag; ++ uint8_t profile_idc; ++ uint8_t profile_compatibility_flag[32]; ++ uint8_t level_idc; ++ uint8_t progressive_source_flag; ++ uint8_t interlaced_source_flag; ++ uint8_t non_packed_constraint_flag; ++ uint8_t frame_only_constraint_flag; ++} PTLCommon; ++ ++typedef struct PTL { ++ PTLCommon general_ptl; ++ PTLCommon sub_layer_ptl[HEVC_MAX_SUB_LAYERS]; ++ ++ uint8_t sub_layer_profile_present_flag[HEVC_MAX_SUB_LAYERS]; ++ uint8_t sub_layer_level_present_flag[HEVC_MAX_SUB_LAYERS]; ++} PTL; ++ ++typedef struct HEVCRpiVPS { ++ uint8_t vps_temporal_id_nesting_flag; ++ int vps_max_layers; ++ int vps_max_sub_layers; ///< vps_max_temporal_layers_minus1 + 1 ++ ++ PTL ptl; ++ int vps_sub_layer_ordering_info_present_flag; ++ unsigned int vps_max_dec_pic_buffering[HEVC_MAX_SUB_LAYERS]; ++ unsigned int vps_num_reorder_pics[HEVC_MAX_SUB_LAYERS]; ++ unsigned int vps_max_latency_increase[HEVC_MAX_SUB_LAYERS]; ++ int vps_max_layer_id; ++ int vps_num_layer_sets; ///< vps_num_layer_sets_minus1 + 1 ++ uint8_t vps_timing_info_present_flag; ++ uint32_t vps_num_units_in_tick; ++ uint32_t vps_time_scale; ++ uint8_t vps_poc_proportional_to_timing_flag; ++ int vps_num_ticks_poc_diff_one; ///< vps_num_ticks_poc_diff_one_minus1 + 1 ++ int vps_num_hrd_parameters; ++ ++ uint8_t data[4096]; ++ int data_size; ++} HEVCRpiVPS; ++ ++typedef struct ScalingList { ++ /* This is a little wasteful, since sizeID 0 only needs 8 coeffs, ++ * and size ID 3 only has 2 arrays, not 6. 
*/ ++ uint8_t sl[4][6][64]; ++ uint8_t sl_dc[2][6]; ++} ScalingList; ++ ++typedef struct HEVCRpiSPS { ++ unsigned vps_id; ++ int chroma_format_idc; ++ uint8_t separate_colour_plane_flag; ++ ++ HEVCWindow output_window; ++ ++ HEVCWindow pic_conf_win; ++ ++ int bit_depth; ++ int bit_depth_chroma; ++ int pixel_shift; ++ enum AVPixelFormat pix_fmt; ++ ++ unsigned int log2_max_poc_lsb; ++ int pcm_enabled_flag; ++ ++ int max_sub_layers; ++ struct { ++ int max_dec_pic_buffering; ++ int num_reorder_pics; ++ int max_latency_increase; ++ } temporal_layer[HEVC_MAX_SUB_LAYERS]; ++ uint8_t temporal_id_nesting_flag; ++ ++ VUI vui; ++ PTL ptl; ++ ++ uint8_t scaling_list_enable_flag; ++ ScalingList scaling_list; ++ ++ unsigned int nb_st_rps; ++ ShortTermRPS st_rps[HEVC_MAX_SHORT_TERM_RPS_COUNT]; ++ ++ uint8_t amp_enabled_flag; ++ uint8_t sao_enabled; ++ ++ uint8_t long_term_ref_pics_present_flag; ++ uint16_t lt_ref_pic_poc_lsb_sps[32]; ++ uint8_t used_by_curr_pic_lt_sps_flag[32]; ++ uint8_t num_long_term_ref_pics_sps; ++ ++ struct { ++ uint8_t bit_depth; ++ uint8_t bit_depth_chroma; ++ unsigned int log2_min_pcm_cb_size; ++ unsigned int log2_max_pcm_cb_size; ++ uint8_t loop_filter_disable_flag; ++ } pcm; ++ uint8_t sps_temporal_mvp_enabled_flag; ++ uint8_t sps_strong_intra_smoothing_enable_flag; ++ ++ unsigned int log2_min_cb_size; ++ unsigned int log2_diff_max_min_coding_block_size; ++ unsigned int log2_min_tb_size; ++ unsigned int log2_max_trafo_size; ++ unsigned int log2_ctb_size; ++ unsigned int log2_min_pu_size; ++ ++ int max_transform_hierarchy_depth_inter; ++ int max_transform_hierarchy_depth_intra; ++ ++ int transform_skip_rotation_enabled_flag; ++ int transform_skip_context_enabled_flag; ++ int implicit_rdpcm_enabled_flag; ++ int explicit_rdpcm_enabled_flag; ++ int intra_smoothing_disabled_flag; ++ int high_precision_offsets_enabled_flag; ++ int persistent_rice_adaptation_enabled_flag; ++ ++ ///< coded frame dimension in various units ++ int width; ++ int height; ++ int ctb_width; ++ int ctb_height; ++ int ctb_size; ++ int min_cb_width; ++ int min_cb_height; ++ int min_tb_width; ++ int min_tb_height; ++ int min_pu_width; ++ int min_pu_height; ++ int tb_mask; ++ ++ int hshift[3]; ++ int vshift[3]; ++ ++ int qp_bd_offset; ++ ++ uint8_t data[4096]; ++ int data_size; ++} HEVCRpiSPS; ++ ++typedef struct HEVCRpiPPS { ++ unsigned int sps_id; ///< seq_parameter_set_id ++ ++ uint8_t sign_data_hiding_flag; ++ ++ uint8_t cabac_init_present_flag; ++ ++ int num_ref_idx_l0_default_active; ///< num_ref_idx_l0_default_active_minus1 + 1 ++ int num_ref_idx_l1_default_active; ///< num_ref_idx_l1_default_active_minus1 + 1 ++ int pic_init_qp_minus26; ++ ++ uint8_t constrained_intra_pred_flag; ++ uint8_t transform_skip_enabled_flag; ++ ++ uint8_t cu_qp_delta_enabled_flag; ++ int diff_cu_qp_delta_depth; ++ ++ int cb_qp_offset; ++ int cr_qp_offset; ++ uint8_t pic_slice_level_chroma_qp_offsets_present_flag; ++ uint8_t weighted_pred_flag; ++ uint8_t weighted_bipred_flag; ++ uint8_t output_flag_present_flag; ++ uint8_t transquant_bypass_enable_flag; ++ ++ uint8_t dependent_slice_segments_enabled_flag; ++ uint8_t tiles_enabled_flag; ++ uint8_t entropy_coding_sync_enabled_flag; ++ ++ int num_tile_columns; ///< num_tile_columns_minus1 + 1 ++ int num_tile_rows; ///< num_tile_rows_minus1 + 1 ++ uint8_t uniform_spacing_flag; ++ uint8_t loop_filter_across_tiles_enabled_flag; ++ ++ uint8_t seq_loop_filter_across_slices_enabled_flag; ++ ++ uint8_t deblocking_filter_control_present_flag; ++ uint8_t 
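
/*
 * The "coded frame dimension in various units" block in HEVCRpiSPS above is
 * derived, not parsed: everything follows from width/height plus the log2
 * block sizes. Sketch of the usual relations (assuming width and height are
 * already rounded to multiples of the minimum CB size, as the decoder
 * requires):
 */
static void sk_derive_dims(int width, int height,
                           int log2_ctb_size, int log2_min_cb_size,
                           int *ctb_width, int *ctb_height, int *min_cb_width)
{
    *ctb_width    = (width  + (1 << log2_ctb_size) - 1) >> log2_ctb_size; /* ceil */
    *ctb_height   = (height + (1 << log2_ctb_size) - 1) >> log2_ctb_size;
    *min_cb_width = width >> log2_min_cb_size;
}
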
deblocking_filter_override_enabled_flag; ++ uint8_t disable_dbf; ++ int beta_offset; ///< beta_offset_div2 * 2 ++ int tc_offset; ///< tc_offset_div2 * 2 ++ ++ uint8_t scaling_list_data_present_flag; ++ ScalingList scaling_list; ++ ++ uint8_t lists_modification_present_flag; ++ int log2_parallel_merge_level; ///< log2_parallel_merge_level_minus2 + 2 ++ int num_extra_slice_header_bits; ++ uint8_t slice_header_extension_present_flag; ++ uint8_t log2_max_transform_skip_block_size; ++ uint8_t cross_component_prediction_enabled_flag; ++ uint8_t chroma_qp_offset_list_enabled_flag; ++ uint8_t diff_cu_chroma_qp_offset_depth; ++ uint8_t chroma_qp_offset_list_len_minus1; ++ int8_t cb_qp_offset_list[6]; ++ int8_t cr_qp_offset_list[6]; ++ uint8_t log2_sao_offset_scale_luma; ++ uint8_t log2_sao_offset_scale_chroma; ++ ++ // Inferred parameters ++ unsigned int *column_width; ///< ColumnWidth ++ unsigned int *row_height; ///< RowHeight ++ unsigned int *col_bd; ///< ColBd ++ unsigned int *row_bd; ///< RowBd ++ int *col_idxX; ++ ++ int *ctb_addr_rs_to_ts; ///< CtbAddrRSToTS ++ int *ctb_addr_ts_to_rs; ///< CtbAddrTSToRS ++ int *tile_id; ///< TileId ++ int *tile_pos_rs; ///< TilePosRS ++ int *tile_size; ///< TileSize ++ int *min_tb_addr_zs; ///< MinTbAddrZS ++ int *min_tb_addr_zs_tab;///< MinTbAddrZS ++ ++ uint8_t data[4096]; ++ int data_size; ++} HEVCRpiPPS; ++ ++typedef struct HEVCRpiParamSets { ++ AVBufferRef *vps_list[HEVC_MAX_VPS_COUNT]; ++ AVBufferRef *sps_list[HEVC_MAX_SPS_COUNT]; ++ AVBufferRef *pps_list[HEVC_MAX_PPS_COUNT]; ++ ++ /* currently active parameter sets */ ++ const HEVCRpiVPS *vps; ++ const HEVCRpiSPS *sps; ++ const HEVCRpiPPS *pps; ++} HEVCRpiParamSets; ++ ++/** ++ * Parse the SPS from the bitstream into the provided HEVCRpiSPS struct. ++ * ++ * @param sps_id the SPS id will be written here ++ * @param apply_defdispwin if set 1, the default display window from the VUI ++ * will be applied to the video dimensions ++ * @param vps_list if non-NULL, this function will validate that the SPS refers ++ * to an existing VPS ++ */ ++int ff_hevc_rpi_parse_sps(HEVCRpiSPS *sps, GetBitContext *gb, unsigned int *sps_id, ++ int apply_defdispwin, AVBufferRef **vps_list, AVCodecContext *avctx); + -+ const unsigned int mx = mv->x & 3; -+ const unsigned int my = mv->y & 3; -+ const unsigned int my_mx = (my<<8) | mx; -+ const unsigned int mx2 = mv2->x & 3; -+ const unsigned int my2 = mv2->y & 3; -+ const unsigned int my2_mx2 = (my2<<8) | mx2; -+ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; -+ const unsigned int ref_idx0 = mv_field->ref_idx[0]; -+ const unsigned int ref_idx1 = mv_field->ref_idx[1]; -+ const uint32_t wt_offset = -+ offset_depth_adj(s, s->sh.luma_offset_l0[ref_idx0] + s->sh.luma_offset_l1[ref_idx1]) + 1; -+ const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]); -+ const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]); ++int ff_hevc_rpi_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx, ++ HEVCRpiParamSets *ps); ++int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx, ++ HEVCRpiParamSets *ps, int apply_defdispwin); ++int ff_hevc_rpi_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx, ++ HEVCRpiParamSets *ps); + -+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); -+ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off; -+ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame); -+ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2); -+ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip; ++int 
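
/*
 * The ctb_addr_rs_to_ts / ctb_addr_ts_to_rs tables in the inferred-parameter
 * block above are mutually inverse permutations over all CTBs (raster scan
 * versus tile scan); given either one, the other is recovered as below.
 * Standalone sketch, with n_ctbs = ctb_width * ctb_height:
 */
static void sk_invert_ctb_scan(const int *rs_to_ts, int *ts_to_rs, int n_ctbs)
{
    for (int rs = 0; rs < n_ctbs; rs++)
        ts_to_rs[rs_to_ts[rs]] = rs;
}
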
ff_hevc_rpi_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx, ++ ShortTermRPS *rps, const HEVCRpiSPS *sps, int is_slice_header); + -+ if (my2_mx2_my_mx == 0) -+ { -+ const int x1 = x0 + (mv->x >> 2); -+ const int y1 = y0 + (mv->y >> 2); -+ const int x2 = x0 + (mv2->x >> 2); -+ const int y2 = y0 + (mv2->y >> 2); -+ const int bh = nPbH; ++int ff_hevc_rpi_encode_nal_vps(HEVCRpiVPS *vps, unsigned int id, ++ uint8_t *buf, int buf_size); + -+ // Can do chunks a full 16 wide if we don't want the H filter -+ for (int start_x=0; start_x < nPbW; start_x += 16) -+ { -+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_src_t *const src2 = yp->last_l1; -+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; -+#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = &s->tstats; -+ ++ts->y_pred2_x0y0; ++/** ++ * Compute POC of the current frame and return it. ++ */ ++int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type); + -+ if (nPbH > 16) -+ ++ts->y_pred2_hgt16; -+ else -+ ++ts->y_pred2_hle16; -+ } -+#endif -+ src1->x = x1 + start_x; -+ src1->y = y1; -+ src1->base = src1_base; -+ src2->x = x2 + start_x; -+ src2->y = y2; -+ src2->base = src2_base; -+ cmd_y->w = FFMIN(nPbW - start_x, 16); -+ cmd_y->h = bh; -+ cmd_y->mymx21 = 0; -+ cmd_y->wo1 = wo1; -+ cmd_y->wo2 = wo2; -+ cmd_y->dst_addr = dst + (start_x << xshl); -+ yp->last_l0 = &cmd_y->next_src1; -+ yp->last_l1 = &cmd_y->next_src2; -+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); -+ } ++#endif /* AVCODEC_RPI_HEVC_PS_H */ +diff --git a/libavcodec/rpi_hevc_ps_enc.c b/libavcodec/rpi_hevc_ps_enc.c +new file mode 100644 +index 0000000000..7fa6af1cdf +--- /dev/null ++++ b/libavcodec/rpi_hevc_ps_enc.c +@@ -0,0 +1,118 @@ ++/* ++ * HEVC Parameter Set encoding ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "golomb.h" ++#include "rpi_hevc_ps.h" ++#include "put_bits.h" ++ ++static void write_ptl_layer(PutBitContext *pb, PTLCommon *ptl) ++{ ++ int i; ++ ++ put_bits(pb, 2, ptl->profile_space); ++ put_bits(pb, 1, ptl->tier_flag); ++ put_bits(pb, 5, ptl->profile_idc); ++ for (i = 0; i < 32; i++) ++ put_bits(pb, 1, ptl->profile_compatibility_flag[i]); ++ put_bits(pb, 1, ptl->progressive_source_flag); ++ put_bits(pb, 1, ptl->interlaced_source_flag); ++ put_bits(pb, 1, ptl->non_packed_constraint_flag); ++ put_bits(pb, 1, ptl->frame_only_constraint_flag); ++ put_bits32(pb, 0); // reserved ++ put_bits(pb, 12, 0); // reserved ++} ++ ++static void write_ptl(PutBitContext *pb, PTL *ptl, int max_num_sub_layers) ++{ ++ int i; ++ ++ write_ptl_layer(pb, &ptl->general_ptl); ++ put_bits(pb, 8, ptl->general_ptl.level_idc); ++ ++ for (i = 0; i < max_num_sub_layers - 1; i++) { ++ put_bits(pb, 1, ptl->sub_layer_profile_present_flag[i]); ++ put_bits(pb, 1, ptl->sub_layer_level_present_flag[i]); + } -+ else -+ { -+ // Filter requires a run-up of 3 -+ const int x1 = x0 + (mv->x >> 2) - 3; -+ const int y1 = y0 + (mv->y >> 2) - 3; -+ const int x2 = x0 + (mv2->x >> 2) - 3; -+ const int y2 = y0 + (mv2->y >> 2) - 3; -+ const int bh = nPbH; + -+ for (int start_x=0; start_x < nPbW; start_x += 8) -+ { // B blocks work 8 at a time -+ // B weights aren't doubled as the QPU code does the same -+ // amount of work as it does for P -+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx); -+ qpu_mc_src_t *const src1 = yp->last_l0; -+ qpu_mc_src_t *const src2 = yp->last_l1; -+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; -+#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = &s->tstats; -+ const unsigned int mmx = mx | mx2; -+ const unsigned int mmy = my | my2; -+ if (mmx == 0 && mmy == 0) -+ ++ts->y_pred2_x0y0; -+ else if (mmx == 0) -+ ++ts->y_pred2_x0; -+ else if (mmy == 0) -+ ++ts->y_pred2_y0; -+ else -+ ++ts->y_pred2_xy; ++ if (max_num_sub_layers > 1) ++ for (i = max_num_sub_layers - 1; i < 8; i++) ++ put_bits(pb, 2, 0); // reserved + -+ if (nPbH > 16) -+ ++ts->y_pred2_hgt16; -+ else -+ ++ts->y_pred2_hle16; -+ } -+#endif -+ src1->x = x1 + start_x; -+ src1->y = y1; -+ src1->base = src1_base; -+ src2->x = x2 + start_x; -+ src2->y = y2; -+ src2->base = src2_base; -+ cmd_y->w = FFMIN(nPbW - start_x, 8); -+ cmd_y->h = bh; -+ cmd_y->mymx21 = my2_mx2_my_mx; -+ cmd_y->wo1 = wo1; -+ cmd_y->wo2 = wo2; -+ cmd_y->dst_addr = dst + (start_x << xshl); -+ yp->last_l0 = &cmd_y->next_src1; -+ yp->last_l1 = &cmd_y->next_src2; -+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); -+ } ++ for (i = 0; i < max_num_sub_layers - 1; i++) { ++ if (ptl->sub_layer_profile_present_flag[i]) ++ write_ptl_layer(pb, &ptl->sub_layer_ptl[i]); ++ if (ptl->sub_layer_level_present_flag[i]) ++ put_bits(pb, 8, ptl->sub_layer_ptl[i].level_idc); + } +} + -+// h/v shifts fixed at one as that is all the qasm copes with -+static void -+rpi_pred_c(const HEVCContext * const s, HEVCRpiJob * const jb, -+ const unsigned int lx, const int x0_c, const int y0_c, -+ const int nPbW_c, const int nPbH_c, -+ const Mv * const mv, -+ const int16_t * const c_weights, -+ const int16_t * const c_offsets, -+ AVFrame * const src_frame) ++int ff_hevc_rpi_encode_nal_vps(HEVCRpiVPS *vps, unsigned int id, ++ uint8_t *buf, int 
buf_size) +{ -+ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); -+ const int hshift = 1; // = s->ps.sps->hshift[1]; -+ const int vshift = 1; // = s->ps.sps->vshift[1]; ++ PutBitContext pb; ++ int i; + -+ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; -+ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; -+ const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame); -+ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)]; -+ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)]; -+ const uint32_t wo_u = PACK2(offset_depth_adj(s, c_offsets[0]) * 2 + 1, c_weights[0]); -+ const uint32_t wo_v = PACK2(offset_depth_adj(s, c_offsets[1]) * 2 + 1, c_weights[1]); -+ qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; -+ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip; -+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; -+ const unsigned int bh = nPbH_c; -+ const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1; ++ init_put_bits(&pb, buf, buf_size); ++ put_bits(&pb, 4, id); ++ put_bits(&pb, 2, 3); // reserved ++ put_bits(&pb, 6, vps->vps_max_layers - 1); ++ put_bits(&pb, 3, vps->vps_max_sub_layers - 1); ++ put_bits(&pb, 1, vps->vps_temporal_id_nesting_flag); ++ put_bits(&pb, 16, 0xffff); // reserved ++ ++ write_ptl(&pb, &vps->ptl, vps->vps_max_sub_layers); ++ ++ put_bits(&pb, 1, vps->vps_sub_layer_ordering_info_present_flag); ++ for (i = vps->vps_sub_layer_ordering_info_present_flag ? 0 : vps->vps_max_layers - 1; ++ i < vps->vps_max_sub_layers; i++) { ++ set_ue_golomb(&pb, vps->vps_max_dec_pic_buffering[i] - 1); ++ set_ue_golomb(&pb, vps->vps_num_reorder_pics[i]); ++ set_ue_golomb(&pb, vps->vps_max_latency_increase[i] + 1); ++ } + -+ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) -+ { -+ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn); -+ qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p; -+ qpu_mc_src_t ** const plast_lx = (lx == 0) ? 
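
/*
 * The set_ue_golomb() calls in ff_hevc_rpi_encode_nal_vps() above write the
 * inverse of the ue(v) read: floor(log2(v+1)) zero bits, then v+1 in binary.
 * A compact standalone sketch (hypothetical helpers, not FFmpeg's
 * PutBitContext API):
 */
#include <stdint.h>

static unsigned sk_ue_len(uint32_t v)      /* total bit length of ue(v) */
{
    unsigned len = 0;
    while (((v + 1) >> len) > 1)
        len++;                             /* len = floor(log2(v + 1)) */
    return 2 * len + 1;
}

static uint32_t sk_ue_code(uint32_t v)     /* right-aligned code word */
{
    return v + 1;                          /* the leading zeros are implicit */
}
/* e.g. v = 3: sk_ue_len() = 5 and sk_ue_code() = 4, i.e. the bits 00100. */
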
&cp->last_l0 : &cp->last_l1; -+ qpu_mc_src_t * const last_lx = *plast_lx; -+ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); ++ put_bits(&pb, 6, vps->vps_max_layer_id); ++ set_ue_golomb(&pb, vps->vps_num_layer_sets - 1); + -+ last_lx->x = x1_c + start_x; -+ last_lx->y = y1_c; -+ last_lx->base = src_base_u; -+ cmd_c->h = bh; -+ cmd_c->w = bw; -+ cmd_c->coeffs_x = x_coeffs; -+ cmd_c->coeffs_y = y_coeffs; -+ cmd_c->wo_u = wo_u; -+ cmd_c->wo_v = wo_v; -+ cmd_c->dst_addr_c = dst_base_u + (start_x << xshl); -+ *plast_lx = &cmd_c->next_src; -+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1); ++ if (vps->vps_num_layer_sets > 1) { ++ avpriv_report_missing_feature(NULL, "Writing layer_id_included_flag"); ++ return AVERROR_PATCHWELCOME; + } -+ return; ++ ++ put_bits(&pb, 1, vps->vps_timing_info_present_flag); ++ if (vps->vps_timing_info_present_flag) { ++ put_bits32(&pb, vps->vps_num_units_in_tick); ++ put_bits32(&pb, vps->vps_time_scale); ++ put_bits(&pb, 1, vps->vps_poc_proportional_to_timing_flag); ++ if (vps->vps_poc_proportional_to_timing_flag) ++ set_ue_golomb(&pb, vps->vps_num_ticks_poc_diff_one - 1); ++ ++ if (vps->vps_num_hrd_parameters) { ++ avpriv_report_missing_feature(NULL, "Writing HRD parameters"); ++ return AVERROR_PATCHWELCOME; ++ } ++ } ++ ++ put_bits(&pb, 1, 0); // extension flag ++ ++ put_bits(&pb, 1, 1); // stop bit ++ avpriv_align_put_bits(&pb); ++ ++ return put_bits_count(&pb) / 8; +} +diff --git a/libavcodec/rpi_hevc_refs.c b/libavcodec/rpi_hevc_refs.c +new file mode 100644 +index 0000000000..ef15784317 +--- /dev/null ++++ b/libavcodec/rpi_hevc_refs.c +@@ -0,0 +1,515 @@ ++/* ++ * HEVC video decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + -+// h/v shifts fixed at one as that is all the qasm copes with -+static void -+rpi_pred_c_b(const HEVCContext * const s, HEVCRpiJob * const jb, -+ const int x0_c, const int y0_c, -+ const int nPbW_c, const int nPbH_c, -+ const struct MvField * const mv_field, -+ const int16_t * const c_weights, -+ const int16_t * const c_offsets, -+ const int16_t * const c_weights2, -+ const int16_t * const c_offsets2, -+ AVFrame * const src_frame, -+ AVFrame * const src_frame2) ++#include "libavutil/avassert.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/rpi_sand_fns.h" ++#include "internal.h" ++#include "thread.h" ++#include "hevc.h" ++#include "rpi_hevcdec.h" ++ ++void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCFrame *frame, int flags) +{ -+ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); -+ const int hshift = 1; // s->ps.sps->hshift[1]; -+ const int vshift = 1; // s->ps.sps->vshift[1]; -+ const Mv * const mv = mv_field->mv + 0; -+ const Mv * const mv2 = mv_field->mv + 1; ++ /* frame->frame can be NULL if context init failed */ ++ if (!frame->frame || !frame->frame->buf[0]) ++ return; + -+ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift); -+ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift); -+ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; -+ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector -+ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; -+ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; ++ frame->flags &= ~flags; ++ if (!frame->flags) { ++ ff_thread_release_buffer(s->avctx, &frame->tf); + -+ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift); -+ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift); -+ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; -+ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector ++ av_buffer_unref(&frame->tab_mvf_buf); ++ frame->tab_mvf = NULL; + -+ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1; -+ const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1; ++ av_buffer_unref(&frame->rpl_buf); ++ av_buffer_unref(&frame->rpl_tab_buf); ++ frame->rpl_tab = NULL; ++ frame->refPicList = NULL; + -+ const uint32_t wo_u2 = PACK2(offset_depth_adj(s, c_offsets[0] + c_offsets2[0]) + 1, c_weights2[0]); -+ const uint32_t wo_v2 = PACK2(offset_depth_adj(s, c_offsets[1] + c_offsets2[1]) + 1, c_weights2[1]); ++ frame->collocated_ref = NULL; ++ } ++} + -+ const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; -+ const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame); -+ const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2); -+ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip; -+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; -+ const unsigned int bh = nPbH_c; ++const RefPicList *ff_hevc_rpi_get_ref_list(const HEVCRpiContext * const s, const HEVCFrame * const ref, int x0, int y0) ++{ ++ int x_cb = x0 >> s->ps.sps->log2_ctb_size; ++ int y_cb = y0 >> s->ps.sps->log2_ctb_size; ++ int pic_width_cb = s->ps.sps->ctb_width; ++ int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[y_cb * pic_width_cb + x_cb]; ++ return (const RefPicList *)ref->rpl_tab[ctb_addr_ts]; ++} + -+ for (int start_x=0; start_x < nPbW_c; 
start_x += RPI_CHROMA_BLOCK_WIDTH) -+ { -+ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); ++void ff_hevc_rpi_clear_refs(HEVCRpiContext *s) ++{ ++ int i; ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) ++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ++ HEVC_FRAME_FLAG_SHORT_REF | ++ HEVC_FRAME_FLAG_LONG_REF); ++} + -+ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx); -+ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b; -+ qpu_mc_src_t * const src_l0 = cp->last_l0; -+ qpu_mc_src_t * const src_l1 = cp->last_l1; ++void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s) ++{ ++ int i; ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) ++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0); ++} + -+ src_l0->x = x1_c + start_x; -+ src_l0->y = y1_c; -+ src_l0->base = src1_base; -+ src_l1->x = x2_c + start_x; -+ src_l1->y = y2_c; -+ src_l1->base = src2_base; ++static HEVCFrame *alloc_frame(HEVCRpiContext *s) ++{ ++ int i, j, ret; ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCFrame *frame = &s->DPB[i]; ++ if (frame->frame->buf[0]) ++ continue; + -+ u[0].h = bh; -+ u[0].w = bw; -+ u[0].coeffs_x1 = coefs0_x; -+ u[0].coeffs_y1 = coefs0_y; -+ u[0].weight_u1 = c_weights[0]; // Weight L0 U -+ u[0].weight_v1 = c_weights[1]; // Weight L0 V -+ u[0].coeffs_x2 = coefs1_x; -+ u[0].coeffs_y2 = coefs1_y; -+ u[0].wo_u2 = wo_u2; -+ u[0].wo_v2 = wo_v2; -+ u[0].dst_addr_c = dst_base_u + (start_x << xshl); ++ ret = ff_thread_get_buffer(s->avctx, &frame->tf, ++ AV_GET_BUFFER_FLAG_REF); ++ if (ret < 0) ++ return NULL; + -+ cp->last_l0 = &u[0].next_src1; -+ cp->last_l1 = &u[0].next_src2; -+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); ++ frame->rpl_buf = av_buffer_allocz(s->pkt.nb_nals * sizeof(RefPicListTab)); ++ if (!frame->rpl_buf) ++ goto fail; ++ ++ frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool); ++ if (!frame->tab_mvf_buf) ++ goto fail; ++ frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data; ++ ++ frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool); ++ if (!frame->rpl_tab_buf) ++ goto fail; ++ frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data; ++ frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height; ++ for (j = 0; j < frame->ctb_count; j++) ++ frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data; ++ ++ frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD; ++ frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD); ++ ++ return frame; ++ ++fail: ++ ff_hevc_rpi_unref_frame(s, frame, ~0); ++ return NULL; + } ++ av_log(s->avctx, AV_LOG_ERROR, "Error allocating frame, DPB full.\n"); ++ return NULL; +} + ++int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc) ++{ ++ HEVCFrame *ref; ++ int i; + -+#endif ++ /* check that this POC doesn't already exist */ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCFrame *frame = &s->DPB[i]; + ++ if (frame->frame->buf[0] && frame->sequence == s->seq_decode && ++ frame->poc == poc) { ++ av_log(s->avctx, AV_LOG_ERROR, "Duplicate POC in a sequence: %d.\n", ++ poc); ++ return AVERROR_INVALIDDATA; ++ } ++ } + ++ ref = alloc_frame(s); ++ if (!ref) ++ return AVERROR(ENOMEM); + -+static void hls_prediction_unit(const HEVCContext * const s, HEVCLocalContext * const lc, -+ const int x0, const int y0, -+ const int nPbW, const int nPbH, -+ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned 
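
/*
 * DPB life-cycle sketch for the refs code above: ff_hevc_rpi_set_new_ref()
 * tags a frame with output/short-ref roles, and ff_hevc_rpi_unref_frame()
 * only releases the buffers once every role has been dropped and the mask
 * reaches zero. Flag values below are illustrative, not the patch's real
 * HEVC_FRAME_FLAG_* constants:
 */
enum { SK_OUTPUT = 1, SK_SHORT_REF = 2, SK_LONG_REF = 4, SK_BUMPING = 8 };

static int sk_drop_role(unsigned *flags, unsigned role)
{
    *flags &= ~role;
    return *flags == 0;   /* true: the caller may release the frame buffers */
}
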
int idx) - { - #define POS(c_idx, x, y) \ - &s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \ - (((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)] -- HEVCLocalContext *lc = s->HEVClc; -+#ifdef RPI -+ HEVCRpiJob * const jb = lc->jb0; -+#endif ++ *frame = ref->frame; ++ s->ref = ref; + - int merge_idx = 0; - struct MvField current_mv = {{{ 0 }}}; - - int min_pu_width = s->ps.sps->min_pu_width; - -- MvField *tab_mvf = s->ref->tab_mvf; -- RefPicList *refPicList = s->ref->refPicList; -- HEVCFrame *ref0 = NULL, *ref1 = NULL; -+ MvField * const tab_mvf = s->ref->tab_mvf; -+ const RefPicList *const refPicList = s->ref->refPicList; -+ const HEVCFrame *ref0 = NULL, *ref1 = NULL; - uint8_t *dst0 = POS(0, x0, y0); - uint8_t *dst1 = POS(1, x0, y0); - uint8_t *dst2 = POS(2, x0, y0); -@@ -1771,22 +3348,21 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, - int y_cb = y0 >> log2_min_cb_size; - int x_pu, y_pu; - int i, j; -- -- int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb); -+ const int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb); - - if (!skip_flag) -- lc->pu.merge_flag = ff_hevc_merge_flag_decode(s); -+ lc->pu.merge_flag = ff_hevc_merge_flag_decode(lc); - - if (skip_flag || lc->pu.merge_flag) { - if (s->sh.max_num_merge_cand > 1) -- merge_idx = ff_hevc_merge_idx_decode(s); -+ merge_idx = ff_hevc_merge_idx_decode(s, lc); - else - merge_idx = 0; - -- ff_hevc_luma_mv_merge_mode(s, x0, y0, nPbW, nPbH, log2_cb_size, -+ ff_hevc_luma_mv_merge_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, - partIdx, merge_idx, ¤t_mv); - } else { -- hevc_luma_mv_mvp_mode(s, x0, y0, nPbW, nPbH, log2_cb_size, -+ hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, - partIdx, merge_idx, ¤t_mv); - } - -@@ -1801,13 +3377,13 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, - ref0 = refPicList[0].ref[current_mv.ref_idx[0]]; - if (!ref0) - return; -- hevc_await_progress(s, ref0, ¤t_mv.mv[0], y0, nPbH); -+ hevc_await_progress(s, lc, ref0, ¤t_mv.mv[0], y0, nPbH); - } - if (current_mv.pred_flag & PF_L1) { - ref1 = refPicList[1].ref[current_mv.ref_idx[1]]; - if (!ref1) - return; -- hevc_await_progress(s, ref1, ¤t_mv.mv[1], y0, nPbH); -+ hevc_await_progress(s, lc, ref1, ¤t_mv.mv[1], y0, nPbH); - } - - if (current_mv.pred_flag == PF_L0) { -@@ -1816,16 +3392,33 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, - int nPbW_c = nPbW >> s->ps.sps->hshift[1]; - int nPbH_c = nPbH >> s->ps.sps->vshift[1]; - -- luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame, -+#if RPI_INTER -+ if (s->enable_rpi) { -+ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.mv + 0, -+ s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]], -+ ref0->frame); -+ } else -+#endif -+ { -+ luma_mc_uni(s, lc, dst0, s->frame->linesize[0], ref0->frame, - ¤t_mv.mv[0], x0, y0, nPbW, nPbH, - s->sh.luma_weight_l0[current_mv.ref_idx[0]], - s->sh.luma_offset_l0[current_mv.ref_idx[0]]); ++ if (s->sh.pic_output_flag) ++ ref->flags = HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_SHORT_REF; ++ else ++ ref->flags = HEVC_FRAME_FLAG_SHORT_REF; ++ ++ ref->poc = poc; ++ ref->sequence = s->seq_decode; ++ ref->frame->crop_left = s->ps.sps->output_window.left_offset; ++ ref->frame->crop_right = s->ps.sps->output_window.right_offset; ++ ref->frame->crop_top = s->ps.sps->output_window.top_offset; ++ ref->frame->crop_bottom = s->ps.sps->output_window.bottom_offset; ++ ++ return 0; ++} ++ ++int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame 
*out, int flush) ++{ ++ do { ++ int nb_output = 0; ++ int min_poc = INT_MAX; ++ int i, min_idx, ret; ++ ++ if (s->sh.no_output_of_prior_pics_flag == 1 && s->no_rasl_output_flag == 1) { ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCFrame *frame = &s->DPB[i]; ++ if (!(frame->flags & HEVC_FRAME_FLAG_BUMPING) && frame->poc != s->poc && ++ frame->sequence == s->seq_output) { ++ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT); ++ } ++ } + } - - if (s->ps.sps->chroma_format_idc) { -- chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1], -+#if RPI_INTER -+ if (s->enable_rpi) { -+ rpi_pred_c(s, jb, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0, -+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]], -+ ref0->frame); -+ return; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCFrame *frame = &s->DPB[i]; ++ if ((frame->flags & HEVC_FRAME_FLAG_OUTPUT) && ++ frame->sequence == s->seq_output) { ++ nb_output++; ++ if (frame->poc < min_poc || nb_output == 1) { ++ min_poc = frame->poc; ++ min_idx = i; ++ } + } -+#endif -+ chroma_mc_uni(s, lc, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1], - 0, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, - s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]); -- chroma_mc_uni(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2], -+ chroma_mc_uni(s, lc, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2], - 0, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, - s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1]); - } -@@ -1835,17 +3428,34 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, - int nPbW_c = nPbW >> s->ps.sps->hshift[1]; - int nPbH_c = nPbH >> s->ps.sps->vshift[1]; - -- luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame, -+#if RPI_INTER -+ if (s->enable_rpi) { -+ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.mv + 1, -+ s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]], -+ ref1->frame); -+ } else -+#endif -+ { -+ luma_mc_uni(s, lc, dst0, s->frame->linesize[0], ref1->frame, - ¤t_mv.mv[1], x0, y0, nPbW, nPbH, - s->sh.luma_weight_l1[current_mv.ref_idx[1]], - s->sh.luma_offset_l1[current_mv.ref_idx[1]]); + } - - if (s->ps.sps->chroma_format_idc) { -- chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1], -+#if RPI_INTER -+ if (s->enable_rpi) { -+ rpi_pred_c(s, jb, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1, -+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]], -+ ref1->frame); -+ return; ++ ++ /* wait for more frames before output */ ++ if (!flush && s->seq_output == s->seq_decode && s->ps.sps && ++ nb_output <= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].num_reorder_pics) ++ return 0; ++ ++ if (nb_output) { ++ HEVCFrame *frame = &s->DPB[min_idx]; ++ if (frame->frame->format == AV_PIX_FMT_VIDEOTOOLBOX && frame->frame->buf[0]->size == 1) ++ return 0; ++ ++ ret = av_frame_ref(out, frame->frame); ++ if (frame->flags & HEVC_FRAME_FLAG_BUMPING) ++ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_BUMPING); ++ else ++ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT); ++ if (ret < 0) ++ return ret; ++ av_log(s->avctx, AV_LOG_DEBUG, ++ "Output frame with POC %d.\n", frame->poc); ++ return 1; ++ } ++ ++ if (s->seq_output 
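
/*
 * Output-order sketch for ff_hevc_rpi_output_frame() above: among frames of
 * the current sequence still flagged for output, the smallest POC leaves the
 * DPB first, and (unless flushing) output stalls for reordering while the
 * pending count is within the SPS num_reorder_pics bound. Standalone
 * illustration of the selection step:
 */
static int sk_min_poc_index(const int *poc, const unsigned *flags,
                            unsigned output_flag, int n)
{
    int best = -1;
    for (int i = 0; i < n; i++)
        if ((flags[i] & output_flag) && (best < 0 || poc[i] < poc[best]))
            best = i;
    return best;          /* -1 when nothing is pending for output */
}
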
!= s->seq_decode) ++ s->seq_output = (s->seq_output + 1) & 0xff; ++ else ++ break; ++ } while (1); ++ ++ return 0; ++} ++ ++void ff_hevc_rpi_bump_frame(HEVCRpiContext *s) ++{ ++ int dpb = 0; ++ int min_poc = INT_MAX; ++ int i; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCFrame *frame = &s->DPB[i]; ++ if ((frame->flags) && ++ frame->sequence == s->seq_output && ++ frame->poc != s->poc) { ++ dpb++; ++ } ++ } ++ ++ if (s->ps.sps && dpb >= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering) { ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCFrame *frame = &s->DPB[i]; ++ if ((frame->flags) && ++ frame->sequence == s->seq_output && ++ frame->poc != s->poc) { ++ if (frame->flags == HEVC_FRAME_FLAG_OUTPUT && frame->poc < min_poc) { ++ min_poc = frame->poc; ++ } + } -+#endif -+ chroma_mc_uni(s, lc, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1], - 1, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, - s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]); - -- chroma_mc_uni(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2], -+ chroma_mc_uni(s, lc, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2], - 1, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, - s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1]); - } -@@ -1855,15 +3465,35 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, - int nPbW_c = nPbW >> s->ps.sps->hshift[1]; - int nPbH_c = nPbH >> s->ps.sps->vshift[1]; - -- luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame, -+#if RPI_INTER -+ if (s->enable_rpi) { -+ rpi_pred_y_b(s, jb, x0, y0, nPbW, nPbH, ¤t_mv, ref0->frame, ref1->frame); -+ } else -+#endif -+ { -+ luma_mc_bi(s, lc, dst0, s->frame->linesize[0], ref0->frame, - ¤t_mv.mv[0], x0, y0, nPbW, nPbH, - ref1->frame, ¤t_mv.mv[1], ¤t_mv); + } - - if (s->ps.sps->chroma_format_idc) { -- chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame, -+#if RPI_INTER -+ if (s->enable_rpi) { -+ rpi_pred_c_b(s, jb, x0_c, y0_c, nPbW_c, nPbH_c, -+ ¤t_mv, -+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], -+ s->sh.chroma_offset_l0[current_mv.ref_idx[0]], -+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], -+ s->sh.chroma_offset_l1[current_mv.ref_idx[1]], -+ ref0->frame, -+ ref1->frame); -+ return; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCFrame *frame = &s->DPB[i]; ++ if (frame->flags & HEVC_FRAME_FLAG_OUTPUT && ++ frame->sequence == s->seq_output && ++ frame->poc <= min_poc) { ++ frame->flags |= HEVC_FRAME_FLAG_BUMPING; + } -+#endif -+ chroma_mc_bi(s, lc, dst1, s->frame->linesize[1], ref0->frame, ref1->frame, - x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, 0); - -- chroma_mc_bi(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame, -+ chroma_mc_bi(s, lc, dst2, s->frame->linesize[2], ref0->frame, ref1->frame, - x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, 1); - } - } -@@ -1872,10 +3502,9 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, - /** - * 8.4.1 - */ --static int luma_intra_pred_mode(HEVCContext *s, int x0, int y0, int pu_size, -+static int luma_intra_pred_mode(const HEVCContext * const s, HEVCLocalContext * const lc, int x0, int y0, int pu_size, - int prev_intra_luma_pred_flag) - { -- HEVCLocalContext *lc = s->HEVClc; - int x_pu = x0 >> s->ps.sps->log2_min_pu_size; - int y_pu = y0 >> s->ps.sps->log2_min_pu_size; - int min_pu_width = s->ps.sps->min_pu_width; -@@ -1952,7 +3581,7 @@ static int 
luma_intra_pred_mode(HEVCContext *s, int x0, int y0, int pu_size, - return intra_pred_mode; - } - --static av_always_inline void set_ct_depth(HEVCContext *s, int x0, int y0, -+static av_always_inline void set_ct_depth(const HEVCContext * const s, int x0, int y0, - int log2_cb_size, int ct_depth) - { - int length = (1 << log2_cb_size) >> s->ps.sps->log2_min_cb_size; -@@ -1969,10 +3598,9 @@ static const uint8_t tab_mode_idx[] = { - 0, 1, 2, 2, 2, 2, 3, 5, 7, 8, 10, 12, 13, 15, 17, 18, 19, 20, - 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31}; - --static void intra_prediction_unit(HEVCContext *s, int x0, int y0, -- int log2_cb_size) -+static void intra_prediction_unit(const HEVCContext * const s, HEVCLocalContext * const lc, const int x0, const int y0, -+ const int log2_cb_size) - { -- HEVCLocalContext *lc = s->HEVClc; - static const uint8_t intra_chroma_table[4] = { 0, 26, 10, 1 }; - uint8_t prev_intra_luma_pred_flag[4]; - int split = lc->cu.part_mode == PART_NxN; -@@ -1983,17 +3611,17 @@ static void intra_prediction_unit(HEVCContext *s, int x0, int y0, - - for (i = 0; i < side; i++) - for (j = 0; j < side; j++) -- prev_intra_luma_pred_flag[2 * i + j] = ff_hevc_prev_intra_luma_pred_flag_decode(s); -+ prev_intra_luma_pred_flag[2 * i + j] = ff_hevc_prev_intra_luma_pred_flag_decode(lc); - - for (i = 0; i < side; i++) { - for (j = 0; j < side; j++) { - if (prev_intra_luma_pred_flag[2 * i + j]) -- lc->pu.mpm_idx = ff_hevc_mpm_idx_decode(s); -+ lc->pu.mpm_idx = ff_hevc_mpm_idx_decode(lc); - else -- lc->pu.rem_intra_luma_pred_mode = ff_hevc_rem_intra_luma_pred_mode_decode(s); -+ lc->pu.rem_intra_luma_pred_mode = ff_hevc_rem_intra_luma_pred_mode_decode(lc); - - lc->pu.intra_pred_mode[2 * i + j] = -- luma_intra_pred_mode(s, x0 + pb_size * j, y0 + pb_size * i, pb_size, -+ luma_intra_pred_mode(s, lc, x0 + pb_size * j, y0 + pb_size * i, pb_size, - prev_intra_luma_pred_flag[2 * i + j]); - } - } -@@ -2001,7 +3629,7 @@ static void intra_prediction_unit(HEVCContext *s, int x0, int y0, - if (s->ps.sps->chroma_format_idc == 3) { - for (i = 0; i < side; i++) { - for (j = 0; j < side; j++) { -- lc->pu.chroma_mode_c[2 * i + j] = chroma_mode = ff_hevc_intra_chroma_pred_mode_decode(s); -+ lc->pu.chroma_mode_c[2 * i + j] = chroma_mode = ff_hevc_intra_chroma_pred_mode_decode(lc); - if (chroma_mode != 4) { - if (lc->pu.intra_pred_mode[2 * i + j] == intra_chroma_table[chroma_mode]) - lc->pu.intra_pred_mode_c[2 * i + j] = 34; -@@ -2014,7 +3642,7 @@ static void intra_prediction_unit(HEVCContext *s, int x0, int y0, - } - } else if (s->ps.sps->chroma_format_idc == 2) { - int mode_idx; -- lc->pu.chroma_mode_c[0] = chroma_mode = ff_hevc_intra_chroma_pred_mode_decode(s); -+ lc->pu.chroma_mode_c[0] = chroma_mode = ff_hevc_intra_chroma_pred_mode_decode(lc); - if (chroma_mode != 4) { - if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode]) - mode_idx = 34; -@@ -2025,7 +3653,7 @@ static void intra_prediction_unit(HEVCContext *s, int x0, int y0, - } - lc->pu.intra_pred_mode_c[0] = tab_mode_idx[mode_idx]; - } else if (s->ps.sps->chroma_format_idc != 0) { -- chroma_mode = ff_hevc_intra_chroma_pred_mode_decode(s); -+ chroma_mode = ff_hevc_intra_chroma_pred_mode_decode(lc); - if (chroma_mode != 4) { - if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode]) - lc->pu.intra_pred_mode_c[0] = 34; -@@ -2037,11 +3665,10 @@ static void intra_prediction_unit(HEVCContext *s, int x0, int y0, - } - } - --static void intra_prediction_unit_default_value(HEVCContext *s, -+static void 
intra_prediction_unit_default_value(const HEVCContext * const s, HEVCLocalContext * const lc, - int x0, int y0, - int log2_cb_size) - { -- HEVCLocalContext *lc = s->HEVClc; - int pb_size = 1 << log2_cb_size; - int size_in_pus = pb_size >> s->ps.sps->log2_min_pu_size; - int min_pu_width = s->ps.sps->min_pu_width; -@@ -2060,10 +3687,9 @@ static void intra_prediction_unit_default_value(HEVCContext *s, - tab_mvf[(y_pu + j) * min_pu_width + x_pu + k].pred_flag = PF_INTRA; - } - --static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) -+static int hls_coding_unit(const HEVCContext * const s, HEVCLocalContext * const lc, int x0, int y0, int log2_cb_size) - { - int cb_size = 1 << log2_cb_size; -- HEVCLocalContext *lc = s->HEVClc; - int log2_min_cb_size = s->ps.sps->log2_min_cb_size; - int length = cb_size >> log2_min_cb_size; - int min_cb_width = s->ps.sps->min_cb_width; -@@ -2083,14 +3709,14 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) - for (x = 0; x < 4; x++) - lc->pu.intra_pred_mode[x] = 1; - if (s->ps.pps->transquant_bypass_enable_flag) { -- lc->cu.cu_transquant_bypass_flag = ff_hevc_cu_transquant_bypass_flag_decode(s); -+ lc->cu.cu_transquant_bypass_flag = ff_hevc_cu_transquant_bypass_flag_decode(lc); - if (lc->cu.cu_transquant_bypass_flag) - set_deblocking_bypass(s, x0, y0, log2_cb_size); - } else - lc->cu.cu_transquant_bypass_flag = 0; - - if (s->sh.slice_type != HEVC_SLICE_I) { -- uint8_t skip_flag = ff_hevc_skip_flag_decode(s, x0, y0, x_cb, y_cb); -+ uint8_t skip_flag = ff_hevc_skip_flag_decode(s, lc, x0, y0, x_cb, y_cb); - - x = y_cb * min_cb_width + x_cb; - for (y = 0; y < length; y++) { -@@ -2107,19 +3733,19 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) - } - - if (SAMPLE_CTB(s->skip_flag, x_cb, y_cb)) { -- hls_prediction_unit(s, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx); -- intra_prediction_unit_default_value(s, x0, y0, log2_cb_size); -+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx); -+ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); - - if (!s->sh.disable_deblocking_filter_flag) -- ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size); -+ ff_hevc_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size); - } else { - int pcm_flag = 0; - - if (s->sh.slice_type != HEVC_SLICE_I) -- lc->cu.pred_mode = ff_hevc_pred_mode_decode(s); -+ lc->cu.pred_mode = ff_hevc_pred_mode_decode(lc); - if (lc->cu.pred_mode != MODE_INTRA || - log2_cb_size == s->ps.sps->log2_min_cb_size) { -- lc->cu.part_mode = ff_hevc_part_mode_decode(s, log2_cb_size); -+ lc->cu.part_mode = ff_hevc_part_mode_decode(s, lc, log2_cb_size); - lc->cu.intra_split_flag = lc->cu.part_mode == PART_NxN && - lc->cu.pred_mode == MODE_INTRA; - } -@@ -2128,54 +3754,56 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) - if (lc->cu.part_mode == PART_2Nx2N && s->ps.sps->pcm_enabled_flag && - log2_cb_size >= s->ps.sps->pcm.log2_min_pcm_cb_size && - log2_cb_size <= s->ps.sps->pcm.log2_max_pcm_cb_size) { -- pcm_flag = ff_hevc_pcm_flag_decode(s); -+ pcm_flag = ff_hevc_pcm_flag_decode(lc); - } - if (pcm_flag) { -- intra_prediction_unit_default_value(s, x0, y0, log2_cb_size); -- ret = hls_pcm_sample(s, x0, y0, log2_cb_size); -+ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); -+ ret = hls_pcm_sample(s, lc, x0, y0, log2_cb_size); - if (s->ps.sps->pcm.loop_filter_disable_flag) -+ { - set_deblocking_bypass(s, x0, y0, log2_cb_size); -+ } - - if (ret < 
0) - return ret; - } else { -- intra_prediction_unit(s, x0, y0, log2_cb_size); -+ intra_prediction_unit(s, lc, x0, y0, log2_cb_size); - } - } else { -- intra_prediction_unit_default_value(s, x0, y0, log2_cb_size); -+ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); - switch (lc->cu.part_mode) { - case PART_2Nx2N: -- hls_prediction_unit(s, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx); -+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx); - break; - case PART_2NxN: -- hls_prediction_unit(s, x0, y0, cb_size, cb_size / 2, log2_cb_size, 0, idx); -- hls_prediction_unit(s, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1, idx); -+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 2, log2_cb_size, 0, idx); -+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1, idx); - break; - case PART_Nx2N: -- hls_prediction_unit(s, x0, y0, cb_size / 2, cb_size, log2_cb_size, 0, idx - 1); -- hls_prediction_unit(s, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1, idx - 1); -+ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size, log2_cb_size, 0, idx - 1); -+ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1, idx - 1); - break; - case PART_2NxnU: -- hls_prediction_unit(s, x0, y0, cb_size, cb_size / 4, log2_cb_size, 0, idx); -- hls_prediction_unit(s, x0, y0 + cb_size / 4, cb_size, cb_size * 3 / 4, log2_cb_size, 1, idx); -+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4, log2_cb_size, 0, idx); -+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4, cb_size, cb_size * 3 / 4, log2_cb_size, 1, idx); - break; - case PART_2NxnD: -- hls_prediction_unit(s, x0, y0, cb_size, cb_size * 3 / 4, log2_cb_size, 0, idx); -- hls_prediction_unit(s, x0, y0 + cb_size * 3 / 4, cb_size, cb_size / 4, log2_cb_size, 1, idx); -+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size * 3 / 4, log2_cb_size, 0, idx); -+ hls_prediction_unit(s, lc, x0, y0 + cb_size * 3 / 4, cb_size, cb_size / 4, log2_cb_size, 1, idx); - break; - case PART_nLx2N: -- hls_prediction_unit(s, x0, y0, cb_size / 4, cb_size, log2_cb_size, 0, idx - 2); -- hls_prediction_unit(s, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1, idx - 2); -+ hls_prediction_unit(s, lc, x0, y0, cb_size / 4, cb_size, log2_cb_size, 0, idx - 2); -+ hls_prediction_unit(s, lc, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1, idx - 2); - break; - case PART_nRx2N: -- hls_prediction_unit(s, x0, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 0, idx - 2); -- hls_prediction_unit(s, x0 + cb_size * 3 / 4, y0, cb_size / 4, cb_size, log2_cb_size, 1, idx - 2); -+ hls_prediction_unit(s, lc, x0, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 0, idx - 2); -+ hls_prediction_unit(s, lc, x0 + cb_size * 3 / 4, y0, cb_size / 4, cb_size, log2_cb_size, 1, idx - 2); - break; - case PART_NxN: -- hls_prediction_unit(s, x0, y0, cb_size / 2, cb_size / 2, log2_cb_size, 0, idx - 1); -- hls_prediction_unit(s, x0 + cb_size / 2, y0, cb_size / 2, cb_size / 2, log2_cb_size, 1, idx - 1); -- hls_prediction_unit(s, x0, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2, idx - 1); -- hls_prediction_unit(s, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3, idx - 1); -+ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size / 2, log2_cb_size, 0, idx - 1); -+ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size / 2, log2_cb_size, 1, idx - 1); -+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size / 2, 
cb_size / 2, log2_cb_size, 2, idx - 1); -+ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3, idx - 1); - break; - } - } -@@ -2185,27 +3813,27 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) - - if (lc->cu.pred_mode != MODE_INTRA && - !(lc->cu.part_mode == PART_2Nx2N && lc->pu.merge_flag)) { -- rqt_root_cbf = ff_hevc_no_residual_syntax_flag_decode(s); -+ rqt_root_cbf = ff_hevc_no_residual_syntax_flag_decode(lc); - } - if (rqt_root_cbf) { - const static int cbf[2] = { 0 }; - lc->cu.max_trafo_depth = lc->cu.pred_mode == MODE_INTRA ? - s->ps.sps->max_transform_hierarchy_depth_intra + lc->cu.intra_split_flag : - s->ps.sps->max_transform_hierarchy_depth_inter; -- ret = hls_transform_tree(s, x0, y0, x0, y0, x0, y0, -+ ret = hls_transform_tree(s, lc, x0, y0, x0, y0, x0, y0, - log2_cb_size, - log2_cb_size, 0, 0, cbf, cbf); - if (ret < 0) - return ret; - } else { - if (!s->sh.disable_deblocking_filter_flag) -- ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size); -+ ff_hevc_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size); - } - } - } - - if (s->ps.pps->cu_qp_delta_enabled_flag && lc->tu.is_cu_qp_delta_coded == 0) -- ff_hevc_set_qPy(s, x0, y0, log2_cb_size); -+ ff_hevc_set_qPy(s, lc, x0, y0, log2_cb_size); - - x = y_cb * min_cb_width + x_cb; - for (y = 0; y < length; y++) { -@@ -2218,217 +3846,1445 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) - lc->qPy_pred = lc->qp_y; - } - -- set_ct_depth(s, x0, y0, log2_cb_size, lc->ct_depth); -+ set_ct_depth(s, x0, y0, log2_cb_size, lc->ct_depth); ++ } + -+ return 0; ++ dpb--; ++ } +} + -+// Returns: -+// < 0 Error -+// 0 More data wanted -+// 1 EoSlice / EoPicture -+static int hls_coding_quadtree(const HEVCContext * const s, HEVCLocalContext * const lc, const int x0, const int y0, -+ const int log2_cb_size, const int cb_depth) ++static int init_slice_rpl(HEVCRpiContext *s) +{ -+ const int cb_size = 1 << log2_cb_size; -+ int ret; -+ int split_cu; ++ HEVCFrame *frame = s->ref; ++ int ctb_count = frame->ctb_count; ++ int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; ++ int i; + -+ lc->ct_depth = cb_depth; -+ if (x0 + cb_size <= s->ps.sps->width && -+ y0 + cb_size <= s->ps.sps->height && -+ log2_cb_size > s->ps.sps->log2_min_cb_size) { -+ split_cu = ff_hevc_split_coding_unit_flag_decode(s, lc, cb_depth, x0, y0); -+ } else { -+ split_cu = (log2_cb_size > s->ps.sps->log2_min_cb_size); -+ } -+ if (s->ps.pps->cu_qp_delta_enabled_flag && -+ log2_cb_size >= s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_qp_delta_depth) { -+ lc->tu.is_cu_qp_delta_coded = 0; -+ lc->tu.cu_qp_delta = 0; -+ } ++ if (s->slice_idx >= frame->rpl_buf->size / sizeof(RefPicListTab)) ++ return AVERROR_INVALIDDATA; + -+ lc->tu.is_cu_chroma_qp_offset_coded = !(s->sh.cu_chroma_qp_offset_enabled_flag && -+ log2_cb_size >= s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_chroma_qp_offset_depth); -+ lc->tu.cu_qp_offset_cb = 0; -+ lc->tu.cu_qp_offset_cr = 0; ++ for (i = ctb_addr_ts; i < ctb_count; i++) ++ frame->rpl_tab[i] = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx; + -+ if (split_cu) { -+ int qp_block_mask = (1<<(s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_qp_delta_depth)) - 1; -+ const int cb_size_split = cb_size >> 1; -+ const int x1 = x0 + cb_size_split; -+ const int y1 = y0 + cb_size_split; ++ frame->refPicList = (RefPicList *)frame->rpl_tab[ctb_addr_ts]; + -+ int more_data = 0; ++ return 0; ++} + -+ more_data = 
hls_coding_quadtree(s, lc, x0, y0, log2_cb_size - 1, cb_depth + 1);
-+ if (more_data < 0)
-+ return more_data;
++int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s)
++{
++ SliceHeader *sh = &s->sh;
+
-+ if (more_data && x1 < s->ps.sps->width) {
-+ more_data = hls_coding_quadtree(s, lc, x1, y0, log2_cb_size - 1, cb_depth + 1);
-+ if (more_data < 0)
-+ return more_data;
-+ }
-+ if (more_data && y1 < s->ps.sps->height) {
-+ more_data = hls_coding_quadtree(s, lc, x0, y1, log2_cb_size - 1, cb_depth + 1);
-+ if (more_data < 0)
-+ return more_data;
-+ }
-+ if (more_data && x1 < s->ps.sps->width &&
-+ y1 < s->ps.sps->height) {
-+ more_data = hls_coding_quadtree(s, lc, x1, y1, log2_cb_size - 1, cb_depth + 1);
-+ if (more_data < 0)
-+ return more_data;
++ uint8_t nb_list = sh->slice_type == HEVC_SLICE_B ? 2 : 1;
++ uint8_t list_idx;
++ int i, j, ret;
++
++ ret = init_slice_rpl(s);
++ if (ret < 0)
++ return ret;
++
++ if (!(s->rps[ST_CURR_BEF].nb_refs + s->rps[ST_CURR_AFT].nb_refs +
++ s->rps[LT_CURR].nb_refs)) {
++ av_log(s->avctx, AV_LOG_ERROR, "Zero refs in the frame RPS.\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ for (list_idx = 0; list_idx < nb_list; list_idx++) {
++ RefPicList rpl_tmp = { { 0 } };
++ RefPicList *rpl = &s->ref->refPicList[list_idx];
++
++ /* The order of the elements is
++ * ST_CURR_BEF - ST_CURR_AFT - LT_CURR for the L0 and
++ * ST_CURR_AFT - ST_CURR_BEF - LT_CURR for the L1 */
++ int cand_lists[3] = { list_idx ? ST_CURR_AFT : ST_CURR_BEF,
++ list_idx ? ST_CURR_BEF : ST_CURR_AFT,
++ LT_CURR };
++
++ /* concatenate the candidate lists for the current frame */
++ while (rpl_tmp.nb_refs < sh->nb_refs[list_idx]) {
++ for (i = 0; i < FF_ARRAY_ELEMS(cand_lists); i++) {
++ RefPicList *rps = &s->rps[cand_lists[i]];
++ for (j = 0; j < rps->nb_refs && rpl_tmp.nb_refs < HEVC_MAX_REFS; j++) {
++ rpl_tmp.list[rpl_tmp.nb_refs] = rps->list[j];
++ rpl_tmp.ref[rpl_tmp.nb_refs] = rps->ref[j];
++ rpl_tmp.isLongTerm[rpl_tmp.nb_refs] = i == 2;
++ rpl_tmp.nb_refs++;
++ }
++ }
+ }
+
-+ if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 &&
-+ ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0)
-+ lc->qPy_pred = lc->qp_y;
++ /* reorder the references if necessary */
++ if (sh->rpl_modification_flag[list_idx]) {
++ for (i = 0; i < sh->nb_refs[list_idx]; i++) {
++ int idx = sh->list_entry_lx[list_idx][i];
+
-+ if (more_data)
-+ return ((x1 + cb_size_split) < s->ps.sps->width ||
-+ (y1 + cb_size_split) < s->ps.sps->height);
-+ else
-+ return 0;
-+ } else {
-+ ret = hls_coding_unit(s, lc, x0, y0, log2_cb_size);
-+ if (ret < 0)
-+ return ret;
-+ if ((!((x0 + cb_size) %
-+ (1 << (s->ps.sps->log2_ctb_size))) ||
-+ (x0 + cb_size >= s->ps.sps->width)) &&
-+ (!((y0 + cb_size) %
-+ (1 << (s->ps.sps->log2_ctb_size))) ||
-+ (y0 + cb_size >= s->ps.sps->height))) {
-+ int end_of_slice_flag = ff_hevc_end_of_slice_flag_decode(lc);
-+ return !end_of_slice_flag;
++ if (idx >= rpl_tmp.nb_refs) {
++ av_log(s->avctx, AV_LOG_ERROR, "Invalid reference index.\n");
++ return AVERROR_INVALIDDATA;
++ }
++
++ rpl->list[i] = rpl_tmp.list[idx];
++ rpl->ref[i] = rpl_tmp.ref[idx];
++ rpl->isLongTerm[i] = rpl_tmp.isLongTerm[idx];
++ rpl->nb_refs++;
++ }
+ } else {
-+ return 1;
++ memcpy(rpl, &rpl_tmp, sizeof(*rpl));
++ rpl->nb_refs = FFMIN(rpl->nb_refs, sh->nb_refs[list_idx]);
+ }
++
++ if (sh->collocated_list == list_idx &&
++ sh->collocated_ref_idx < rpl->nb_refs)
++ s->ref->collocated_ref = rpl->ref[sh->collocated_ref_idx];
++ }
+
-+ return 0; // NEVER
+}
+
-+static void hls_decode_neighbour(const HEVCContext * const s, HEVCLocalContext * const lc,
-+ const int x_ctb, const int y_ctb, const int ctb_addr_ts)
++ return 0;
++}
+
++static 
HEVCFrame *find_ref_idx(HEVCRpiContext *s, int poc) +{ -+ const int ctb_size = 1 << s->ps.sps->log2_ctb_size; -+ const int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -+ const int ctb_addr_in_slice = ctb_addr_rs - s->sh.slice_addr; // slice_addr = RS addr of start of slice -+ const int idxX = s->ps.pps->col_idxX[x_ctb >> s->ps.sps->log2_ctb_size]; -+ -+ s->tab_slice_address[ctb_addr_rs] = s->sh.slice_addr; ++ int i; ++ int LtMask = (1 << s->ps.sps->log2_max_poc_lsb) - 1; + -+ lc->end_of_tiles_x = idxX + 1 >= s->ps.pps->num_tile_columns ? s->ps.sps->width : -+ (s->ps.pps->col_bd[idxX + 1] << s->ps.sps->log2_ctb_size); ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCFrame *ref = &s->DPB[i]; ++ if (ref->frame->buf[0] && (ref->sequence == s->seq_decode)) { ++ if ((ref->poc & LtMask) == poc) ++ return ref; ++ } ++ } + -+ if (ctb_addr_ts == 0 || s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[ctb_addr_ts - 1] || -+ (s->ps.pps->entropy_coding_sync_enabled_flag && (x_ctb >> s->ps.sps->log2_ctb_size) == s->ps.pps->col_bd[idxX])) -+ { -+// lc->first_qp_group = 1; -+ lc->qPy_pred = s->sh.slice_qp; ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCFrame *ref = &s->DPB[i]; ++ if (ref->frame->buf[0] && ref->sequence == s->seq_decode) { ++ if (ref->poc == poc || (ref->poc & LtMask) == poc) ++ return ref; ++ } + } + -+ lc->end_of_tiles_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height); ++ if (s->nal_unit_type != HEVC_NAL_CRA_NUT && !IS_BLA(s)) ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Could not find ref with POC %d\n", poc); ++ return NULL; ++} + -+ lc->boundary_flags = 0; ++static void mark_ref(HEVCFrame *frame, int flag) ++{ ++ frame->flags &= ~(HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF); ++ frame->flags |= flag; ++} + -+ if (x_ctb <= 0 || s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - 1]]) -+ lc->boundary_flags |= BOUNDARY_LEFT_TILE; -+ if (x_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - 1]) -+ lc->boundary_flags |= BOUNDARY_LEFT_SLICE; -+ if (y_ctb <= 0 || s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]]) -+ lc->boundary_flags |= BOUNDARY_UPPER_TILE; -+ if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - s->ps.sps->ctb_width]) -+ lc->boundary_flags |= BOUNDARY_UPPER_SLICE; ++static HEVCFrame *generate_missing_ref(HEVCRpiContext *s, int poc) ++{ ++ HEVCFrame *frame; ++ int i, x, y; + -+ lc->ctb_left_flag = (lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0; -+ lc->ctb_up_flag = (lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0; -+ lc->ctb_up_left_flag = (lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 && -+ (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width); ++ frame = alloc_frame(s); ++ if (!frame) ++ return NULL; + -+ lc->ctb_up_right_flag = ((y_ctb > 0) && (x_ctb + ctb_size) < lc->end_of_tiles_x && -+ (ctb_addr_in_slice+1 >= s->ps.sps->ctb_width) && -+ (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1 - s->ps.sps->ctb_width]])); -+} ++ if (!s->ps.sps->pixel_shift) { ++ for (i = 0; frame->frame->buf[i]; i++) ++ memset(frame->frame->buf[i]->data, 1 << (s->ps.sps->bit_depth - 1), ++ frame->frame->buf[i]->size); ++ } else { ++ for (i = 0; frame->frame->data[i]; i++) ++ for (y = 0; y < (s->ps.sps->height >> s->ps.sps->vshift[i]); y++) ++ for (x = 0; x < 
(s->ps.sps->width >> s->ps.sps->hshift[i]); x++) { ++ AV_WN16(frame->frame->data[i] + y * frame_stride1(frame->frame, 1) + 2 * x, ++ 1 << (s->ps.sps->bit_depth - 1)); ++ } ++ } + -+#ifdef RPI ++ frame->poc = poc; ++ frame->sequence = s->seq_decode; ++ frame->flags = 0; + -+#if 0 -+static inline void ts_to_xy(const HEVCContext * const s, const unsigned int ctb_ts, unsigned int * const px, unsigned int * const py) -+{ -+ const unsigned int ctb_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_ts]; -+ const unsigned int ctb_width = s->ps.sps->ctb_width; -+ *px = (ctb_rs % ctb_width) << s->ps.sps->log2_ctb_size; -+ *py = (ctb_rs / ctb_width) << s->ps.sps->log2_ctb_size; ++ ff_hevc_rpi_progress_set_all_done(frame); ++ ++ return frame; +} -+#endif + -+static void rpi_execute_dblk_cmds(HEVCContext * const s, HEVCRpiJob * const jb) ++/* add a reference with the given poc to the list and mark it as used in DPB */ ++static int add_candidate_ref(HEVCRpiContext *s, RefPicList *list, ++ int poc, int ref_flag) +{ -+ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; -+ const unsigned int x0 = FFMAX(jb->bounds.x, ctb_size) - ctb_size; -+ const unsigned int y0 = FFMAX(jb->bounds.y, ctb_size) - ctb_size; -+ const unsigned int bound_r = jb->bounds.x + jb->bounds.w; -+ const unsigned int bound_b = jb->bounds.y + jb->bounds.h; -+ const int x_end = (bound_r >= s->ps.sps->width); -+ const int y_end = (bound_b >= s->ps.sps->height); -+ const unsigned int xr = bound_r - (x_end ? 0 : ctb_size); -+ const unsigned int yb = bound_b - (y_end ? 0 : ctb_size); -+ unsigned int x, y; ++ HEVCFrame *ref = find_ref_idx(s, poc); + -+ for (y = y0; y < yb; y += ctb_size ) { -+ for (x = x0; x < xr; x += ctb_size ) { -+ ff_hevc_hls_filter(s, x, y, ctb_size); -+ } ++ if (ref == s->ref || list->nb_refs >= HEVC_MAX_REFS) ++ return AVERROR_INVALIDDATA; ++ ++ if (!ref) { ++ ref = generate_missing_ref(s, poc); ++ if (!ref) ++ return AVERROR(ENOMEM); + } + -+ // Flush (SAO) -+ if (y > y0) { -+ const int tile_end = y_end || -+ s->ps.pps->tile_id[jb->ctu_ts_last] != s->ps.pps->tile_id[jb->ctu_ts_last + 1]; -+ const unsigned int xl = x0 > ctb_size ? x0 - ctb_size : 0; -+ const unsigned int yt = y0 > ctb_size ? y0 - ctb_size : 0; -+ const unsigned int yb = tile_end ? bound_b : y - ctb_size; ++ list->list[list->nb_refs] = ref->poc; ++ list->ref[list->nb_refs] = ref; ++ list->nb_refs++; + -+ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); -+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ xl, yt, bound_r - xl, yb - yt, -+ s->ps.sps->vshift[1], 1, 1); -+ rpi_cache_flush_finish(rfe); -+ } ++ mark_ref(ref, ref_flag); ++ return 0; ++} + -+ // Signal -+ if (s->threads_type == FF_THREAD_FRAME && x_end && y0 > 0) { -+ ff_hevc_progress_signal_recon(s, y_end ? INT_MAX : y0 - 1); ++int ff_hevc_rpi_frame_rps(HEVCRpiContext *s) ++{ ++ const ShortTermRPS *short_rps = s->sh.short_term_rps; ++ const LongTermRPS *long_rps = &s->sh.long_term_rps; ++ RefPicList *rps = s->rps; ++ int i, ret = 0; ++ ++ if (!short_rps) { ++ rps[0].nb_refs = rps[1].nb_refs = 0; ++ return 0; + } + -+ // Job done now -+ // ? 
Move outside this fn -+ job_free(s->jbc, jb); -+} ++ /* clear the reference flags on all frames except the current one */ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCFrame *frame = &s->DPB[i]; + -+#if 0 -+static void rpi_execute_transform(HEVCContext *s) -+{ -+ int i=2; -+ int job = s->pass1_job; -+ /*int j; -+ int16_t *coeffs = s->coeffs_buf_arm[job][i]; -+ for(j=s->num_coeffs[job][i]; j > 0; j-= 16*16, coeffs+=16*16) { -+ s->hevcdsp.idct[4-2](coeffs, 16); -+ } -+ i=3; -+ coeffs = s->coeffs_buf_arm[job][i] - s->num_coeffs[job][i]; -+ for(j=s->num_coeffs[job][i]; j > 0; j-= 32*32, coeffs+=32*32) { -+ s->hevcdsp.idct[5-2](coeffs, 32); -+ }*/ -+ -+ rpi_cache_flush_one_gm_ptr(&s->coeffs_buf_accelerated[job], RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); -+ s->vpu_id = vpu_post_code2( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], -+ s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], -+ s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]); -+ //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0); -+ //gpu_cache_flush(&s->coeffs_buf_accelerated); -+ //vpu_wait(s->vpu_id); -+ -+ for(i=0;i<4;i++) -+ s->num_coeffs[job][i] = 0; -+} -+#endif ++ if (frame == s->ref) ++ continue; + ++ mark_ref(frame, 0); ++ } + -+#define RPI_OPT_SEP_PRED 0 ++ for (i = 0; i < NB_RPS_TYPE; i++) ++ rps[i].nb_refs = 0; + ++ /* add the short refs */ ++ for (i = 0; i < short_rps->num_delta_pocs; i++) { ++ int poc = s->poc + short_rps->delta_poc[i]; ++ int list; + -+// I-pred, transform_and_add for all blocks types done here -+// All ARM -+#if RPI_OPT_SEP_PRED -+static void rpi_execute_pred_cmds(const HEVCContext *const s, HEVCRpiJob * const jb, const int do_luma, const int do_chroma) -+#else -+static void rpi_execute_pred_cmds(HEVCContext * const s, HEVCRpiJob * const jb) -+#endif -+{ -+ unsigned int i; -+ HEVCRpiIntraPredEnv * const iap = &jb->intra; -+ const HEVCPredCmd *cmd = iap->cmds; -+ -+ for(i = iap->n; i > 0; i--, cmd++) { -+// printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job); -+#if RPI_OPT_SEP_PRED -+ if (!(cmd->c_idx == 0 ? 
do_luma : do_chroma)) { -+ continue; -+ } -+#endif ++ if (!short_rps->used[i]) ++ list = ST_FOLL; ++ else if (i < short_rps->num_negative_pics) ++ list = ST_CURR_BEF; ++ else ++ list = ST_CURR_AFT; + -+ switch (cmd->type) -+ { -+ case RPI_PRED_INTRA: -+ { -+ HEVCLocalContextIntra lci; // Abbreviated local context -+ HEVCLocalContext * const lc = (HEVCLocalContext *)&lci; -+ lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->i_pred.mode; -+ lc->na.cand_bottom_left = (cmd->na >> 4) & 1; -+ lc->na.cand_left = (cmd->na >> 3) & 1; -+ lc->na.cand_up_left = (cmd->na >> 2) & 1; -+ lc->na.cand_up = (cmd->na >> 1) & 1; -+ lc->na.cand_up_right = (cmd->na >> 0) & 1; -+ if (!av_rpi_is_sand_frame(s->frame) || cmd->c_idx == 0) -+ s->hpc.intra_pred[cmd->size - 2](s, lc, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); -+ else -+ s->hpc.intra_pred_c[cmd->size - 2](s, lc, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); -+ break; -+ } -+ -+ case RPI_PRED_ADD_RESIDUAL: -+ s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); -+ break; -+ case RPI_PRED_ADD_DC: -+ s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); -+ break; -+#if RPI_HEVC_SAND -+ case RPI_PRED_ADD_RESIDUAL_U: -+ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); -+ break; -+ case RPI_PRED_ADD_RESIDUAL_V: -+ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); -+ break; -+ case RPI_PRED_ADD_RESIDUAL_C: -+ s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); -+ break; -+ case RPI_PRED_ADD_DC_U: -+ case RPI_PRED_ADD_DC_V: -+ s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); -+ break; -+#endif ++ ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_SHORT_REF); ++ if (ret < 0) ++ goto fail; ++ } + -+ case RPI_PRED_I_PCM: -+ pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size); -+ break; -+ -+ default: -+ av_log(NULL, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type); -+ abort(); -+ } -+ } -+#if RPI_OPT_SEP_PRED -+ if (do_luma) -+#endif -+ { -+ iap->n = 0; -+ } -+} ++ /* add the long refs */ ++ for (i = 0; i < long_rps->nb_refs; i++) { ++ int poc = long_rps->poc[i]; ++ int list = long_rps->used[i] ? 
LT_CURR : LT_FOLL; + ++ ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_LONG_REF); ++ if (ret < 0) ++ goto fail; ++ } + -+#endif ++fail: ++ /* release any frames that are now unused */ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) ++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], 0); + -+#ifdef RPI ++ return ret; ++} + -+// Set initial uniform job values & zero ctu_count -+static void rpi_begin(const HEVCContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first) ++int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s) +{ -+#if RPI_INTER -+ unsigned int i; -+ HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip; -+ HEVCRpiInterPredEnv *const yipe = &jb->luma_ip; -+ const HEVCSPS * const sps = s->ps.sps; -+ -+ const uint16_t pic_width_y = sps->width; -+ const uint16_t pic_height_y = sps->height; -+ -+ const uint16_t pic_width_c = sps->width >> sps->hshift[1]; -+ const uint16_t pic_height_c = sps->height >> sps->vshift[1]; -+ -+ // We expect the pointer to change if we use another sps -+ if (sps != jb->sps) -+ { -+ worker_pic_free_one(jb); ++ int ret = 0; ++ int i; ++ const ShortTermRPS *rps = s->sh.short_term_rps; ++ LongTermRPS *long_rps = &s->sh.long_term_rps; ++ ++ if (rps) { ++ for (i = 0; i < rps->num_negative_pics; i++) ++ ret += !!rps->used[i]; ++ for (; i < rps->num_delta_pocs; i++) ++ ret += !!rps->used[i]; ++ } + -+ set_ipe_from_ici(cipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].chroma); -+ set_ipe_from_ici(yipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].luma); ++ if (long_rps) { ++ for (i = 0; i < long_rps->nb_refs; i++) ++ ret += !!long_rps->used[i]; ++ } ++ return ret; ++} +diff --git a/libavcodec/rpi_hevc_sei.c b/libavcodec/rpi_hevc_sei.c +new file mode 100644 +index 0000000000..c98b0804ed +--- /dev/null ++++ b/libavcodec/rpi_hevc_sei.c +@@ -0,0 +1,364 @@ ++/* ++ * HEVC Supplementary Enhancement Information messages ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * Copyright (C) 2013 Vittorio Giovara ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + -+ { -+ const int coefs_per_luma = HEVC_MAX_CTB_SIZE * RPI_MAX_WIDTH; -+ const int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1]; -+ worker_pic_alloc_one(jb, coefs_per_luma + coefs_per_chroma); ++#include "golomb.h" ++#include "rpi_hevc_ps.h" ++#include "rpi_hevc_sei.h" ++ ++static int decode_nal_sei_decoded_picture_hash(HEVCSEIPictureHash *s, GetBitContext *gb) ++{ ++ int cIdx, i; ++ uint8_t hash_type; ++ //uint16_t picture_crc; ++ //uint32_t picture_checksum; ++ hash_type = get_bits(gb, 8); ++ ++ for (cIdx = 0; cIdx < 3/*((s->sps->chroma_format_idc == 0) ? 
1 : 3)*/; cIdx++) { ++ if (hash_type == 0) { ++ s->is_md5 = 1; ++ for (i = 0; i < 16; i++) ++ s->md5[cIdx][i] = get_bits(gb, 8); ++ } else if (hash_type == 1) { ++ // picture_crc = get_bits(gb, 16); ++ skip_bits(gb, 16); ++ } else if (hash_type == 2) { ++ // picture_checksum = get_bits_long(gb, 32); ++ skip_bits(gb, 32); + } ++ } ++ return 0; ++} + -+ jb->sps = sps; ++static int decode_nal_sei_mastering_display_info(HEVCSEIMasteringDisplay *s, GetBitContext *gb) ++{ ++ int i; ++ // Mastering primaries ++ for (i = 0; i < 3; i++) { ++ s->display_primaries[i][0] = get_bits(gb, 16); ++ s->display_primaries[i][1] = get_bits(gb, 16); + } ++ // White point (x, y) ++ s->white_point[0] = get_bits(gb, 16); ++ s->white_point[1] = get_bits(gb, 16); ++ ++ // Max and min luminance of mastering display ++ s->max_luminance = get_bits_long(gb, 32); ++ s->min_luminance = get_bits_long(gb, 32); ++ ++ // As this SEI message comes before the first frame that references it, ++ // initialize the flag to 2 and decrement on IRAP access unit so it ++ // persists for the coded video sequence (e.g., between two IRAPs) ++ s->present = 2; ++ return 0; ++} + -+ jb->waited = 0; -+ jb->ctu_ts_first = ctu_ts_first; -+ jb->ctu_ts_last = -1; ++static int decode_nal_sei_content_light_info(HEVCSEIContentLight *s, GetBitContext *gb) ++{ ++ // Max and average light levels ++ s->max_content_light_level = get_bits_long(gb, 16); ++ s->max_pic_average_light_level = get_bits_long(gb, 16); ++ // As this SEI message comes before the first frame that references it, ++ // initialize the flag to 2 and decrement on IRAP access unit so it ++ // persists for the coded video sequence (e.g., between two IRAPs) ++ s->present = 2; ++ return 0; ++} + -+ rpi_inter_pred_reset(cipe); -+ for (i = 0; i < cipe->n; i++) { -+ HEVCRpiInterPredQ * const cp = cipe->q + i; -+ qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s; ++static int decode_nal_sei_frame_packing_arrangement(HEVCSEIFramePacking *s, GetBitContext *gb) ++{ ++ get_ue_golomb_long(gb); // frame_packing_arrangement_id ++ s->present = !get_bits1(gb); + -+ u->next_src1.x = 0; -+ u->next_src1.y = 0; -+ u->next_src1.base = 0; -+ u->pic_cw = pic_width_c; -+ u->pic_ch = pic_height_c; -+ u->stride2 = av_rpi_sand_frame_stride2(s->frame); -+ u->stride1 = av_rpi_sand_frame_stride1(s->frame); -+ u->wdenom = s->sh.chroma_log2_weight_denom; -+ cp->last_l0 = &u->next_src1; ++ if (s->present) { ++ s->arrangement_type = get_bits(gb, 7); ++ s->quincunx_subsampling = get_bits1(gb); ++ s->content_interpretation_type = get_bits(gb, 6); + -+ u->next_fn = 0; -+ u->next_src2.x = 0; -+ u->next_src2.y = 0; -+ u->next_src2.base = 0; -+ cp->last_l1 = &u->next_src2; ++ // the following skips spatial_flipping_flag frame0_flipped_flag ++ // field_views_flag current_frame_is_frame0_flag ++ // frame0_self_contained_flag frame1_self_contained_flag ++ skip_bits(gb, 6); + -+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); ++ if (!s->quincunx_subsampling && s->arrangement_type != 5) ++ skip_bits(gb, 16); // frame[01]_grid_position_[xy] ++ skip_bits(gb, 8); // frame_packing_arrangement_reserved_byte ++ skip_bits1(gb); // frame_packing_arrangement_persistence_flag + } ++ skip_bits1(gb); // upsampled_aspect_ratio_flag ++ return 0; ++} + -+ rpi_inter_pred_reset(yipe); -+ for (i = 0; i < yipe->n; i++) { -+ HEVCRpiInterPredQ * const yp = yipe->q + i; -+ qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s; ++static int decode_nal_sei_display_orientation(HEVCSEIDisplayOrientation *s, GetBitContext *gb) ++{ ++ s->present = 
!get_bits1(gb); + -+ y->next_src1.x = 0; -+ y->next_src1.y = 0; -+ y->next_src1.base = 0; -+ y->next_src2.x = 0; -+ y->next_src2.y = 0; -+ y->next_src2.base = 0; -+ y->pic_h = pic_height_y; -+ y->pic_w = pic_width_y; -+ y->stride2 = av_rpi_sand_frame_stride2(s->frame); -+ y->stride1 = av_rpi_sand_frame_stride1(s->frame); -+ y->wdenom = s->sh.luma_log2_weight_denom; -+ y->next_fn = 0; -+ yp->last_l0 = &y->next_src1; -+ yp->last_l1 = &y->next_src2; ++ if (s->present) { ++ s->hflip = get_bits1(gb); // hor_flip ++ s->vflip = get_bits1(gb); // ver_flip + -+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1); ++ s->anticlockwise_rotation = get_bits(gb, 16); ++ skip_bits1(gb); // display_orientation_persistence_flag + } + -+ jb->last_y8_p = NULL; -+ jb->last_y8_l1 = NULL; ++ return 0; ++} + -+ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress); ++i) { -+ jb->progress[i] = -1; -+ } ++static int decode_nal_sei_pic_timing(HEVCSEIContext *s, GetBitContext *gb, const HEVCRpiParamSets *ps, ++ void *logctx, int size) ++{ ++ HEVCSEIPictureTiming *h = &s->picture_timing; ++ HEVCRpiSPS *sps; + -+ worker_pic_reset(&jb->coeffs); ++ if (!ps->sps_list[s->active_seq_parameter_set_id]) ++ return(AVERROR(ENOMEM)); ++ sps = (HEVCRpiSPS*)ps->sps_list[s->active_seq_parameter_set_id]->data; + -+#endif -+} -+#endif ++ if (sps->vui.frame_field_info_present_flag) { ++ int pic_struct = get_bits(gb, 4); ++ h->picture_struct = AV_PICTURE_STRUCTURE_UNKNOWN; ++ if (pic_struct == 2 || pic_struct == 10 || pic_struct == 12) { ++ av_log(logctx, AV_LOG_DEBUG, "BOTTOM Field\n"); ++ h->picture_struct = AV_PICTURE_STRUCTURE_BOTTOM_FIELD; ++ } else if (pic_struct == 1 || pic_struct == 9 || pic_struct == 11) { ++ av_log(logctx, AV_LOG_DEBUG, "TOP Field\n"); ++ h->picture_struct = AV_PICTURE_STRUCTURE_TOP_FIELD; ++ } ++ get_bits(gb, 2); // source_scan_type ++ get_bits(gb, 1); // duplicate_flag ++ skip_bits1(gb); ++ size--; ++ } ++ skip_bits_long(gb, 8 * size); + ++ return 0; ++} + -+#if RPI_INTER -+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C -+static unsigned int mc_terminate_add_qpu(const HEVCContext * const s, -+ const vpu_qpu_job_h vqj, -+ rpi_cache_flush_env_t * const rfe, -+ HEVCRpiInterPredEnv * const ipe) ++static int decode_registered_user_data_closed_caption(HEVCSEIA53Caption *s, GetBitContext *gb, ++ int size) +{ -+ unsigned int i; -+ uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS]; -+ unsigned int max_block = 0; -+ -+ if (!ipe->used) { -+ return 0; -+ } ++ int flag; ++ int user_data_type_code; ++ int cc_count; + -+ if (ipe->curr != 0) { -+ rpi_inter_pred_sync(ipe); -+ } ++ if (size < 3) ++ return AVERROR(EINVAL); + -+ // Add final commands to Q -+ for(i = 0; i != ipe->n; ++i) { -+ HEVCRpiInterPredQ * const yp = ipe->q + i; -+ qpu_mc_src_t *const p0 = yp->last_l0; -+ qpu_mc_src_t *const p1 = yp->last_l1; -+ const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base; ++ user_data_type_code = get_bits(gb, 8); ++ if (user_data_type_code == 0x3) { ++ skip_bits(gb, 1); // reserved + -+ if (block_size > max_block) -+ max_block = block_size; ++ flag = get_bits(gb, 1); // process_cc_data_flag ++ if (flag) { ++ skip_bits(gb, 1); ++ cc_count = get_bits(gb, 5); ++ skip_bits(gb, 8); // reserved ++ size -= 2; + -+ yp->qpu_mc_curr->data[-1] = yp->code_exit; ++ if (cc_count && size >= cc_count * 3) { ++ const uint64_t new_size = (s->a53_caption_size + cc_count ++ * UINT64_C(3)); ++ int i, ret; + -+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched -+ p0->x = MC_DUMMY_X; -+ p0->y = MC_DUMMY_Y; -+ 
p0->base = s->qpu_dummy_frame_qpu; -+ p1->x = MC_DUMMY_X; -+ p1->y = MC_DUMMY_Y; -+ p1->base = s->qpu_dummy_frame_qpu; ++ if (new_size > INT_MAX) ++ return AVERROR(EINVAL); + -+ yp->last_l0 = NULL; -+ yp->last_l1 = NULL; ++ /* Allow merging of the cc data from two fields. */ ++ ret = av_reallocp(&s->a53_caption, new_size); ++ if (ret < 0) ++ return ret; + -+ // Add to mailbox list -+ mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm); -+ mail[i][1] = yp->code_setup; ++ for (i = 0; i < cc_count; i++) { ++ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8); ++ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8); ++ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8); ++ } ++ skip_bits(gb, 8); // marker_bits ++ } ++ } ++ } else { ++ int i; ++ for (i = 0; i < size - 1; i++) ++ skip_bits(gb, 8); + } + -+#if RPI_CACHE_UNIF_MVS -+ // We don't need invalidate here as the uniforms aren't changed by the QPU -+ // and leaving them in ARM cache avoids (pointless) pre-reads when writing -+ // new values which seems to give us a small performance advantage -+ // -+ // In most cases we will not have a completely packed set of uniforms and as -+ // we have a 2d invalidate we writeback all uniform Qs to the depth of the -+ // fullest -+ rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK, -+ (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block, -+ ipe->n, ipe->max_fill + ipe->min_gap); -+#endif -+ vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail); -+ -+ return 1; ++ return 0; +} -+#endif + -+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C -+static unsigned int mc_terminate_add_emu(const HEVCContext * const s, -+ const vpu_qpu_job_h vqj, -+ rpi_cache_flush_env_t * const rfe, -+ HEVCRpiInterPredEnv * const ipe) ++static int decode_nal_sei_user_data_registered_itu_t_t35(HEVCSEIContext *s, GetBitContext *gb, ++ int size) +{ -+ unsigned int i; -+ if (!ipe->used) { -+ return 0; -+ } ++ uint32_t country_code; ++ uint32_t user_identifier; + -+ if (ipe->curr != 0) { -+ rpi_inter_pred_sync(ipe); -+ } ++ if (size < 7) ++ return AVERROR(EINVAL); ++ size -= 7; + -+ // Add final commands to Q -+ for(i = 0; i != ipe->n; ++i) { -+ HEVCRpiInterPredQ * const yp = ipe->q + i; -+ qpu_mc_src_t *const p0 = yp->last_l0; -+ qpu_mc_src_t *const p1 = yp->last_l1; ++ country_code = get_bits(gb, 8); ++ if (country_code == 0xFF) { ++ skip_bits(gb, 8); ++ size--; ++ } + -+ yp->qpu_mc_curr->data[-1] = yp->code_exit; ++ skip_bits(gb, 8); ++ skip_bits(gb, 8); + -+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched -+ p0->x = MC_DUMMY_X; -+ p0->y = MC_DUMMY_Y; -+ p0->base = s->qpu_dummy_frame_emu; -+ p1->x = MC_DUMMY_X; -+ p1->y = MC_DUMMY_Y; -+ p1->base = s->qpu_dummy_frame_emu; ++ user_identifier = get_bits_long(gb, 32); + -+ yp->last_l0 = NULL; -+ yp->last_l1 = NULL; ++ switch (user_identifier) { ++ case MKBETAG('G', 'A', '9', '4'): ++ return decode_registered_user_data_closed_caption(&s->a53_caption, gb, size); ++ default: ++ skip_bits_long(gb, size * 8); ++ break; + } -+ -+ return 1; ++ return 0; +} -+#endif + ++static int decode_nal_sei_active_parameter_sets(HEVCSEIContext *s, GetBitContext *gb, void *logctx) ++{ ++ int num_sps_ids_minus1; ++ int i; ++ unsigned active_seq_parameter_set_id; + -+#if RPI_QPU_EMU_Y -+#define mc_terminate_add_y mc_terminate_add_emu -+#else -+#define mc_terminate_add_y mc_terminate_add_qpu -+#endif -+#if RPI_QPU_EMU_C -+#define mc_terminate_add_c mc_terminate_add_emu -+#else -+#define mc_terminate_add_c 
mc_terminate_add_qpu -+#endif -+#endif ++ get_bits(gb, 4); // active_video_parameter_set_id ++ get_bits(gb, 1); // self_contained_cvs_flag ++ get_bits(gb, 1); // num_sps_ids_minus1 ++ num_sps_ids_minus1 = get_ue_golomb_long(gb); // num_sps_ids_minus1 ++ ++ if (num_sps_ids_minus1 < 0 || num_sps_ids_minus1 > 15) { ++ av_log(logctx, AV_LOG_ERROR, "num_sps_ids_minus1 %d invalid\n", num_sps_ids_minus1); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ active_seq_parameter_set_id = get_ue_golomb_long(gb); ++ if (active_seq_parameter_set_id >= HEVC_MAX_SPS_COUNT) { ++ av_log(logctx, AV_LOG_ERROR, "active_parameter_set_id %d invalid\n", active_seq_parameter_set_id); ++ return AVERROR_INVALIDDATA; ++ } ++ s->active_seq_parameter_set_id = active_seq_parameter_set_id; + -+#ifdef RPI ++ for (i = 1; i <= num_sps_ids_minus1; i++) ++ get_ue_golomb_long(gb); // active_seq_parameter_set_id[i] + ++ return 0; ++} + -+static void flush_frame(HEVCContext *s,AVFrame *frame) ++static int decode_nal_sei_alternative_transfer(HEVCSEIAlternativeTransfer *s, GetBitContext *gb) +{ -+ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(); -+ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); -+ rpi_cache_flush_finish(rfe); ++ s->present = 1; ++ s->preferred_transfer_characteristics = get_bits(gb, 8); ++ return 0; ++} ++ ++static int decode_nal_sei_prefix(GetBitContext *gb, HEVCSEIContext *s, const HEVCRpiParamSets *ps, ++ int type, int size, void *logctx) ++{ ++ switch (type) { ++ case 256: // Mismatched value from HM 8.1 ++ return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb); ++ case HEVC_SEI_TYPE_FRAME_PACKING: ++ return decode_nal_sei_frame_packing_arrangement(&s->frame_packing, gb); ++ case HEVC_SEI_TYPE_DISPLAY_ORIENTATION: ++ return decode_nal_sei_display_orientation(&s->display_orientation, gb); ++ case HEVC_SEI_TYPE_PICTURE_TIMING: ++ return decode_nal_sei_pic_timing(s, gb, ps, logctx, size); ++ case HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO: ++ return decode_nal_sei_mastering_display_info(&s->mastering_display, gb); ++ case HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO: ++ return decode_nal_sei_content_light_info(&s->content_light, gb); ++ case HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS: ++ return decode_nal_sei_active_parameter_sets(s, gb, logctx); ++ case HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35: ++ return decode_nal_sei_user_data_registered_itu_t_t35(s, gb, size); ++ case HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS: ++ return decode_nal_sei_alternative_transfer(&s->alternative_transfer, gb); ++ default: ++ av_log(logctx, AV_LOG_DEBUG, "Skipped PREFIX SEI %d\n", type); ++ skip_bits_long(gb, 8 * size); ++ return 0; ++ } +} + -+static void job_gen_bounds(const HEVCContext * const s, HEVCRpiJob * const jb) ++static int decode_nal_sei_suffix(GetBitContext *gb, HEVCSEIContext *s, ++ int type, int size, void *logctx) +{ -+ const unsigned int rs0 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_first]; -+ const unsigned int rs1 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_last]; -+ const unsigned int ctb_width = s->ps.sps->ctb_width; -+ RpiBlk *const bounds = &jb->bounds; -+ av_assert1(jb->ctu_ts_first <= jb->ctu_ts_last); -+ bounds->x = (rs0 % ctb_width) << s->ps.sps->log2_ctb_size; -+ bounds->y = (rs0 / ctb_width) << s->ps.sps->log2_ctb_size; -+ bounds->w = ((rs1 - rs0) % ctb_width + 1) << s->ps.sps->log2_ctb_size; -+ bounds->h = ((rs1 - rs0) / ctb_width + 1) << s->ps.sps->log2_ctb_size; ++ switch (type) { ++ case HEVC_SEI_TYPE_DECODED_PICTURE_HASH: ++ return 
decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb); ++ default: ++ av_log(logctx, AV_LOG_DEBUG, "Skipped SUFFIX SEI %d\n", type); ++ skip_bits_long(gb, 8 * size); ++ return 0; ++ } +} + -+#if RPI_PASSES == 2 -+static void worker_core2(HEVCContext * const s, HEVCRpiJob * const jb) ++static int decode_nal_sei_message(GetBitContext *gb, HEVCSEIContext *s, ++ const HEVCRpiParamSets *ps, int nal_unit_type, ++ void *logctx) +{ -+ // Perform intra prediction and residual reconstruction -+ rpi_execute_pred_cmds(s, jb); -+ -+ // Perform deblocking for CTBs in this row -+ rpi_execute_dblk_cmds(s, jb); -+} -+#endif -+ -+ -+// Core execution tasks -+static void worker_core(HEVCContext * const s0, HEVCRpiJob * const jb) -+{ -+ const HEVCContext * const s = s0; -+ -+#if RPI_OPT_SEP_PRED -+ vpu_qpu_wait_h sync_c; -+#endif -+ vpu_qpu_wait_h sync_y; -+ -+ int pred_y, pred_c; -+ -+ const vpu_qpu_job_h vqj = vpu_qpu_job_new(); -+ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); -+ -+ { -+ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs; -+ if (cf->s[3].n + cf->s[2].n != 0) -+ { -+ const unsigned int csize = sizeof(cf->s[3].buf[0]); -+ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize; -+ vpu_qpu_job_add_vpu(vqj, -+ vpu_get_fn(s->ps.sps->bit_depth), -+ vpu_get_constants(), -+ cf->gptr.vc, -+ cf->s[2].n >> 8, -+ cf->gptr.vc + offset32, -+ cf->s[3].n >> 10, -+ 0); ++ int payload_type = 0; ++ int payload_size = 0; ++ int byte = 0xFF; ++ av_log(logctx, AV_LOG_DEBUG, "Decoding SEI\n"); + -+ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize); -+ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize); -+ } ++ while (byte == 0xFF) { ++ byte = get_bits(gb, 8); ++ payload_type += byte; + } -+ -+ pred_c = mc_terminate_add_c(s, vqj, rfe, &jb->chroma_ip); -+ -+// We can take a sync here and try to locally overlap QPU processing with ARM -+// but testing showed a slightly negative benefit with noticable extra complexity -+#if RPI_OPT_SEP_PRED -+ vpu_qpu_job_add_sync_this(vqj, &sync_c); -+#endif -+ -+ pred_y = mc_terminate_add_y(s, vqj, rfe, &jb->luma_ip); -+ -+ vpu_qpu_job_add_sync_this(vqj, &sync_y); -+ -+ rpi_cache_flush_execute(rfe); -+ -+ // Await progress as required -+ { -+ unsigned int i; -+ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress); ++i) { -+ if (jb->progress[i] >= 0) { -+ ff_hevc_progress_wait_recon(s, jb, s->DPB + i, jb->progress[i]); -+ } -+ } ++ byte = 0xFF; ++ while (byte == 0xFF) { ++ byte = get_bits(gb, 8); ++ payload_size += byte; + } -+ -+ vpu_qpu_job_finish(vqj); -+ -+ // We always work on a rectangular block -+ if (pred_y || pred_c) -+ { -+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE, -+ jb->bounds.x, jb->bounds.y, jb->bounds.w, jb->bounds.h, -+ s->ps.sps->vshift[1], pred_y, pred_c); ++ if (nal_unit_type == HEVC_NAL_SEI_PREFIX) { ++ return decode_nal_sei_prefix(gb, s, ps, payload_type, payload_size, logctx); ++ } else { /* nal_unit_type == NAL_SEI_SUFFIX */ ++ return decode_nal_sei_suffix(gb, s, payload_type, payload_size, logctx); + } -+ -+ // If we have emulated VPU ops - do it here -+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C -+ if (av_rpi_is_sand8_frame(s->frame)) -+#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C -+ rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip); -+#elif RPI_QPU_EMU_Y -+ rpi_shader_c8(s, &jb->luma_ip, NULL); -+#else -+ rpi_shader_c8(s, NULL, &jb->chroma_ip); -+#endif -+ else -+#if RPI_QPU_EMU_Y && 
RPI_QPU_EMU_C -+ rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip); -+#elif RPI_QPU_EMU_Y -+ rpi_shader_c16(s, &jb->luma_ip, NULL); -+#else -+ rpi_shader_c16(s, NULL, &jb->chroma_ip); -+#endif -+#endif -+ -+#if RPI_OPT_SEP_PRED -+#error Needs fixup for worker core 2 -+ // Wait for transform completion -+ vpu_qpu_wait(&sync_c); -+ -+ // Perform intra prediction and residual reconstruction -+ rpi_execute_pred_cmds(s, jb, 0, 1); -+ -+ // Wait for transform completion -+ vpu_qpu_wait(&sync_y); -+ -+ // Perform intra prediction and residual reconstruction -+ rpi_execute_pred_cmds(s, jb, 1, 0); -+#else -+ // Wait for transform completion -+ vpu_qpu_wait(&sync_y); -+ -+#endif -+ -+ rpi_cache_flush_finish(rfe); +} + -+#endif -+ -+static int slice_start(const HEVCContext * const s, HEVCLocalContext *const lc) ++static int more_rbsp_data(GetBitContext *gb) +{ -+ const int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; -+ const int tiles = s->ps.pps->num_tile_rows * s->ps.pps->num_tile_columns; -+ -+ // Check for obvious disasters -+ if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) { -+ av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ if (s->sh.dependent_slice_segment_flag) { -+ int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1]; -+ if (s->tab_slice_address[prev_rs] != s->sh.slice_addr) { -+ av_log(s->avctx, AV_LOG_ERROR, "Previous slice segment missing\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ } -+ -+ if (!s->ps.pps->entropy_coding_sync_enabled_flag && -+ s->ps.pps->tile_id[ctb_addr_ts] + s->sh.num_entry_point_offsets >= tiles) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "Entry points exceed tiles\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ // Tiled stuff must start at start of tile if it has multiple entry points -+ if (!s->ps.pps->entropy_coding_sync_enabled_flag && -+ s->sh.num_entry_point_offsets != 0 && -+ s->sh.slice_ctb_addr_rs != s->ps.pps->tile_pos_rs[s->ps.pps->tile_id[ctb_addr_ts]]) -+ { -+ av_log(s->avctx, AV_LOG_ERROR, "Multiple tiles in slice; slice start != tile start\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ -+ // Setup any required decode vars -+ if (!s->sh.dependent_slice_segment_flag) -+ lc->qPy_pred = s->sh.slice_qp; -+ -+ lc->qp_y = s->sh.slice_qp; -+ -+ // General setup -+ lc->wpp_init = 0; -+#ifdef RPI -+ lc->bt_line_no = 0; -+ lc->ts = ctb_addr_ts; -+#endif -+ return 0; ++ return get_bits_left(gb) > 0 && show_bits(gb, 8) != 0x80; +} + -+static int gen_entry_points(HEVCContext * const s, const H2645NAL * const nal) ++int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s, ++ const HEVCRpiParamSets *ps, int type) +{ -+ const GetBitContext * const gb = &s->HEVClc->gb; -+ int i, j; -+ -+ const unsigned int length = nal->size; -+ unsigned int offset = ((gb->index) >> 3) + 1; // We have a bit & align still to come = +1 byte -+ unsigned int cmpt; -+ unsigned int startheader; -+ -+ if (s->sh.num_entry_point_offsets == 0) { -+ return 0; -+ } -+ -+ for (j = 0, cmpt = 0, startheader = offset + s->sh.entry_point_offset[0]; j < nal->skipped_bytes; j++) { -+ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) { -+ startheader--; -+ cmpt++; -+ } -+ } ++ int ret; + -+ for (i = 1; i < s->sh.num_entry_point_offsets; i++) { -+ offset += (s->sh.entry_point_offset[i - 1] - cmpt); -+ for (j = 0, cmpt = 0, startheader = offset -+ + s->sh.entry_point_offset[i]; j < nal->skipped_bytes; j++) { -+ if (nal->skipped_bytes_pos[j] >= offset && 
nal->skipped_bytes_pos[j] < startheader) { -+ startheader--; -+ cmpt++; -+ } -+ } -+ s->sh.size[i - 1] = s->sh.entry_point_offset[i] - cmpt; -+ s->sh.offset[i - 1] = offset; -+ } -+ if (s->sh.num_entry_point_offsets != 0) { -+ offset += s->sh.entry_point_offset[s->sh.num_entry_point_offsets - 1] - cmpt; -+ if (length < offset) { -+ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n"); -+ return AVERROR_INVALIDDATA; -+ } -+ s->sh.size[s->sh.num_entry_point_offsets - 1] = length - offset; -+ s->sh.offset[s->sh.num_entry_point_offsets - 1] = offset; -+ } -+ s->data = nal->data; -+ return 0; ++ do { ++ ret = decode_nal_sei_message(gb, s, ps, type, logctx); ++ if (ret < 0) ++ return ret; ++ } while (more_rbsp_data(gb)); ++ return 1; +} + -+ -+#ifdef RPI -+ -+// Return -+// < 0 Error -+// 0 OK -+// -+// jb->ctu_ts_last < 0 Job still filling -+// jb->ctu_ts_last >= 0 Job ready -+ -+static int fill_job(HEVCContext * const s, HEVCLocalContext *const lc, unsigned int max_blocks) ++void ff_hevc_rpi_reset_sei(HEVCSEIContext *s) +{ -+ const int ctb_size = (1 << s->ps.sps->log2_ctb_size); -+ HEVCRpiJob * const jb = lc->jb0; -+ int more_data = 1; -+ int ctb_addr_ts = lc->ts; ++ s->a53_caption.a53_caption_size = 0; ++ av_freep(&s->a53_caption.a53_caption); ++} +diff --git a/libavcodec/rpi_hevc_sei.h b/libavcodec/rpi_hevc_sei.h +new file mode 100644 +index 0000000000..41e4a20127 +--- /dev/null ++++ b/libavcodec/rpi_hevc_sei.h +@@ -0,0 +1,135 @@ ++/* ++ * HEVC Supplementary Enhancement Information messages ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
+
++#ifndef AVCODEC_RPI_HEVC_SEI_H
++#define AVCODEC_RPI_HEVC_SEI_H
+
++#include <stdint.h>
+
++#include "libavutil/md5.h"
+
++#include "get_bits.h"
+
++/**
++ * SEI message types
++ */
++typedef enum {
++ HEVC_SEI_TYPE_BUFFERING_PERIOD = 0,
++ HEVC_SEI_TYPE_PICTURE_TIMING = 1,
++ HEVC_SEI_TYPE_PAN_SCAN_RECT = 2,
++ HEVC_SEI_TYPE_FILLER_PAYLOAD = 3,
++ HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35 = 4,
++ HEVC_SEI_TYPE_USER_DATA_UNREGISTERED = 5,
++ HEVC_SEI_TYPE_RECOVERY_POINT = 6,
++ HEVC_SEI_TYPE_SCENE_INFO = 9,
++ HEVC_SEI_TYPE_FULL_FRAME_SNAPSHOT = 15,
++ HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_START = 16,
++ HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_END = 17,
++ HEVC_SEI_TYPE_FILM_GRAIN_CHARACTERISTICS = 19,
++ HEVC_SEI_TYPE_POST_FILTER_HINT = 22,
++ HEVC_SEI_TYPE_TONE_MAPPING_INFO = 23,
++ HEVC_SEI_TYPE_FRAME_PACKING = 45,
++ HEVC_SEI_TYPE_DISPLAY_ORIENTATION = 47,
++ HEVC_SEI_TYPE_SOP_DESCRIPTION = 128,
++ HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS = 129,
++ HEVC_SEI_TYPE_DECODING_UNIT_INFO = 130,
++ HEVC_SEI_TYPE_TEMPORAL_LEVEL0_INDEX = 131,
++ HEVC_SEI_TYPE_DECODED_PICTURE_HASH = 132,
++ HEVC_SEI_TYPE_SCALABLE_NESTING = 133,
++ HEVC_SEI_TYPE_REGION_REFRESH_INFO = 134,
++ HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO = 137,
++ HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO = 144,
++ HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147,
++} HEVC_SEI_Type;
++
++typedef struct HEVCSEIPictureHash {
++ struct AVMD5 *md5_ctx;
++ uint8_t md5[3][16];
++ uint8_t is_md5;
++} HEVCSEIPictureHash;
++
++typedef struct HEVCSEIFramePacking {
++ int present;
++ int arrangement_type;
++ int content_interpretation_type;
++ int quincunx_subsampling;
++} HEVCSEIFramePacking;
++
++typedef struct HEVCSEIDisplayOrientation {
++ int present;
++ int anticlockwise_rotation;
++ int hflip, vflip;
++} HEVCSEIDisplayOrientation;
++
++typedef struct HEVCSEIPictureTiming {
++ int picture_struct;
++} HEVCSEIPictureTiming;
++
++typedef struct HEVCSEIA53Caption {
++ int a53_caption_size;
++ uint8_t *a53_caption;
++} HEVCSEIA53Caption;
++
++typedef struct HEVCSEIMasteringDisplay {
++ int present;
++ uint16_t display_primaries[3][2];
++ uint16_t white_point[2];
++ uint32_t max_luminance;
++ uint32_t min_luminance;
++} HEVCSEIMasteringDisplay;
++
++typedef struct HEVCSEIContentLight {
++ int present;
++ uint16_t max_content_light_level;
++ uint16_t max_pic_average_light_level;
++} HEVCSEIContentLight;
++
++typedef struct HEVCSEIAlternativeTransfer { 
++ int present; ++ int preferred_transfer_characteristics; ++} HEVCSEIAlternativeTransfer; ++ ++typedef struct HEVCSEIContext { ++ HEVCSEIPictureHash picture_hash; ++ HEVCSEIFramePacking frame_packing; ++ HEVCSEIDisplayOrientation display_orientation; ++ HEVCSEIPictureTiming picture_timing; ++ HEVCSEIA53Caption a53_caption; ++ HEVCSEIMasteringDisplay mastering_display; ++ HEVCSEIContentLight content_light; ++ int active_seq_parameter_set_id; ++ HEVCSEIAlternativeTransfer alternative_transfer; ++} HEVCSEIContext; ++ ++struct HEVCRpiParamSets; ++ ++int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s, ++ const struct HEVCRpiParamSets *ps, int type); ++ ++/** ++ * Reset SEI values that are stored on the Context. ++ * e.g. Caption data that was extracted during NAL ++ * parsing. ++ * ++ * @param s HEVCRpiContext. ++ */ ++void ff_hevc_rpi_reset_sei(HEVCSEIContext *s); + -+ more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); ++#endif /* AVCODEC_RPI_HEVC_SEI_H */ +diff --git a/libavcodec/rpi_hevc_shader.c b/libavcodec/rpi_hevc_shader.c +new file mode 100644 +index 0000000000..4f1d6c71f2 +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader.c +@@ -0,0 +1,1570 @@ ++#include "rpi_hevc_shader.h" + -+ if (more_data < 0) { -+ s->tab_slice_address[ctb_addr_rs] = -1; -+ return more_data; -+ } ++#ifdef _MSC_VER ++ #include ++ /* cast through uintptr_t to avoid warnings */ ++ #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X)) ++#else ++ #define POINTER_TO_UINT(X) ((unsigned int)(X)) ++#endif + -+ // Inc TS to next. -+ // N.B. None of the other position vars have changed -+ ctb_addr_ts++; -+ ff_hevc_save_states(s, lc, ctb_addr_ts); -+ -+ // Report progress so we can use our MVs in other frames -+ if (s->threads_type == FF_THREAD_FRAME && x_ctb + ctb_size >= s->ps.sps->width) { -+ ff_hevc_progress_signal_mv(s, y_ctb + ctb_size - 1); -+ } -+ -+ // * None of the 1st 3 tests for q_full should succeed in the current world... -+ q_full = ((ctb_addr_ts - jb->ctu_ts_first) >= s->max_ctu_count); -+ if (rpi_inter_pred_next_ctu(&jb->luma_ip) != 0) -+ q_full = 1; -+ if (rpi_inter_pred_next_ctu(&jb->chroma_ip) != 0) -+ q_full = 1; -+ if (q_full) { -+ // * This is very annoying (and slow) to cope with in WPP so -+ // we treat it as an error there (no known stream tiggers this -+ // with the current buffer sizes). Non-wpp should cope fine. 
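
/*
 * Aside: the HEVCSEIA53Caption state declared in rpi_hevc_sei.h above
 * accumulates closed-caption bytes (possibly across two fields).  The
 * sketch below shows how a decoder typically surfaces those bytes to the
 * application as AVFrame side data once a frame is output.  This is an
 * illustration only and not part of the patch -- the helper name is made
 * up, though av_frame_new_side_data() and AV_FRAME_DATA_A53_CC are
 * standard FFmpeg API.
 */
#include <string.h>

#include "libavutil/error.h"
#include "libavutil/frame.h"
#include "libavutil/mem.h"

static int toy_attach_a53(AVFrame *frame, HEVCSEIA53Caption *a53)
{
    if (a53->a53_caption_size > 0) {
        AVFrameSideData *sd = av_frame_new_side_data(frame,
                                                     AV_FRAME_DATA_A53_CC,
                                                     a53->a53_caption_size);
        if (!sd)
            return AVERROR(ENOMEM);
        /* Copy the accumulated cc_data bytes, then reset the buffer so
         * the next access unit starts clean. */
        memcpy(sd->data, a53->a53_caption, a53->a53_caption_size);
        av_freep(&a53->a53_caption);
        a53->a53_caption_size = 0;
    }
    return 0;
}
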
-+            av_log(s, AV_LOG_WARNING, "%s: Q full before EoL\n", __func__);
-+        }
-+
-+        if (q_full ||
-+            ctb_addr_ts >= s->ps.sps->ctb_size ||
-+            s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts] != ctb_addr_rs + 1 ||
-+            s->ps.pps->tile_id[ctb_addr_ts - 1] != s->ps.pps->tile_id[ctb_addr_ts] ||
-+            x_ctb + ctb_size >= s->ps.sps->width)
-+        {
-+            // Do job
-+            // Prep for submission
-+            jb->ctu_ts_last = ctb_addr_ts - 1;  // Was pre-incremented
-+            job_gen_bounds(s, jb);
-+            break;
-+        }
-+
-+        // If max_blocks started as 0 then this will never be true
-+        if (--max_blocks == 0)
-+            break;
-+    }
-+
-+    lc->unit_done = (more_data <= 0);
-+    lc->ts = ctb_addr_ts;
-+    return 0;
-+}
-+
-+static void bt_lc_init(HEVCContext * const s, HEVCLocalContext * const lc, const unsigned int n)
-+{
-+    lc->context = s;
-+    lc->jb0 = NULL;
-+    lc->lc_n = n;
-+    lc->bt_terminate = 0;
-+    lc->bt_psem_out = NULL;
-+    sem_init(&lc->bt_sem_in, 0, 0);
-+}
-+
-+#define TRACE_WPP 0
-+#if RPI_EXTRA_BIT_THREADS > 0
-+static inline unsigned int line_ts_width(const HEVCContext * const s, unsigned int ts)
-+{
-+    unsigned int rs = s->ps.pps->ctb_addr_ts_to_rs[ts];
-+    return s->ps.pps->column_width[s->ps.pps->col_idxX[rs % s->ps.sps->ctb_width]];
-+}
-+
-+// Move local context parameters from an aux bit thread back to the main
-+// thread at the end of a slice as processing is going to continue there.
-+static void movlc(HEVCLocalContext *const dst_lc, HEVCLocalContext *const src_lc, const int is_dep)
-+{
-+    if (src_lc == dst_lc) {
-+        return;
-+    }
-+
-+    // Move the job
-+    // We will still have an active job if the final line terminates early
-+    // Dest should always be null by now
-+    av_assert1(dst_lc->jb0 == NULL);
-+    dst_lc->jb0 = src_lc->jb0;
-+    src_lc->jb0 = NULL;
-+
-+    // Always need to store where we are in the bitstream
-+    dst_lc->ts = src_lc->ts;
-+    dst_lc->gb = src_lc->gb;
-+    // Need to store context if we might have a dependent seg
-+    if (is_dep)
-+    {
-+        dst_lc->qPy_pred = src_lc->qPy_pred;
-+        memcpy(dst_lc->cabac_state, src_lc->cabac_state, sizeof(src_lc->cabac_state));
-+        memcpy(dst_lc->stat_coeff, src_lc->stat_coeff, sizeof(src_lc->stat_coeff));
-+    }
-+}
-+
-+static inline int wait_bt_sem_in(HEVCLocalContext * const lc)
-+{
-+    rpi_sem_wait(&lc->bt_sem_in);
-+    return lc->bt_terminate;
-+}
-+
-+// Do one WPP line
-+// Will not work correctly over horizontal tile boundaries - vertical should be OK
-+static int rpi_run_one_line(HEVCContext *const s, HEVCLocalContext * const lc, const int is_first)
-+{
-+    const int is_tile = lc->bt_is_tile;
-+    const unsigned int tile_id = s->ps.pps->tile_id[lc->ts];
-+    const unsigned int line = lc->bt_line_no;
-+    const unsigned int line_inc = lc->bt_line_inc;
-+    const int is_last = (line >= lc->bt_last_line);
-+
-+    const unsigned int ts_eol = lc->ts + (is_tile ? s->ps.pps->tile_size[tile_id] : lc->bt_line_width);
-+    const unsigned int ts_next =
-+        line + line_inc > (unsigned int)s->sh.num_entry_point_offsets ?
-+            INT_MAX :
-+        is_tile ?
-+            s->ps.pps->ctb_addr_rs_to_ts[s->ps.pps->tile_pos_rs[tile_id + line_inc]] :
-+            lc->ts + lc->bt_line_width * line_inc;
-+    // Tile wants line, WPP a few CTUs (must be >= 2 for cabac context to work)
-+    const unsigned int partial_size = is_tile ? line_ts_width(s, lc->ts) : 2;
-+    unsigned int ts_prev;
-+    int loop_n = 0;
-+    int err = 0;
-+
-+    av_assert1(line <= s->sh.num_entry_point_offsets);
-+
-+#if TRACE_WPP
-+    printf("%s[%d]: Start %s: tile=%d, line=%d/%d/%d, ts=%d/%d/%d, width=%d, jb=%p\n", __func__,
-+           lc->lc_n, is_tile ? "Tile" : "WPP", tile_id,
-+           line, lc->bt_last_line, s->sh.num_entry_point_offsets,
-+           lc->ts, ts_eol, ts_next, partial_size, lc->jb0);
-+#endif
-+    if (line != 0)
-+    {
-+        const uint8_t * const data = s->data + s->sh.offset[line - 1];
-+        const unsigned int len = s->sh.size[line - 1];
-+        if ((err = init_get_bits8(&lc->gb, data, len)) < 0)
-+            return err;
-+
-+        ff_init_cabac_decoder(&lc->cc, data, len);
-+
-+        lc->wpp_init = 1;  // Stop ff_hevc_cabac_init trying to read non-existent termination bits
-+    }
-+
-+    // We should never be processing a dependent slice here
-+    lc->qPy_pred = s->sh.slice_qp;  //????? needed
-+    lc->qp_y = s->sh.slice_qp;      // **** (ENTP_C fails without this)
-+
-+    do
-+    {
-+        if (!is_last && loop_n > 1) {
-+#if TRACE_WPP
-+            printf("%s[%d]: %sPoke %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", lc->bt_psem_out);
-+#endif
-+            sem_post(lc->bt_psem_out);
-+        }
-+        if (!is_first && loop_n != 0)
-+        {
-+#if TRACE_WPP
-+            printf("%s[%d]: %sWait %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", &lc->bt_sem_in);
-+#endif
-+            if (wait_bt_sem_in(lc) != 0)
-+                return AVERROR_EXIT;
-+        }
-+
-+#if TRACE_WPP
-+        {
-+            int n;
-+            sem_getvalue(&lc->bt_sem_in, &n);
-+            printf("%s[%d]: ts=%d, sem=%d %p\n", __func__, lc->lc_n, lc->ts, n, &lc->bt_sem_in);
-+        }
-+#endif
-+
-+        ts_prev = lc->ts;
-+
-+        // If we have had an error - do no further decode but do continue
-+        // moving signals around so the other threads continue to operate
-+        // correctly (or at least as correctly as they can with this line missing)
-+        //
-+        // Errors in WPP/Tile are less fatal than normal as we have a good idea
-+        // of how to restart on the next line so there is no need to give up totally
-+        if (err != 0)
-+        {
-+            lc->unit_done = 0;
-+            lc->ts += partial_size;
-+        }
-+        else
-+        {
-+            worker_pass0_ready(s, lc);
-+
-+            if ((err = fill_job(s, lc, partial_size)) < 0 ||
-+                (lc->ts < ts_eol && !is_last && (lc->ts != ts_prev + partial_size || lc->unit_done)))
-+            {
-+                if (err == 0) {
-+                    av_log(s, AV_LOG_ERROR, "Unexpected end of tile/wpp section\n");
-+                    err = AVERROR_INVALIDDATA;
-+                }
-+                worker_free(s, lc);
-+                lc->ts = ts_prev + partial_size;  // Pretend we did all that
-+                lc->unit_done = 0;
-+            }
-+            else if (is_tile)
-+            {
-+                worker_submit_job(s, lc);
-+            }
-+        }
-+
-+        ++loop_n;
-+    } while (lc->ts < ts_eol && !lc->unit_done);
-+
-+    // If we are on the last line & we didn't get a whole line we must wait for
-+    // and sink the sem_posts from the line above / tile to the left.
-+    while ((ts_prev += partial_size) < ts_eol)
-+    {
-+#if TRACE_WPP
-+        printf("%s[%d]: EOL Wait: ts=%d %p\n", __func__, lc->lc_n, ts_prev, &lc->bt_sem_in);
-+#endif
-+        if (wait_bt_sem_in(lc) != 0)
-+            return AVERROR_EXIT;
-+    }
-+
-+    lc->bt_line_no += line_inc;
-+
-+    if (!is_tile && err == 0)
-+        worker_submit_job(s, lc);
-+
-+    if (!is_last) {
-+        lc->ts = ts_next;
-+
-+#if TRACE_WPP
-+        printf("%s[%d]: Poke post submit %p\n", __func__, lc->lc_n, lc->bt_psem_out);
-+#endif
-+        sem_post(lc->bt_psem_out);
-+        if (loop_n > 1) {
-+#if TRACE_WPP
-+            printf("%s[%d]: Poke post submit2 %p\n", __func__, lc->lc_n, lc->bt_psem_out);
-+#endif
-+            sem_post(lc->bt_psem_out);
-+        }
-+    }
-+    else
-+    {
-+        movlc(s->HEVClcList[0], lc, s->ps.pps->dependent_slice_segments_enabled_flag);
-+
-+        // When all done poke the thread 0 sem_in one final time
-+#if TRACE_WPP
-+        printf("%s[%d]: Poke final %p\n", __func__, lc->lc_n, &s->HEVClcList[0]->bt_sem_in);
-+#endif
-+        sem_post(&s->HEVClcList[0]->bt_sem_in);
-+    }
-+
-+#if TRACE_WPP
-+    printf("%s[%d]: End. dep=%d\n", __func__, lc->lc_n, s->ps.pps->dependent_slice_segments_enabled_flag);
-+#endif
-+    return err;
-+}
-+
-+static void wpp_setup_lcs(HEVCContext * const s)
-+{
-+    unsigned int ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
-+    const unsigned int line_width = line_ts_width(s, ts);
-+
-+    for (int i = 0; i <= s->sh.num_entry_point_offsets && i < RPI_BIT_THREADS; ++i)
-+    {
-+        HEVCLocalContext * const lc = s->HEVClcList[i];
-+        lc->ts = ts;
-+        lc->bt_is_tile = 0;
-+        lc->bt_line_no = i;
-+        lc->bt_line_width = line_width;
-+        lc->bt_last_line = s->sh.num_entry_point_offsets;
-+        lc->bt_line_inc = RPI_BIT_THREADS;
-+        ts += line_width;
-+    }
-+}
-+
-+
-+// Can only process a single tile row at once
-+static void tile_one_row_setup_lcs(HEVCContext * const s, unsigned int slice_row)
-+{
-+    const HEVCPPS * const pps = s->ps.pps;
-+    const unsigned int ts0 = pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
-+    const unsigned int tile0 = pps->tile_id[ts0];
-+    const unsigned int col0 = tile0 % pps->num_tile_columns;
-+
-+    const unsigned int col = (slice_row == 0) ? col0 : 0;
-+    unsigned int line = slice_row * pps->num_tile_columns - col0 + col;
-+    const unsigned int last_line = FFMIN(
-+        line + pps->num_tile_columns - 1 - col, s->sh.num_entry_point_offsets);
-+
-+    const unsigned int par =
-+        FFMIN(RPI_BIT_THREADS, last_line + 1 - line);
-+#if TRACE_WPP
-+    printf("ts0=%d, ents=%d, row=%d, tiles=%dx%d, col=%d, par=%d, line=%d/%d\n", ts0, s->sh.num_entry_point_offsets, slice_row,
-+           pps->num_tile_columns, pps->num_tile_rows, col, par, line, last_line);
-+#endif
-+    for (unsigned int i = 0; i != par; ++i, ++line)
-+    {
-+        HEVCLocalContext * const lc = s->HEVClcList[i];
-+        const unsigned int tile = tile0 + line;
-+
-+        lc->ts = pps->ctb_addr_rs_to_ts[pps->tile_pos_rs[tile]];
-+        lc->bt_line_no = line;
-+        lc->bt_is_tile = 1;
-+        lc->bt_line_width = line_ts_width(s, lc->ts);
-+        lc->bt_last_line = last_line;
-+        lc->bt_line_inc = par;
-+    }
-+}
-+
-+
-+static void * bit_thread(void * v)
-+{
-+    HEVCLocalContext * const lc = v;
-+    HEVCContext *const s = lc->context;
-+
-+    while (wait_bt_sem_in(lc) == 0)
-+    {
-+        int err;
-+
-+        if ((err = rpi_run_one_line(s, lc, 0)) < 0) {  // Never first tile/wpp
-+            if (lc->bt_terminate) {
-+                av_log(s, AV_LOG_ERROR, "%s: Unexpected termination\n", __func__);
-+                break;
-+            }
-+            av_log(s, AV_LOG_WARNING, "%s: Decode failure: %d\n", __func__, err);
-+        }
-+    }
-+
-+    return NULL;
-+}
-+
-+static int bit_threads_start(HEVCContext * const s)
-+{
-+    if (s->bt_started)
-+        return 0;
-+
-+    for (int i = 1; i < RPI_BIT_THREADS; ++i)
-+    {
-+        // lc[0] belongs to the main thread - this sets up lc[1..RPI_BIT_THREADS]
-+        if (s->HEVClcList[i] == NULL) {
-+            if ((s->HEVClcList[i] = av_mallocz(sizeof(*s->HEVClcList[0]))) == NULL)
-+                return -1;
-+        }
-+
-+        bt_lc_init(s, s->HEVClcList[i], i);
-+        job_lc_init(s->HEVClcList[i]);
-+    }
-+
-+    // Link the sems in a circle
-+    for (int i = 0; i < RPI_BIT_THREADS - 1; ++i)
-+        s->HEVClcList[i]->bt_psem_out = &s->HEVClcList[i + 1]->bt_sem_in;
-+ s->HEVClcList[RPI_BIT_THREADS - 1]->bt_psem_out = &s->HEVClcList[0]->bt_sem_in; -+ -+ // Init all lc before starting any threads -+ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i) -+ { -+ if (pthread_create(s->bit_threads + i, NULL, bit_thread, s->HEVClcList[i + 1]) < 0) -+ return -1; -+ } - -+ s->bt_started = 1; - return 0; - } - --static int hls_coding_quadtree(HEVCContext *s, int x0, int y0, -- int log2_cb_size, int cb_depth) -+static int bit_threads_kill(HEVCContext * const s) - { -- HEVCLocalContext *lc = s->HEVClc; -- const int cb_size = 1 << log2_cb_size; -- int ret; -- int split_cu; -+ if (!s->bt_started) -+ return 0; -+ s->bt_started = 0; - -- lc->ct_depth = cb_depth; -- if (x0 + cb_size <= s->ps.sps->width && -- y0 + cb_size <= s->ps.sps->height && -- log2_cb_size > s->ps.sps->log2_min_cb_size) { -- split_cu = ff_hevc_split_coding_unit_flag_decode(s, cb_depth, x0, y0); -- } else { -- split_cu = (log2_cb_size > s->ps.sps->log2_min_cb_size); -- } -- if (s->ps.pps->cu_qp_delta_enabled_flag && -- log2_cb_size >= s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_qp_delta_depth) { -- lc->tu.is_cu_qp_delta_coded = 0; -- lc->tu.cu_qp_delta = 0; -+ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i) -+ { -+ HEVCLocalContext *const lc = s->HEVClcList[i + 1]; -+ if (lc == NULL) -+ break; -+ -+ lc->bt_terminate = 1; -+ sem_post(&lc->bt_sem_in); -+ pthread_join(s->bit_threads[i], NULL); -+ -+ sem_destroy(&lc->bt_sem_in); -+ job_lc_kill(lc); - } -+ return 0; -+} -+#endif -+ -+ -+static int rpi_decode_entry(AVCodecContext *avctxt, void *isFilterThread) -+{ -+ HEVCContext * const s = avctxt->priv_data; -+ HEVCLocalContext * const lc = s->HEVClc; -+ int err; -+ -+ // Start of slice -+ if ((err = slice_start(s, lc)) != 0) -+ return err; -+ -+#if RPI_EXTRA_BIT_THREADS > 0 -+ -+ if (s->sh.num_entry_point_offsets != 0 && -+ s->ps.pps->num_tile_columns > 1) -+ { -+ unsigned int slice_row = 0; -+ -+#if TRACE_WPP -+ printf("%s: Do Tiles\n", __func__); -+#endif -+ // Generate & start extra bit threads if they aren't already running -+ bit_threads_start(s); -+ -+ do -+ { -+ // Reset lc lines etc. 
-+ tile_one_row_setup_lcs(s, slice_row); - -- if (s->sh.cu_chroma_qp_offset_enabled_flag && -- log2_cb_size >= s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_chroma_qp_offset_depth) { -- lc->tu.is_cu_chroma_qp_offset_coded = 0; -+#if TRACE_WPP -+ printf("%s: Row %d: Do 1st: line=%d/%d/%d\n", -+ __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets); -+#endif -+ -+ rpi_run_one_line(s, lc, 1); // Kicks off the other threads -+#if TRACE_WPP -+ printf("%s: Row %d: Done 1st: line=%d/%d/%d\n", -+ __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets); -+#endif -+ -+ while (lc->bt_line_no <= lc->bt_last_line) { -+ rpi_sem_wait(&lc->bt_sem_in); -+ rpi_run_one_line(s, lc, 0); -+ } -+#if TRACE_WPP -+ printf("%s: Done body\n", __func__); -+#endif -+ -+ // Wait for everything else to finish -+ rpi_sem_wait(&lc->bt_sem_in); -+ -+ ++slice_row; -+ } while (lc->bt_last_line < s->sh.num_entry_point_offsets); -+ -+ -+#if TRACE_WPP -+ printf("%s: Done wait: ts=%d\n", __func__, lc->ts); -+#endif - } -+ else - -- if (split_cu) { -- int qp_block_mask = (1<<(s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_qp_delta_depth)) - 1; -- const int cb_size_split = cb_size >> 1; -- const int x1 = x0 + cb_size_split; -- const int y1 = y0 + cb_size_split; -+ // * We only cope with WPP in a single column -+ // Probably want to deal with that case as tiles rather than WPP anyway -+ // *** Decode error recovery NIF - we currently just crash :-( -+ // ?? Not actually sure that the main code deals with WPP + multi-col correctly ?? -+ if (s->ps.pps->entropy_coding_sync_enabled_flag && -+ s->ps.pps->num_tile_columns == 1 && -+ s->sh.num_entry_point_offsets != 0) -+ { -+#if TRACE_WPP -+ printf("%s: Do WPP\n", __func__); -+#endif -+ // Generate & start extra bit threads if they aren't already running -+ bit_threads_start(s); - -- int more_data = 0; -+ // Reset lc lines etc. 
-+        wpp_setup_lcs(s);
-
--        more_data = hls_coding_quadtree(s, x0, y0, log2_cb_size - 1, cb_depth + 1);
--        if (more_data < 0)
--            return more_data;
-+        rpi_run_one_line(s, lc, 1);  // Kicks off the other threads
-+#if TRACE_WPP
-+        printf("%s: Done 1st\n", __func__);
-+#endif
-
--        if (more_data && x1 < s->ps.sps->width) {
--            more_data = hls_coding_quadtree(s, x1, y0, log2_cb_size - 1, cb_depth + 1);
--            if (more_data < 0)
--                return more_data;
--        }
--        if (more_data && y1 < s->ps.sps->height) {
--            more_data = hls_coding_quadtree(s, x0, y1, log2_cb_size - 1, cb_depth + 1);
--            if (more_data < 0)
--                return more_data;
--        }
--        if (more_data && x1 < s->ps.sps->width &&
--            y1 < s->ps.sps->height) {
--            more_data = hls_coding_quadtree(s, x1, y1, log2_cb_size - 1, cb_depth + 1);
--            if (more_data < 0)
--                return more_data;
-+        while (lc->bt_line_no <= s->sh.num_entry_point_offsets) {
-+            rpi_sem_wait(&lc->bt_sem_in);
-+            rpi_run_one_line(s, lc, 0);
-         }
-+#if TRACE_WPP
-+        printf("%s: Done body\n", __func__);
-+#endif
-
--        if(((x0 + (1<<(log2_cb_size))) & qp_block_mask) == 0 &&
--           ((y0 + (1<<(log2_cb_size))) & qp_block_mask) == 0)
--            lc->qPy_pred = lc->qp_y;
-+        // Wait for everything else to finish
-+        rpi_sem_wait(&lc->bt_sem_in);
-
--        if (more_data)
--            return ((x1 + cb_size_split) < s->ps.sps->width ||
--                    (y1 + cb_size_split) < s->ps.sps->height);
--        else
--            return 0;
--    } else {
--        ret = hls_coding_unit(s, x0, y0, log2_cb_size);
--        if (ret < 0)
--            return ret;
--        if ((!((x0 + cb_size) %
--               (1 << (s->ps.sps->log2_ctb_size))) ||
--             (x0 + cb_size >= s->ps.sps->width)) &&
--            (!((y0 + cb_size) %
--               (1 << (s->ps.sps->log2_ctb_size))) ||
--             (y0 + cb_size >= s->ps.sps->height))) {
--            int end_of_slice_flag = ff_hevc_end_of_slice_flag_decode(s);
--            return !end_of_slice_flag;
--        } else {
--            return 1;
--        }
-+#if TRACE_WPP
-+        printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
-+#endif
-     }
-+    else
-+#endif
-+    {
-+#if TRACE_WPP
-+        printf("%s: Single start: ts=%d\n", __func__, lc->ts);
-+#endif
-+        // Single bit thread
-+        do {
-+            // Make sure we have space to prepare the next job
-+            worker_pass0_ready(s, lc);
-
--    return 0;
--}
-+            if ((err = fill_job(s, lc, 0)) < 0)
-+                goto fail;
-
--static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
--                                 int ctb_addr_ts)
--{
--    HEVCLocalContext *lc = s->HEVClc;
--    int ctb_size          = 1 << s->ps.sps->log2_ctb_size;
--    int ctb_addr_rs       = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
--    int ctb_addr_in_slice = ctb_addr_rs - s->sh.slice_addr;
-+            worker_submit_job(s, lc);
-+        } while (!lc->unit_done);
-
--    s->tab_slice_address[ctb_addr_rs] = s->sh.slice_addr;
-+#if TRACE_WPP
-+        printf("%s: Single end: ts=%d\n", __func__, lc->ts);
-+#endif
-+    }
-
--    if (s->ps.pps->entropy_coding_sync_enabled_flag) {
--        if (x_ctb == 0 && (y_ctb & (ctb_size - 1)) == 0)
--            lc->first_qp_group = 1;
--        lc->end_of_tiles_x = s->ps.sps->width;
--    } else if (s->ps.pps->tiles_enabled_flag) {
--        if (ctb_addr_ts && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[ctb_addr_ts - 1]) {
--            int idxX = s->ps.pps->col_idxX[x_ctb >> s->ps.sps->log2_ctb_size];
--            lc->end_of_tiles_x = x_ctb + (s->ps.pps->column_width[idxX] << s->ps.sps->log2_ctb_size);
--            lc->first_qp_group = 1;
--        }
--    } else {
--        lc->end_of_tiles_x = s->ps.sps->width;
-+    // If we have reached the end of the frame then wait for the worker to finish all its jobs
-+    if (lc->ts >= s->ps.sps->ctb_size) {
-+        worker_wait(s, lc);
-     }
-
--    lc->end_of_tiles_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height);
-+#if RPI_TSTATS
-+    {
-+        HEVCRpiStats *const ts = &s->tstats;
-
--    lc->boundary_flags = 0;
--    if (s->ps.pps->tiles_enabled_flag) {
--        if (x_ctb > 0 && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - 1]])
--            lc->boundary_flags |= BOUNDARY_LEFT_TILE;
--        if (x_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - 1])
--            lc->boundary_flags |= BOUNDARY_LEFT_SLICE;
--        if (y_ctb > 0 && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]])
--            lc->boundary_flags |= BOUNDARY_UPPER_TILE;
--        if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - s->ps.sps->ctb_width])
--            lc->boundary_flags |= BOUNDARY_UPPER_SLICE;
--    } else {
--        if (ctb_addr_in_slice <= 0)
--            lc->boundary_flags |= BOUNDARY_LEFT_SLICE;
--        if (ctb_addr_in_slice < s->ps.sps->ctb_width)
--            lc->boundary_flags |= BOUNDARY_UPPER_SLICE;
-+        printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n    B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n",
-+               ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0,
-+               ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge,
-+               ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0,
-+               ts->y_pred2_hgt16, ts->y_pred2_hle16);
-+        memset(ts, 0, sizeof(*ts));
-     }
-+#endif
-+
-+    return lc->ts;
-
--    lc->ctb_left_flag = ((x_ctb > 0) && (ctb_addr_in_slice > 0) && !(lc->boundary_flags & BOUNDARY_LEFT_TILE));
--    lc->ctb_up_flag   = ((y_ctb > 0) && (ctb_addr_in_slice >= s->ps.sps->ctb_width) && !(lc->boundary_flags & BOUNDARY_UPPER_TILE));
--    lc->ctb_up_right_flag = ((y_ctb > 0) && (ctb_addr_in_slice+1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1 - s->ps.sps->ctb_width]]));
--    lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0) && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
-+fail:
-+    // Cleanup
-+    av_log(s, AV_LOG_ERROR, "%s failed: err=%d\n", __func__, err);
-+    // Free our job & wait for termination
-+    worker_free(s, lc);
-+    worker_wait(s, lc);
-+    return err;
- }
-
-+
-+#endif // RPI
-+
- static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
- {
--    HEVCContext *s = avctxt->priv_data;
-+    HEVCContext * const s = avctxt->priv_data;
-+    HEVCLocalContext *const lc = s->HEVClc;
-     int ctb_size    = 1 << s->ps.sps->log2_ctb_size;
-     int more_data   = 1;
-     int x_ctb       = 0;
-     int y_ctb       = 0;
-     int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
--    int ret;
--
--    if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
--        av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
--        return AVERROR_INVALIDDATA;
--    }
-+    int err;
-
--    if (s->sh.dependent_slice_segment_flag) {
--        int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1];
--        if (s->tab_slice_address[prev_rs] != s->sh.slice_addr) {
--            av_log(s->avctx, AV_LOG_ERROR, "Previous slice segment missing\n");
--            return AVERROR_INVALIDDATA;
--        }
--    }
-+    // Start of slice
-+    if ((err = slice_start(s, lc)) != 0)
-+        return err;
-
-     while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) {
--        int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
-+        const int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
-
--        x_ctb = (ctb_addr_rs % ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size;
--        y_ctb = (ctb_addr_rs / ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) <<
s->ps.sps->log2_ctb_size; -- hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts); -+ x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size; -+ y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size; -+ hls_decode_neighbour(s, lc, x_ctb, y_ctb, ctb_addr_ts); - -- ret = ff_hevc_cabac_init(s, ctb_addr_ts); -- if (ret < 0) { -+ err = ff_hevc_cabac_init(s, lc, ctb_addr_ts); -+ if (err < 0) { - s->tab_slice_address[ctb_addr_rs] = -1; -- return ret; -+ return err; - } - -- hls_sao_param(s, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size); -+ hls_sao_param(s, lc, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size); - - s->deblock[ctb_addr_rs].beta_offset = s->sh.beta_offset; - s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset; - s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; - -- more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); -+ more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); -+ - if (more_data < 0) { - s->tab_slice_address[ctb_addr_rs] = -1; - return more_data; - } - -- - ctb_addr_ts++; -- ff_hevc_save_states(s, ctb_addr_ts); -+ ff_hevc_save_states(s, lc, ctb_addr_ts); - ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size); - } - -- if (x_ctb + ctb_size >= s->ps.sps->width && -- y_ctb + ctb_size >= s->ps.sps->height) -- ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size); -- - return ctb_addr_ts; - } - --static int hls_slice_data(HEVCContext *s) -+static int hls_slice_data(HEVCContext * const s, const H2645NAL * const nal) - { -- int arg[2]; -- int ret[2]; -+#ifdef RPI -+ // * We don't support cross_component_prediction_enabled_flag but as that -+ // must be 0 unless we have 4:4:4 there is no point testing for it as we -+ // only deal with sand which is never 4:4:4 -+ // [support wouldn't be hard] -+ // -+ // *** Really this wants to be set further out than here - we do not -+ // expect this to change mid-stream -+ -+ s->enable_rpi = -+ ((s->ps.sps->bit_depth == 8 && s->frame->format == AV_PIX_FMT_SAND128) || -+ (s->ps.sps->bit_depth == 10 && s->frame->format == AV_PIX_FMT_SAND64_10)); -+ -+ if (s->enable_rpi) -+ { -+ int err; -+ if ((err = gen_entry_points(s, nal)) < 0) -+ return err; -+ -+ return rpi_decode_entry(s->avctx, NULL); -+ } -+ else -+#endif -+ { -+ int arg[2]; -+ int ret[2]; - -- arg[0] = 0; -- arg[1] = 1; -+ arg[0] = 0; -+ arg[1] = 1; - -- s->avctx->execute(s->avctx, hls_decode_entry, arg, ret , 1, sizeof(int)); -- return ret[0]; -+ s->avctx->execute(s->avctx, hls_decode_entry, arg, ret , 1, sizeof(int)); -+ return ret[0]; -+ } - } -+ - static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int job, int self_id) - { - HEVCContext *s1 = avctxt->priv_data, *s; -@@ -2445,6 +5301,10 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int - s = s1->sList[self_id]; - lc = s->HEVClc; - -+#ifdef RPI -+ s->enable_rpi = 0; -+#endif -+ - if(ctb_row) { - ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]); - if (ret < 0) -@@ -2456,7 +5316,7 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int - int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size; - int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size; - -- hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts); -+ hls_decode_neighbour(s, s->HEVClc, x_ctb, y_ctb, ctb_addr_ts); - - 
ff_thread_await_progress2(s->avctx, ctb_row, thread, SHIFT_CTB_WPP); - -@@ -2465,11 +5325,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int - return 0; - } - -- ret = ff_hevc_cabac_init(s, ctb_addr_ts); -+ ret = ff_hevc_cabac_init(s, s->HEVClc, ctb_addr_ts); - if (ret < 0) - goto error; -- hls_sao_param(s, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size); -- more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); -+ hls_sao_param(s, s->HEVClc, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size); -+ more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); - - if (more_data < 0) { - ret = more_data; -@@ -2478,7 +5338,7 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int - - ctb_addr_ts++; - -- ff_hevc_save_states(s, ctb_addr_ts); -+ ff_hevc_save_states(s, s->HEVClc, ctb_addr_ts); - ff_thread_report_progress2(s->avctx, ctb_row, thread, 1); - ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size); - -@@ -2489,7 +5349,6 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int - } - - if ((x_ctb+ctb_size) >= s->ps.sps->width && (y_ctb+ctb_size) >= s->ps.sps->height ) { -- ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size); - ff_thread_report_progress2(s->avctx, ctb_row , thread, SHIFT_CTB_WPP); - return ctb_addr_ts; - } -@@ -2512,14 +5371,16 @@ error: - - static int hls_slice_data_wpp(HEVCContext *s, const H2645NAL *nal) - { -- const uint8_t *data = nal->data; -- int length = nal->size; -- HEVCLocalContext *lc = s->HEVClc; -+// const uint8_t *data = nal->data; -+// int length = nal->size; -+// HEVCLocalContext *lc = s->HEVClc; - int *ret = av_malloc_array(s->sh.num_entry_point_offsets + 1, sizeof(int)); - int *arg = av_malloc_array(s->sh.num_entry_point_offsets + 1, sizeof(int)); -- int64_t offset; -- int64_t startheader, cmpt = 0; -- int i, j, res = 0; -+// int64_t offset; -+// int64_t startheader, cmpt = 0; -+ int i; -+// int j; -+ int res = 0; - - if (!ret || !arg) { - av_free(ret); -@@ -2547,6 +5408,10 @@ static int hls_slice_data_wpp(HEVCContext *s, const H2645NAL *nal) - } - } - -+#if 1 -+ if ((res = gen_entry_points(s, nal)) != 0) -+ goto error; -+#else - offset = (lc->gb.index >> 3); - - for (j = 0, cmpt = 0, startheader = offset + s->sh.entry_point_offset[0]; j < nal->skipped_bytes; j++) { -@@ -2581,9 +5446,10 @@ static int hls_slice_data_wpp(HEVCContext *s, const H2645NAL *nal) - - } - s->data = data; -+#endif - - for (i = 1; i < s->threads_number; i++) { -- s->sList[i]->HEVClc->first_qp_group = 1; -+// s->sList[i]->HEVClc->first_qp_group = 1; - s->sList[i]->HEVClc->qp_y = s->sList[0]->HEVClc->qp_y; - memcpy(s->sList[i], s, sizeof(HEVCContext)); - s->sList[i]->HEVClc = s->HEVClcList[i]; -@@ -2745,9 +5611,8 @@ static int set_side_data(HEVCContext *s) - return 0; - } - --static int hevc_frame_start(HEVCContext *s) -+static int hevc_frame_start(HEVCContext * const s) - { -- HEVCLocalContext *lc = s->HEVClc; - int pic_size_in_ctb = ((s->ps.sps->width >> s->ps.sps->log2_min_cb_size) + 1) * - ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1); - int ret; -@@ -2763,9 +5628,6 @@ static int hevc_frame_start(HEVCContext *s) - - s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos); - -- if (s->ps.pps->tiles_enabled_flag) -- lc->end_of_tiles_x = s->ps.pps->column_width[0] << s->ps.sps->log2_ctb_size; -- - ret = ff_hevc_set_new_ref(s, &s->frame, s->poc); - if (ret < 0) - 
goto fail; -@@ -2806,8 +5668,7 @@ fail: - - static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) - { -- HEVCLocalContext *lc = s->HEVClc; -- GetBitContext *gb = &lc->gb; -+ GetBitContext * const gb = &s->HEVClc->gb; - int ctb_addr_ts, ret; - - *gb = nal->gb; -@@ -2857,6 +5718,37 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) - if (ret < 0) - return ret; - -+ // The definition of _N unit types is "non-reference for other frames -+ // with the same temporal_id" so they may/will be ref frames for pics -+ // with a higher temporal_id. -+ s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 || -+ !(s->nal_unit_type == HEVC_NAL_TRAIL_N || -+ s->nal_unit_type == HEVC_NAL_TSA_N || -+ s->nal_unit_type == HEVC_NAL_STSA_N || -+ s->nal_unit_type == HEVC_NAL_RADL_N || -+ s->nal_unit_type == HEVC_NAL_RASL_N); -+#ifdef RPI -+ s->offload_recon = s->used_for_ref; -+// s->offload_recon = 0; -+#endif -+ -+#if DEBUG_DECODE_N -+ { -+ static int z = 0; -+ if (IS_IDR(s)) { -+ z = 1; -+ } -+ if (z != 0 && z++ > DEBUG_DECODE_N) { -+ s->is_decoded = 0; -+ break; -+ } -+ } -+#endif -+ if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) { -+ s->is_decoded = 0; -+ break; -+ } -+ - if (s->sh.first_slice_in_pic_flag) { - if (s->max_ra == INT_MAX) { - if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) { -@@ -2915,7 +5807,7 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) - if (s->threads_number > 1 && s->sh.num_entry_point_offsets > 0) - ctb_addr_ts = hls_slice_data_wpp(s, nal); - else -- ctb_addr_ts = hls_slice_data(s); -+ ctb_addr_ts = hls_slice_data(s, nal); - if (ctb_addr_ts >= (s->ps.sps->ctb_width * s->ps.sps->ctb_height)) { - s->is_decoded = 1; - } -@@ -2988,10 +5880,22 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) - } - } - --fail: -- if (s->ref && s->threads_type == FF_THREAD_FRAME) -- ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); -- -+fail: // Also success path -+ if (s->ref != NULL) { -+ if (s->used_for_ref && s->threads_type == FF_THREAD_FRAME) { -+ ff_hevc_progress_signal_all_done(s); -+ } -+#ifdef RPI -+ // * Flush frame will become confused if we pass it something -+ // that doesn't have an expected number of planes (e.g. 400) -+ // So only flush if we are sure we can. -+ else if (s->enable_rpi) { -+ // Flush frame to real memory as we expect to be able to pass -+ // it straight on to mmal -+ flush_frame(s, s->frame); -+ } -+#endif -+ } - return ret; - } - -@@ -3193,9 +6097,174 @@ fail: - return AVERROR(ENOMEM); - } - -+#ifdef RPI -+ -+ -+static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe) -+{ -+ av_freep(&ipe->q); -+ gpu_free(&ipe->gptr); -+} -+ -+static HEVCRpiJob * job_new(void) -+{ -+ HEVCRpiJob * const jb = av_mallocz(sizeof(HEVCRpiJob)); -+ -+ // **** Offload init? 
-+ -+ ff_hevc_rpi_progress_init_wait(&jb->progress_wait); -+ -+ jb->intra.n = 0; -+ jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS); -+ -+ // * Sizeof the union structure might be overkill but at the moment it -+ // is correct (it certainly isn't going to be too small) -+ // *** really should add per ctu sync words to be accurate -+ -+ rpi_inter_pred_alloc(&jb->chroma_ip, -+ QPU_N_MAX, QPU_N_GRP, -+ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t), -+ QPU_C_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_c_t)); -+ rpi_inter_pred_alloc(&jb->luma_ip, -+ QPU_N_MAX, QPU_N_GRP, -+ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t), -+ QPU_Y_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_y_t)); -+ -+ return jb; -+} -+ -+static void job_delete(HEVCRpiJob * const jb) -+{ -+ worker_pic_free_one(jb); -+ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait); -+ av_freep(&jb->intra.cmds); -+ rpi_free_inter_pred(&jb->chroma_ip); -+ rpi_free_inter_pred(&jb->luma_ip); -+} -+ -+static void jbg_delete(HEVCRpiJobGlobal * const jbg) -+{ -+ HEVCRpiJob * jb; -+ -+ if (jbg == NULL) -+ return; -+ -+ jb = jbg->free1; -+ while (jb != NULL) -+ { -+ HEVCRpiJob * const jb2 = jb; -+ jb = jb2->next; -+ job_delete(jb2); -+ } -+ -+ pthread_mutex_destroy(&jbg->lock); -+ av_free(jbg); -+} -+ -+static HEVCRpiJobGlobal * jbg_new(unsigned int job_count) -+{ -+ HEVCRpiJobGlobal * const jbg = av_mallocz(sizeof(HEVCRpiJobGlobal)); -+ if (jbg == NULL) -+ return NULL; -+ -+ pthread_mutex_init(&jbg->lock, NULL); -+ -+ while (job_count-- != 0) -+ { -+ HEVCRpiJob * const jb = job_new(); -+ if (jb == NULL) -+ goto fail; -+ -+ jb->next = jbg->free1; -+ jbg->free1 = jb; -+ } -+ -+ return jbg; -+ -+fail: -+ jbg_delete(jbg); -+ return NULL; -+} -+ -+static void rpi_job_ctl_delete(HEVCRpiJobCtl * const jbc) -+{ -+ HEVCRpiJobGlobal * jbg; -+ -+ if (jbc == NULL) -+ return; -+ -+ jbg = jbc->jbg; -+ -+ if (jbc->jb1 != NULL) -+ job_delete(jbc->jb1); -+ -+ pthread_mutex_destroy(&jbc->in_lock); -+ pthread_mutex_destroy(&jbc->out_lock); -+ sem_destroy(&jbc->sem_out); -+ av_free(jbc); -+ -+ // Deref the global job context -+ if (jbg != NULL && atomic_fetch_add(&jbg->ref_count, -1) == 1) -+ jbg_delete(jbg); -+} -+ -+static HEVCRpiJobCtl * rpi_job_ctl_new(HEVCRpiJobGlobal *const jbg) -+{ -+ HEVCRpiJobCtl * const jbc = av_mallocz(sizeof(HEVCRpiJobCtl)); -+ -+ if (jbc == NULL) -+ return NULL; -+ -+ jbc->jbg = jbg; -+ atomic_fetch_add(&jbg->ref_count, 1); -+ -+ sem_init(&jbc->sem_out, 0, RPI_MAX_JOBS); -+ pthread_mutex_init(&jbc->in_lock, NULL); -+ pthread_mutex_init(&jbc->out_lock, NULL); -+ -+ if ((jbc->jb1 = job_new()) == NULL) -+ goto fail; -+ jbc->jb1->jbc_local = jbc; -+ -+ return jbc; -+ -+fail: -+ rpi_job_ctl_delete(jbc); -+ return NULL; -+} -+ -+ -+ -+static av_cold void hevc_init_worker(HEVCContext * const s) -+{ -+#if RPI_PASSES == 2 -+ pass_queue_init(s->passq + 1, s, worker_core2, &s->jbc->sem_out, 1); -+#elif RPI_PASSES == 3 -+ pass_queue_init(s->passq + 2, s, rpi_execute_dblk_cmds, &s->jbc->sem_out, 2); -+ pass_queue_init(s->passq + 1, s, rpi_execute_pred_cmds, &s->passq[2].sem_in, 1); -+#else -+#error Passes confused -+#endif -+ pass_queue_init(s->passq + 0, s, worker_core, &s->passq[1].sem_in, 0); -+ -+ pass_queues_start_all(s); -+} -+ -+static av_cold void hevc_exit_worker(HEVCContext *s) -+{ -+ pass_queues_term_all(s); -+ -+ pass_queues_kill_all(s); -+ -+ rpi_job_ctl_delete(s->jbc); -+ s->jbc = NULL; -+} -+ -+#endif -+ - static av_cold int hevc_decode_free(AVCodecContext *avctx) - { -- HEVCContext *s = avctx->priv_data; -+ HEVCContext * const s = 
avctx->priv_data;
-     int i;
-
-     pic_arrays_free(s);
-@@ -3204,10 +6273,22 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-
-     av_freep(&s->cabac_state);
-
--    for (i = 0; i < 3; i++) {
--        av_freep(&s->sao_pixel_buffer_h[i]);
--        av_freep(&s->sao_pixel_buffer_v[i]);
-+#ifdef RPI
-+#if RPI_EXTRA_BIT_THREADS
-+    bit_threads_kill(s);
-+#endif
-+
-+    hevc_exit_worker(s);
-+    vpu_qpu_term();
-+    for (i = 0; i != 2; ++i) {
-+        ff_hevc_rpi_progress_kill_state(s->progress_states + i);
-     }
-+    job_lc_kill(s->HEVClc);
-+    av_rpi_zc_uninit(avctx);
-+#endif
-+
-+    av_freep(&s->sao_pixel_buffer_h[0]);  // [1] & [2] allocated with [0]
-+    av_freep(&s->sao_pixel_buffer_v[0]);
-     av_frame_free(&s->output_frame);
-
-     for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-@@ -3230,21 +6311,23 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
-     av_freep(&s->sh.size);
-
-     for (i = 1; i < s->threads_number; i++) {
--        HEVCLocalContext *lc = s->HEVClcList[i];
--        if (lc) {
--            av_freep(&s->HEVClcList[i]);
-+        if (s->sList[i] != NULL) {
-             av_freep(&s->sList[i]);
-         }
-     }
--    if (s->HEVClc == s->HEVClcList[0])
--        s->HEVClc = NULL;
--    av_freep(&s->HEVClcList[0]);
-+
-+    // Free separately from sLists as used that way by RPI WPP
-+    for (i = 0; i < MAX_NB_THREADS && s->HEVClcList[i] != NULL; ++i) {
-+        av_freep(s->HEVClcList + i);
-+    }
-+    s->HEVClc = NULL;  // Allocated as part of HEVClcList
-
-     ff_h2645_packet_uninit(&s->pkt);
-
-     return 0;
- }
-
-+
- static av_cold int hevc_init_context(AVCodecContext *avctx)
- {
-     HEVCContext *s = avctx->priv_data;
-@@ -3258,6 +6341,38 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-     s->HEVClcList[0] = s->HEVClc;
-     s->sList[0] = s;
-
-+#ifdef RPI
-+    // Whilst FFmpeg's init fn is only called once, the close fn is called as
-+    // many times as we have threads (init_thread_copy is called for the
-+    // threads). So to match init & term put the init here where it will be
-+    // called by both init & copy
-+    av_rpi_zc_init(avctx);
-+
-+    if (vpu_qpu_init() != 0)
-+        goto fail;
-+
-+#if RPI_INTER
-+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
-+    {
-+        static const uint32_t dframe[1] = {0x80808080};
-+        s->qpu_dummy_frame_emu = (const uint8_t *)dframe;
-+    }
-+#endif
-+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
-+    s->qpu_dummy_frame_qpu = qpu_fn(mc_start);  // Use our code as a dummy frame
-+#endif
-+#endif
-+    //gpu_malloc_uncached(2048*64,&s->dummy);
-+
-+    s->enable_rpi = 0;
-+    bt_lc_init(s, s->HEVClc, 0);
-+    job_lc_init(s->HEVClc);
-+
-+    for (i = 0; i != 2; ++i) {
-+        ff_hevc_rpi_progress_init_state(s->progress_states + i);
-+    }
-+#endif
-+
-     s->cabac_state = av_malloc(HEVC_CONTEXTS);
-     if (!s->cabac_state)
-         goto fail;
-@@ -3271,6 +6386,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-         if (!s->DPB[i].frame)
-             goto fail;
-         s->DPB[i].tf.f = s->DPB[i].frame;
-+        s->DPB[i].dpb_no = i;
-     }
-
-     s->max_ra = INT_MAX;
-@@ -3289,6 +6405,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
-     return 0;
-
- fail:
-+    av_log(s, AV_LOG_ERROR, "%s: Failed\n", __func__);
-     hevc_decode_free(avctx);
-     return AVERROR(ENOMEM);
- }
-@@ -3372,6 +6489,14 @@ static int hevc_update_thread_context(AVCodecContext *dst,
-     s->sei.content_light = s0->sei.content_light;
-     s->sei.alternative_transfer = s0->sei.alternative_transfer;
-
-+#ifdef RPI
-+    if (s->jbc == NULL)
-+    {
-+        av_assert0((s->jbc = rpi_job_ctl_new(s0->jbc->jbg)) != NULL);
-+        hevc_init_worker(s);
-+    }
-+#endif
-+
-     return 0;
- }
-
-@@ -3382,10 +6507,31 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx)
-
-     avctx->internal->allocate_progress = 1;
-
-+#ifdef RPI
-+    {
-+        HEVCRpiJobGlobal * const jbg = jbg_new(FFMAX(avctx->thread_count * 3, 5));
-+        if (jbg == NULL)
-+        {
-+            av_log(s, AV_LOG_ERROR, "%s: Job global init failed\n", __func__);
-+            return -1;
-+        }
-+
-+        if ((s->jbc = rpi_job_ctl_new(jbg)) == NULL)
-+        {
-+            av_log(s, AV_LOG_ERROR, "%s: Job ctl init failed\n", __func__);
-+            return -1;
-+        }
-+    }
-+#endif
-+
-     ret = hevc_init_context(avctx);
-     if (ret < 0)
-         return ret;
-
-+#ifdef RPI
-+    hevc_init_worker(s);
-+#endif
-+
-     s->enable_parallel_tiles = 0;
-     s->sei.picture_timing.picture_struct = 0;
-     s->eos = 1;
-@@ -3406,9 +6552,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx)
-     }
-
-     if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1)
--        s->threads_type = FF_THREAD_FRAME;
--    else
--        s->threads_type = FF_THREAD_SLICE;
-+        s->threads_type = FF_THREAD_FRAME;
-+    else
-+        s->threads_type = FF_THREAD_SLICE;
-
-     return 0;
- }
-@@ -3467,7 +6613,16 @@ AVCodec ff_hevc_decoder = {
-     .update_thread_context = hevc_update_thread_context,
-     .init_thread_copy      = hevc_init_thread_copy,
-     .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
-+#if 0
-+                             // Debugging is often easier without threads getting in the way
-+                             0,
-+#warning H265 threading turned off
-+#elif defined(RPI)
-+                             // We only have decent optimisation for frame - so only admit to that
-+                             AV_CODEC_CAP_FRAME_THREADS,
-+#else
-                              AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS,
-+#endif
-     .caps_internal         = FF_CODEC_CAP_INIT_THREADSAFE | FF_CODEC_CAP_EXPORTS_CROPPING,
-     .profiles              = NULL_IF_CONFIG_SMALL(ff_hevc_profiles),
- };
-diff --git a/libavcodec/hevcdec.h b/libavcodec/hevcdec.h
-index 293beb7083..38caf6388a 100644
---- a/libavcodec/hevcdec.h
-+++ b/libavcodec/hevcdec.h
-@@ -24,6 +24,9 @@
- #define AVCODEC_HEVCDEC_H
-
- #include <stdatomic.h>
-+#ifdef RPI
-+#include <semaphore.h>
-+#endif
-
- #include "libavutil/buffer.h" - -@@ -55,6 +58,8 @@ - - #define MRG_MAX_NUM_CANDS 5 - -+#define HEVC_MAX_CTB_SIZE (1 << HEVC_MAX_LOG2_CTB_SIZE) // 64 -+ - #define L0 0 - #define L1 1 - -@@ -251,17 +256,6 @@ typedef struct CodingUnit { - uint8_t cu_transquant_bypass_flag; - } CodingUnit; - --typedef struct Mv { -- int16_t x; ///< horizontal component of motion vector -- int16_t y; ///< vertical component of motion vector --} Mv; -- --typedef struct MvField { -- DECLARE_ALIGNED(4, Mv, mv)[2]; -- int8_t ref_idx[2]; -- int8_t pred_flag; --} MvField; -- - typedef struct NeighbourAvailable { - int cand_bottom_left; - int cand_left; -@@ -298,8 +292,8 @@ typedef struct TransformUnit { - } TransformUnit; - - typedef struct DBParams { -- int beta_offset; -- int tc_offset; -+ int8_t beta_offset; // -12 to +12 -+ int8_t tc_offset; // -12 to +12 - } DBParams; - - #define HEVC_FRAME_FLAG_OUTPUT (1 << 0) -@@ -307,6 +301,8 @@ typedef struct DBParams { - #define HEVC_FRAME_FLAG_LONG_REF (1 << 2) - #define HEVC_FRAME_FLAG_BUMPING (1 << 3) - -+struct HEVCRpiJob; -+ - typedef struct HEVCFrame { - AVFrame *frame; - ThreadFrame tf; -@@ -334,14 +330,59 @@ typedef struct HEVCFrame { - * A combination of HEVC_FRAME_FLAG_* - */ - uint8_t flags; -+ -+ // Entry no in DPB - can be used as a small unique -+ // frame identifier (within the current thread) -+ uint8_t dpb_no; - } HEVCFrame; - -+#ifdef RPI -+typedef struct HEVCLocalContextIntra { -+ TransformUnit tu; -+ NeighbourAvailable na; -+} HEVCLocalContextIntra; -+#endif -+ - typedef struct HEVCLocalContext { -+ TransformUnit tu; // Moved to start to match HEVCLocalContextIntra (yuk!) -+ NeighbourAvailable na; -+ -+#ifdef RPI -+ // Vars that allow us to locate everything from just an lc -+ struct HEVCContext * context; // ??? make const ??? -+ unsigned int lc_n; // lc list el no -+ -+ // Job wait links -+ struct HEVCLocalContext * jw_next; -+ struct HEVCLocalContext * jw_prev; -+ struct HEVCLocalContext * ljw_next; -+ struct HEVCLocalContext * ljw_prev; -+ struct HEVCRpiJob * volatile jw_job; -+ sem_t jw_sem; -+ -+ // ?? Wrap in structure ?? 
-+    sem_t bt_sem_in;
-+    sem_t * bt_psem_out;
-+    volatile int bt_terminate;
-+    unsigned int ts;
-+    unsigned int bt_last_line;  // Last line in this bit_thread chunk
-+    unsigned int bt_line_no;
-+    unsigned int bt_line_width;
-+    unsigned int bt_line_inc;
-+
-+    struct HEVCRpiJob * jb0;
-+    char unit_done;  // Set once we have dealt with this slice
-+//    char max_done;
-+    char bt_is_tile;
-+    char last_progress_good;
-+#endif
-+    char wpp_init;  // WPP/Tile bitstream init has happened
-+
-     uint8_t cabac_state[HEVC_CONTEXTS];
-
-     uint8_t stat_coeff[4];
-
--    uint8_t first_qp_group;
-+//    uint8_t first_qp_group;
-
-     GetBitContext gb;
-     CABACContext cc;
-@@ -351,8 +392,6 @@ typedef struct HEVCLocalContext {
-
-     int qPy_pred;
-
--    TransformUnit tu;
-
-     uint8_t ctb_left_flag;
-     uint8_t ctb_up_flag;
-     uint8_t ctb_up_right_flag;
-@@ -368,7 +407,6 @@ typedef struct HEVCLocalContext {
-     int ct_depth;
-     CodingUnit cu;
-     PredictionUnit pu;
--    NeighbourAvailable na;
-
- #define BOUNDARY_LEFT_SLICE (1 << 0)
- #define BOUNDARY_LEFT_TILE (1 << 1)
-@@ -379,6 +417,242 @@ typedef struct HEVCLocalContext {
-     int boundary_flags;
- } HEVCLocalContext;
-
-+#ifdef RPI
-+
-+// This is the number of _extra_ bit threads - so we will have
-+// RPI_EXTRA_BIT_THREADS+1 threads actually doing the processing
-+//#define RPI_EXTRA_BIT_THREADS 0
-+#define RPI_EXTRA_BIT_THREADS 2
-+
-+// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code
-+// Various buffer sizes depend on this so do not over allocate
-+#define RPI_MAX_WIDTH 2048
-+
-+// Each block can have an intra prediction and an add_residual command
-+// noof-cmds(2) * max-ctu height(64) / min-transform(4) * planes(3) * MAX_WIDTH
-+#if RPI_HEVC_SAND
-+// Sand only has 2 planes (Y/C)
-+#define RPI_MAX_PRED_CMDS (2*(HEVC_MAX_CTB_SIZE/4)*2*(RPI_MAX_WIDTH/4))
-+#else
-+#define RPI_MAX_PRED_CMDS (2*(HEVC_MAX_CTB_SIZE/4)*3*(RPI_MAX_WIDTH/4))
-+#endif
-+
-+#ifdef RPI_DEBLOCK_VPU
-+// Worst case is 16x16 CTUs
-+#define RPI_MAX_DEBLOCK_CMDS (RPI_MAX_WIDTH*4/16)
-+#endif
-+
-+// Command for intra prediction and transform_add of predictions to coefficients
-+enum rpi_pred_cmd_e
-+{
-+    RPI_PRED_ADD_RESIDUAL,
-+    RPI_PRED_ADD_RESIDUAL_U,  // = RPI_PRED_TRANSFORM_ADD + c_idx
-+    RPI_PRED_ADD_RESIDUAL_V,  // = RPI_PRED_TRANSFORM_ADD + c_idx
-+    RPI_PRED_ADD_RESIDUAL_C,  // Merged U+V
-+    RPI_PRED_ADD_DC,
-+    RPI_PRED_ADD_DC_U,        // Both U & V are effectively C
-+    RPI_PRED_ADD_DC_V,
-+    RPI_PRED_INTRA,
-+    RPI_PRED_I_PCM,
-+    RPI_PRED_CMD_MAX
-+};
-+
-+typedef struct HEVCPredCmd {
-+    uint8_t type;
-+    uint8_t size;   // log2 "size" used by all variants
-+    uint8_t na;     // i_pred - but left here as they pack well
-+    uint8_t c_idx;  // i_pred
-+    union {
-+        struct {  // TRANSFORM_ADD
-+            uint8_t * dst;
-+            const int16_t * buf;
-+            uint16_t stride;  // Should be good enough for all pic fmts we use
-+            int16_t dc;
-+        } ta;
-+        struct {
-+            uint8_t * dst;
-+            uint32_t stride;
-+            int dc;
-+        } dc;
-+        struct {  // INTRA
-+            uint16_t x;
-+            uint16_t y;
-+            enum IntraPredMode mode;
-+        } i_pred;
-+        struct {  // I_PCM
-+            uint16_t x;
-+            uint16_t y;
-+            const void * src;
-+            uint32_t src_len;
-+        } i_pcm;
-+    };
-+} HEVCPredCmd;
-+
-+#endif
-+
-+#ifdef RPI
-+
-+union qpu_mc_pred_cmd_s;
-+struct qpu_mc_pred_y_p_s;
-+struct qpu_mc_src_s;
-+
-+typedef struct HEVCRpiInterPredQ
-+{
-+    union qpu_mc_pred_cmd_u *qpu_mc_base;
-+    union qpu_mc_pred_cmd_u *qpu_mc_curr;
-+    struct qpu_mc_src_s *last_l0;
-+    struct qpu_mc_src_s *last_l1;
-+    unsigned int load;
-+    uint32_t code_setup;
-+    uint32_t code_sync;
-+    uint32_t code_exit;
-+}
HEVCRpiInterPredQ; -+ -+typedef struct HEVCRpiInterPredEnv -+{ -+ HEVCRpiInterPredQ * q; -+ uint8_t n; // Number of Qs -+ uint8_t n_grp; // Number of Q in a group -+ uint8_t curr; // Current Q number (0..n-1) -+ uint8_t used; // 0 if nothing in any Q, 1 otherwise -+ uint8_t used_grp; // 0 if nothing in any Q in the current group -+ unsigned int max_fill; -+ unsigned int min_gap; -+ GPU_MEM_PTR_T gptr; -+} HEVCRpiInterPredEnv; -+ -+typedef struct HEVCRpiIntraPredEnv { -+ unsigned int n; // Number of commands -+ HEVCPredCmd * cmds; -+} HEVCRpiIntraPredEnv; -+ -+typedef struct HEVCRpiCoeffEnv { -+ unsigned int n; -+ int16_t * buf; -+} HEVCRpiCoeffEnv; -+ -+typedef struct HEVCRpiCoeffsEnv { -+ HEVCRpiCoeffEnv s[4]; -+ GPU_MEM_PTR_T gptr; -+ void * mptr; -+} HEVCRpiCoeffsEnv; -+ -+typedef struct HEVCRPiFrameProgressWait { -+ int req; -+ struct HEVCRPiFrameProgressWait * next; -+ sem_t sem; -+} HEVCRPiFrameProgressWait; -+ -+typedef struct HEVCRPiFrameProgressState { -+ struct HEVCRPiFrameProgressWait * first; -+ struct HEVCRPiFrameProgressWait * last; -+ pthread_mutex_t lock; -+} HEVCRPiFrameProgressState; -+ -+typedef struct RpiBlk -+{ -+ unsigned int x; -+ unsigned int y; -+ unsigned int w; -+ unsigned int h; -+} RpiBlk; -+ -+typedef struct HEVCRpiJob { -+ struct HEVCRpiJob * next; // Free chain -+ struct HEVCRpiJobCtl * jbc_local; -+ const HEVCSPS * sps; // sps used to set up this job -+ -+ int waited; -+ int ctu_ts_first; -+ int ctu_ts_last; -+ RpiBlk bounds; // Bounding box of job -+ -+ struct qpu_mc_pred_y_p_s * last_y8_p; -+ struct qpu_mc_src_s * last_y8_l1; -+ -+ HEVCRpiInterPredEnv chroma_ip; -+ HEVCRpiInterPredEnv luma_ip; -+ int16_t progress[32]; // index by dpb_no -+ HEVCRpiIntraPredEnv intra; -+ HEVCRpiCoeffsEnv coeffs; -+ HEVCRPiFrameProgressWait progress_wait; -+} HEVCRpiJob; -+ -+struct HEVCContext; -+ -+typedef void HEVCRpiWorkerFn(struct HEVCContext * const s, HEVCRpiJob * const jb); -+ -+typedef struct HEVCRpiPassQueue -+{ -+// int pending; -+ volatile int terminate; -+ sem_t sem_in; -+ sem_t * psem_out; -+ void * job_n; // cas takes void * so we need to store as such (but really int) -+ struct HEVCContext * context; // Context pointer as we get to pass a single "void * this" to the thread -+ HEVCRpiWorkerFn * worker; -+ pthread_t thread; -+ uint8_t pass_n; // Pass number - debug -+ uint8_t started; -+} HEVCRpiPassQueue; -+ -+ -+struct HEVCRpiJobGlobal; -+ -+typedef struct HEVCRpiJobCtl -+{ -+ sem_t sem_out; -+ -+ HEVCRpiJob * volatile jb1; // The job associated with this frame if unallocated - NULL if allocated -+ struct HEVCRpiJobGlobal * jbg; -+ -+ HEVCLocalContext * lcw_head; -+ HEVCLocalContext * lcw_tail; -+ -+ pthread_mutex_t in_lock; -+ int offload_in; -+ pthread_mutex_t out_lock; -+ int offload_out; -+ -+ HEVCRpiJob *offloadq[RPI_MAX_JOBS]; -+} HEVCRpiJobCtl; -+ -+ -+typedef struct HEVCRpiJobGlobal -+{ -+ intptr_t ref_count; -+ pthread_mutex_t lock; -+ HEVCRpiJob * free1; -+ HEVCLocalContext * wait_head; -+ HEVCLocalContext * wait_tail; -+ -+} HEVCRpiJobGlobal; -+ -+#define RPI_BIT_THREADS (RPI_EXTRA_BIT_THREADS + 1) -+ -+#if RPI_TSTATS -+typedef struct HEVCRpiStats { -+ int y_pred1_y8_merge; -+ int y_pred1_xy; -+ int y_pred1_x0; -+ int y_pred1_y0; -+ int y_pred1_x0y0; -+ int y_pred1_wle8; -+ int y_pred1_wgt8; -+ int y_pred1_hle16; -+ int y_pred1_hgt16; -+ int y_pred2_xy; -+ int y_pred2_x0; -+ int y_pred2_y0; -+ int y_pred2_x0y0; -+ int y_pred2_hle16; -+ int y_pred2_hgt16; -+} HEVCRpiStats; -+#endif -+ -+#define RPI_PASSES 3 -+#endif -+ - typedef struct HEVCContext { 
- const AVClass *c; // needed by private avoptions - AVCodecContext *avctx; -@@ -394,6 +668,63 @@ typedef struct HEVCContext { - int width; - int height; - -+ char used_for_ref; // rpi -+#ifdef RPI -+ char offload_recon; -+ char enable_rpi; -+ int max_ctu_count; // Number of CTUs when we trigger a round of processing -+ -+ HEVCRpiJobCtl * jbc; -+ -+ HEVCRpiPassQueue passq[RPI_PASSES]; -+#if RPI_TSTATS -+ HEVCRpiStats tstats; -+#endif -+#if RPI_INTER -+ -+ // Function pointers -+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C -+ const uint8_t * qpu_dummy_frame_emu; -+#endif -+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C -+ uint32_t qpu_dummy_frame_qpu; // Not a frame - just a bit of memory -+#endif -+ HEVCRpiQpu qpu; -+#endif -+ -+#ifdef RPI_DEBLOCK_VPU -+#define RPI_DEBLOCK_VPU_Q_COUNT 2 -+ int enable_rpi_deblock; -+ -+ int uv_setup_width; -+ int uv_setup_height; -+ int setup_width; // Number of 16x16 blocks across the image -+ int setup_height; // Number of 16x16 blocks down the image -+ -+ struct dblk_vpu_q_s -+ { -+ GPU_MEM_PTR_T deblock_vpu_gmem; -+ -+ uint8_t (*y_setup_arm)[2][2][2][4]; -+ uint8_t (*y_setup_vc)[2][2][2][4]; -+ -+ uint8_t (*uv_setup_arm)[2][2][2][4]; // Half of this is unused [][][1][], but easier for the VPU as it allows us to store with zeros and addresses are aligned -+ uint8_t (*uv_setup_vc)[2][2][2][4]; -+ -+ int (*vpu_cmds_arm)[6]; // r0-r5 for each command -+ int vpu_cmds_vc; -+ -+ vpu_qpu_wait_h cmd_id; -+ } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT]; -+ -+ struct dblk_vpu_q_s * dvq; -+ unsigned int dvq_n; -+ -+#endif -+ HEVCLocalContextIntra HEVClcIntra; -+ HEVCRPiFrameProgressState progress_states[2]; -+#endif -+ - uint8_t *cabac_state; - - /** 1 if the independent slice segment header was successfully parsed */ -@@ -482,6 +813,14 @@ typedef struct HEVCContext { - int nuh_layer_id; - - HEVCSEIContext sei; -+ -+#ifdef RPI -+#if RPI_EXTRA_BIT_THREADS > 0 -+ int bt_started; -+ // This simply contains thread descriptors - task setup is held elsewhere -+ pthread_t bit_threads[RPI_EXTRA_BIT_THREADS]; -+#endif -+#endif - } HEVCContext; - - /** -@@ -494,7 +833,7 @@ void ff_hevc_clear_refs(HEVCContext *s); - */ - void ff_hevc_flush_dpb(HEVCContext *s); - --RefPicList *ff_hevc_get_ref_list(HEVCContext *s, HEVCFrame *frame, -+const RefPicList *ff_hevc_get_ref_list(const HEVCContext * const s, const HEVCFrame * const ref, - int x0, int y0); - - /** -@@ -507,6 +846,39 @@ int ff_hevc_frame_rps(HEVCContext *s); - */ - int ff_hevc_slice_rpl(HEVCContext *s); - -+void ff_hevc_save_states(HEVCContext *s, const HEVCLocalContext * const lc, int ctb_addr_ts); -+int ff_hevc_cabac_init(const HEVCContext * const s, HEVCLocalContext *const lc, int ctb_addr_ts); -+int ff_hevc_sao_merge_flag_decode(HEVCLocalContext * const lc); -+int ff_hevc_sao_type_idx_decode(HEVCLocalContext * const lc); -+int ff_hevc_sao_band_position_decode(HEVCLocalContext * const lc); -+int ff_hevc_sao_offset_abs_decode(const HEVCContext * const s, HEVCLocalContext * const lc); -+int ff_hevc_sao_offset_sign_decode(HEVCLocalContext * const lc); -+int ff_hevc_sao_eo_class_decode(HEVCLocalContext * const lc); -+int ff_hevc_end_of_slice_flag_decode(HEVCLocalContext * const lc); -+int ff_hevc_cu_transquant_bypass_flag_decode(HEVCLocalContext * const lc); -+int ff_hevc_skip_flag_decode(const HEVCContext * const s, HEVCLocalContext * const lc, -+ const int x0, const int y0, const int x_cb, const int y_cb); -+int ff_hevc_pred_mode_decode(HEVCLocalContext * const lc); -+int ff_hevc_split_coding_unit_flag_decode(const HEVCContext * const s, 
HEVCLocalContext * const lc, const int ct_depth, -+ const int x0, const int y0); -+int ff_hevc_part_mode_decode(const HEVCContext * const s, HEVCLocalContext * const lc, const int log2_cb_size); -+int ff_hevc_pcm_flag_decode(HEVCLocalContext * const lc); -+int ff_hevc_prev_intra_luma_pred_flag_decode(HEVCLocalContext * const lc); -+int ff_hevc_mpm_idx_decode(HEVCLocalContext * const lc); -+int ff_hevc_rem_intra_luma_pred_mode_decode(HEVCLocalContext * const lc); -+int ff_hevc_intra_chroma_pred_mode_decode(HEVCLocalContext * const lc); -+int ff_hevc_merge_idx_decode(const HEVCContext * const s, HEVCLocalContext * const lc); -+int ff_hevc_merge_flag_decode(HEVCLocalContext * const lc); -+int ff_hevc_inter_pred_idc_decode(HEVCLocalContext * const lc, int nPbW, int nPbH); -+int ff_hevc_ref_idx_lx_decode(HEVCLocalContext * const lc, const int num_ref_idx_lx); -+int ff_hevc_mvp_lx_flag_decode(HEVCLocalContext * const lc); -+int ff_hevc_no_residual_syntax_flag_decode(HEVCLocalContext * const lc); -+int ff_hevc_split_transform_flag_decode(HEVCLocalContext * const lc, const int log2_trafo_size); -+int ff_hevc_cbf_cb_cr_decode(HEVCLocalContext * const lc, const int trafo_depth); -+int ff_hevc_cbf_luma_decode(HEVCLocalContext * const lc, const int trafo_depth); -+int ff_hevc_log2_res_scale_abs(HEVCLocalContext * const lc, const int idx); -+int ff_hevc_res_scale_sign_flag(HEVCLocalContext *const lc, const int idx); -+#if 0 - void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts); - int ff_hevc_cabac_init(HEVCContext *s, int ctb_addr_ts); - int ff_hevc_sao_merge_flag_decode(HEVCContext *s); -@@ -539,6 +911,8 @@ int ff_hevc_cbf_cb_cr_decode(HEVCContext *s, int trafo_depth); - int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth); - int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx); - int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx); -+>>>>>>> n3.4 -+#endif - - /** - * Get the number of candidate references for the current frame. 
-@@ -557,33 +931,119 @@ void ff_hevc_bump_frame(HEVCContext *s); - - void ff_hevc_unref_frame(HEVCContext *s, HEVCFrame *frame, int flags); - --void ff_hevc_set_neighbour_available(HEVCContext *s, int x0, int y0, -- int nPbW, int nPbH); --void ff_hevc_luma_mv_merge_mode(HEVCContext *s, int x0, int y0, -- int nPbW, int nPbH, int log2_cb_size, -- int part_idx, int merge_idx, MvField *mv); --void ff_hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, -- int nPbW, int nPbH, int log2_cb_size, -- int part_idx, int merge_idx, -- MvField *mv, int mvp_lx_flag, int LX); --void ff_hevc_set_qPy(HEVCContext *s, int xBase, int yBase, -- int log2_cb_size); --void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, -+void ff_hevc_set_neighbour_available(const HEVCContext * const s, HEVCLocalContext * const lc, const int x0, const int y0, -+ const int nPbW, const int nPbH); -+void ff_hevc_luma_mv_merge_mode(const HEVCContext * const s, HEVCLocalContext * const lc, int x0, int y0, int nPbW, -+ int nPbH, int log2_cb_size, int part_idx, -+ int merge_idx, MvField * const mv); -+void ff_hevc_luma_mv_mvp_mode(const HEVCContext * const s, HEVCLocalContext *lc, int x0, int y0, int nPbW, -+ int nPbH, int log2_cb_size, int part_idx, -+ int merge_idx, MvField * const mv, -+ int mvp_lx_flag, int LX); -+void ff_hevc_set_qPy(const HEVCContext * const s, HEVCLocalContext * const lc, int xBase, int yBase, int log2_cb_size); -+void ff_hevc_deblocking_boundary_strengths(const HEVCContext * const s, HEVCLocalContext * const lc, int x0, int y0, - int log2_trafo_size); --int ff_hevc_cu_qp_delta_sign_flag(HEVCContext *s); --int ff_hevc_cu_qp_delta_abs(HEVCContext *s); --int ff_hevc_cu_chroma_qp_offset_flag(HEVCContext *s); --int ff_hevc_cu_chroma_qp_offset_idx(HEVCContext *s); --void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size); -+int ff_hevc_cu_qp_delta_sign_flag(HEVCLocalContext * const lc); -+int ff_hevc_cu_qp_delta_abs(HEVCLocalContext * const lc); -+int ff_hevc_cu_chroma_qp_offset_flag(HEVCLocalContext * const lc); -+int ff_hevc_cu_chroma_qp_offset_idx(const HEVCContext * const s, HEVCLocalContext * const lc); -+void ff_hevc_hls_filter(HEVCContext * const s, const int x, const int y, const int ctb_size); - void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size); --void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, -- int log2_trafo_size, enum ScanType scan_idx, -- int c_idx); -+void ff_hevc_hls_residual_coding(const HEVCContext * const s, HEVCLocalContext * const lc, -+ const int x0, const int y0, -+ const int log2_trafo_size, const enum ScanType scan_idx, -+ const int c_idx); -+ -+void ff_hevc_hls_mvd_coding(HEVCLocalContext * const lc); - --void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size); - - extern const uint8_t ff_hevc_qpel_extra_before[4]; - extern const uint8_t ff_hevc_qpel_extra_after[4]; - extern const uint8_t ff_hevc_qpel_extra[4]; - -+#ifdef RPI -+int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n); -+ -+// arm/hevc_misc_neon.S -+// Neon coeff zap fn -+#if HAVE_NEON -+extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2); -+#endif -+ -+void ff_hevc_rpi_progress_wait_field(const HEVCContext * const s, HEVCRpiJob * const jb, -+ const HEVCFrame * const ref, const int val, const int field); -+ -+void ff_hevc_rpi_progress_signal_field(HEVCContext * const s, const int val, const int field); -+ -+// All of these expect that s->threads_type == FF_THREAD_FRAME -+ 
-+static inline void ff_hevc_progress_wait_mv(const HEVCContext * const s, HEVCRpiJob * const jb, -+ const HEVCFrame * const ref, const int y) -+{ -+ if (s->enable_rpi) -+ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1); -+ else -+ ff_thread_await_progress((ThreadFrame*)&ref->tf, y, 0); -+} -+ -+static inline void ff_hevc_progress_signal_mv(HEVCContext * const s, const int y) -+{ -+ if (s->enable_rpi && s->used_for_ref) -+ ff_hevc_rpi_progress_signal_field(s, y, 1); -+} -+ -+static inline void ff_hevc_progress_wait_recon(const HEVCContext * const s, HEVCRpiJob * const jb, -+ const HEVCFrame * const ref, const int y) -+{ -+ if (s->enable_rpi) -+ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0); -+ else -+ ff_thread_await_progress((ThreadFrame*)&ref->tf, y, 0); -+} -+ -+static inline void ff_hevc_progress_signal_recon(HEVCContext * const s, const int y) -+{ -+ if (s->used_for_ref) -+ { -+ if (s->enable_rpi) -+ ff_hevc_rpi_progress_signal_field(s, y, 0); -+ else -+ ff_thread_report_progress(&s->ref->tf, y, 0); -+ } -+} -+ -+static inline void ff_hevc_progress_signal_all_done(HEVCContext * const s) -+{ -+ if (s->enable_rpi) -+ { -+ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0); -+ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1); -+ } -+ else -+ ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); -+} -+ -+#else -+ -+// Use #define as that allows us to discard "jb" which won't exist in non-RPI world -+#define ff_hevc_progress_wait_mv(s, jb, ref, y) ff_thread_await_progress((ThreadFrame *)&ref->tf, y, 0) -+#define ff_hevc_progress_wait_recon(s, jb, ref, y) ff_thread_await_progress((ThreadFrame *)&ref->tf, y, 0) -+#define ff_hevc_progress_signal_mv(s, y) -+#define ff_hevc_progress_signal_recon(s, y) ff_thread_report_progress(&s->ref->tf, y, 0) -+#define ff_hevc_progress_signal_all_done(s) ff_thread_report_progress(&s->ref->tf, INT_MAX, 0) -+ -+#endif -+ -+// Set all done - signal nothing (used in missing refs) -+// Works for both rpi & non-rpi -+static inline void ff_hevc_progress_set_all_done(HEVCFrame * const ref) -+{ -+ if (ref->tf.progress != NULL) -+ { -+ int * const p = (int *)&ref->tf.progress->data; -+ p[0] = INT_MAX; -+ p[1] = INT_MAX; -+ } -+} -+ - #endif /* AVCODEC_HEVCDEC_H */ -diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c -index 76ae72b6d4..85c1f03718 100644 ---- a/libavcodec/hevcdsp.c -+++ b/libavcodec/hevcdsp.c -@@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = { - #include "hevcdsp_template.c" - #undef BIT_DEPTH - -+static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc, -+ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, -+ const MvField *curr, const MvField *neigh, uint8_t *bs) -+{ -+ for (; pus > 0; pus--) { -+ int strength, out; -+ int curr_refL0 = curr_rpl0[curr->ref_idx[0]]; -+ int curr_refL1 = curr_rpl1[curr->ref_idx[1]]; -+ int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]]; -+ int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]]; -+ -+#if 1 // This more directly matches the original implementation -+ if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) { -+ // same L0 and L1 -+ if (curr_refL0 == neigh_refL0 && -+ curr_refL0 == curr_refL1 && -+ neigh_refL0 == neigh_refL1) { -+ if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 || -+ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) && -+ (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - 
curr->mv[0].y) >= 4 || -+ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)) -+ strength = 1; -+ else -+ strength = 0; -+ } else if (neigh_refL0 == curr_refL0 && -+ neigh_refL1 == curr_refL1) { -+ if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 || -+ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) -+ strength = 1; -+ else -+ strength = 0; -+ } else if (neigh_refL1 == curr_refL0 && -+ neigh_refL0 == curr_refL1) { -+ if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 || -+ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4) -+ strength = 1; -+ else -+ strength = 0; -+ } else { -+ strength = 1; -+ } -+ } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV -+ Mv curr_mv0, neigh_mv0; -+ -+ if (curr->pred_flag & 1) { -+ curr_mv0 = curr->mv[0]; -+ } else { -+ curr_mv0 = curr->mv[1]; -+ curr_refL0 = curr_refL1; -+ } -+ -+ if (neigh->pred_flag & 1) { -+ neigh_mv0 = neigh->mv[0]; -+ } else { -+ neigh_mv0 = neigh->mv[1]; -+ neigh_refL0 = neigh_refL1; -+ } -+ -+ if (curr_refL0 == neigh_refL0) { -+ if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4) -+ strength = 1; -+ else -+ strength = 0; -+ } else -+ strength = 1; -+ } else -+ strength = 1; -+#else // This has exactly the same effect, but is more suitable for vectorisation -+ Mv curr_mv[2]; -+ Mv neigh_mv[2]; -+ memcpy(curr_mv, curr->mv, sizeof curr_mv); -+ memcpy(neigh_mv, neigh->mv, sizeof neigh_mv); -+ -+ if (!(curr->pred_flag & 2)) { -+ curr_mv[1] = curr_mv[0]; -+ curr_refL1 = curr_refL0; -+ } -+ if (!(neigh->pred_flag & 2)) { -+ neigh_mv[1] = neigh_mv[0]; -+ neigh_refL1 = neigh_refL0; -+ } -+ if (!(curr->pred_flag & 1)) { -+ curr_mv[0] = curr_mv[1]; -+ curr_refL0 = curr_refL1; -+ } -+ if (!(neigh->pred_flag & 1)) { -+ neigh_mv[0] = neigh_mv[1]; -+ neigh_refL0 = neigh_refL1; -+ } -+ -+ strength = 1; -+ -+ strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) | -+ (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) | -+ (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4); -+ -+ strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) | -+ (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) | -+ (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4); -+ -+ strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2); -+#endif -+ -+ curr += in_inc / sizeof (MvField); -+ neigh += in_inc / sizeof (MvField); -+ -+ for (out = dup; out > 0; out--) -+ { -+ *bs = strength; -+ bs += out_inc; -+ } -+ } -+} -+ - void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) - { - #undef FUNC -@@ -193,12 +307,54 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) - PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \ - PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth) - -+#if !RPI_HEVC_SAND -+#define SLICED_LOOP_FILTERS(depth) -+#define SLICED_ADD_RESIDUAL(depth) -+#define SLICED_SAO(depth) -+#else -+#define SLICED_ADD_RESIDUAL(depth)\ -+ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \ -+ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \ -+ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \ -+ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, 
depth); \ -+ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \ -+ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \ -+ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \ -+ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \ -+ hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \ -+ hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \ -+ hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \ -+ hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \ -+ hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \ -+ hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \ -+ hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth); \ -+ hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \ -+ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth) -+#define SLICED_LOOP_FILTERS(depth)\ -+ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ -+ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ -+ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) -+#define SLICED_SAO(depth)\ -+ for (i = 0; i != SAO_FILTER_N; ++i) { \ -+ hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \ -+ hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth); \ -+ } \ -+ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ -+ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth) -+ -+#endif -+ - #define HEVC_DSP(depth) \ - hevcdsp->put_pcm = FUNC(put_pcm, depth); \ - hevcdsp->add_residual[0] = FUNC(add_residual4x4, depth); \ - hevcdsp->add_residual[1] = FUNC(add_residual8x8, depth); \ - hevcdsp->add_residual[2] = FUNC(add_residual16x16, depth); \ - hevcdsp->add_residual[3] = FUNC(add_residual32x32, depth); \ -+ hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \ -+ hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \ -+ hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \ -+ hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \ -+ SLICED_ADD_RESIDUAL(depth); \ - hevcdsp->dequant = FUNC(dequant, depth); \ - hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \ - hevcdsp->transform_4x4_luma = FUNC(transform_4x4_luma, depth); \ -@@ -212,18 +368,13 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) - hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \ - hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \ - \ -- hevcdsp->sao_band_filter[0] = \ -- hevcdsp->sao_band_filter[1] = \ -- hevcdsp->sao_band_filter[2] = \ -- hevcdsp->sao_band_filter[3] = \ -- hevcdsp->sao_band_filter[4] = FUNC(sao_band_filter, depth); \ -- hevcdsp->sao_edge_filter[0] = \ -- hevcdsp->sao_edge_filter[1] = \ -- hevcdsp->sao_edge_filter[2] = \ -- hevcdsp->sao_edge_filter[3] = \ -- hevcdsp->sao_edge_filter[4] = FUNC(sao_edge_filter, depth); \ -+ for (i = 0; i != SAO_FILTER_N; ++i) { \ -+ hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \ -+ hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \ -+ } \ - hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \ - hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \ -+ SLICED_SAO(depth); \ - \ - QPEL_FUNCS(depth); \ - QPEL_UNI_FUNCS(depth); \ -@@ -232,6 +383,7 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) - EPEL_UNI_FUNCS(depth); \ - EPEL_BI_FUNCS(depth); \ - \ -+ SLICED_LOOP_FILTERS(depth); \ - hevcdsp->hevc_h_loop_filter_luma = 
FUNC(hevc_h_loop_filter_luma, depth); \
- hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \
- hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \
-@@ -257,6 +409,8 @@ int i = 0;
- break;
- }
-
-+ hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
-+
- if (ARCH_PPC)
- ff_hevc_dsp_init_ppc(hevcdsp, bit_depth);
- if (ARCH_X86)
-diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
-index dc48ebca11..25ed9a447a 100644
---- a/libavcodec/hevcdsp.h
-+++ b/libavcodec/hevcdsp.h
-@@ -25,28 +25,56 @@
- #ifndef AVCODEC_HEVCDSP_H
- #define AVCODEC_HEVCDSP_H
-
-+#include "hevc.h"
- #include "get_bits.h"
-
- #define MAX_PB_SIZE 64
-
- typedef struct SAOParams {
-- int offset_abs[3][4]; ///< sao_offset_abs
-- int offset_sign[3][4]; ///< sao_offset_sign
-
- uint8_t band_position[3]; ///< sao_band_position
-
-- int eo_class[3]; ///< sao_eo_class
-+ uint8_t eo_class[3]; ///< sao_eo_class
-+ uint8_t type_idx[3]; ///< sao_type_idx
-
- int16_t offset_val[3][5]; ///< SaoOffsetVal
-+ const int dc_v = dc >> 16;
-+ const int dc_u = (dc << 16) >> 16;
-+
-+ stride /= sizeof(pixel);
-+
-+ for (y = 0; y < size; y++) {
-+ for (x = 0; x < size * 2; x += 2) {
-+ dst[x] = av_clip_pixel(dst[x] + dc_u);
-+ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
-+ }
-+ dst += stride;
-+ }
-+}
-+
-+
-+#endif
-+
- static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res,
- ptrdiff_t stride)
- {
-@@ -82,6 +208,132 @@ static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
- FUNC(add_residual)(_dst, res, stride, 32);
- }
-
-+static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
-+{
-+ FUNC(add_residual_dc)(_dst, stride, dc, 4);
-+}
-+
-+static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
-+{
-+ FUNC(add_residual_dc)(_dst, stride, dc, 8);
-+}
-+
-+static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
-+{
-+ FUNC(add_residual_dc)(_dst, stride, dc, 16);
-+}
-+
-+static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
-+{
-+ FUNC(add_residual_dc)(_dst, stride, dc, 32);
-+}
-+
-+#if RPI_HEVC_SAND
-+// -- U -- (plaited)
-+
-+static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_u)
-+{
-+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 4);
-+}
-+
-+static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_u)
-+{
-+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 8);
-+}
-+
-+static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_u)
-+{
-+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 16);
-+}
-+
-+static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_u)
-+{
-+ // Should never occur for 420, which is all that sand supports
-+ av_assert0(0);
-+}
-+
-+// -- V -- (plaited)
-+
-+static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_v)
-+{
-+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 4);
-+}
-+
-+static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_v)
-+{
-+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 8);
-+}
-+
-+static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_v)
-+{
-+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 16);
-+}
-+
-+static void 
FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride, int dc_v) -+{ -+ // Should never occur for 420, which is all that sand supports -+ av_assert0(0); -+} -+ -+// -- C -- (plaited - both U & V) -+ -+static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride) -+{ -+ FUNC(add_residual_c)(_dst, res, stride, 4); -+} -+ -+static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride) -+{ -+ FUNC(add_residual_c)(_dst, res, stride, 8); -+} -+ -+static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride) -+{ -+ FUNC(add_residual_c)(_dst, res, stride, 16); -+} -+ -+static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res, -+ ptrdiff_t stride) -+{ -+ // Should never occur for 420, which is all that sand supports -+ av_assert0(0); -+} -+ -+static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) -+{ -+ FUNC(add_residual_dc_c)(_dst, stride, dc, 4); -+} -+ -+static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) -+{ -+ FUNC(add_residual_dc_c)(_dst, stride, dc, 8); -+} -+ -+static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) -+{ -+ FUNC(add_residual_dc_c)(_dst, stride, dc, 16); -+} -+ -+static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) -+{ -+ // Should never occur for 420, which is all that sand supports -+ av_assert0(0); -+} -+ -+#endif -+ -+ - static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode) - { - int16_t *coeffs = (int16_t *) _coeffs; -@@ -352,6 +604,32 @@ static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride - } - } - -+ -+#if BIT_DEPTH == 10 -+#if RPI_HEVC_SAND -+// We need a 32 bit variation for the _c restores so hijack bit depth 10 -+#undef pixel -+#undef BIT_DEPTH -+#define pixel uint32_t -+#define BIT_DEPTH 32 -+#endif -+// All 16 bit variations are the same -+#define sao_edge_restore_0_10 sao_edge_restore_0_9 -+#define sao_edge_restore_1_10 sao_edge_restore_1_9 -+#define sao_edge_restore_0_11 sao_edge_restore_0_9 -+#define sao_edge_restore_1_11 sao_edge_restore_1_9 -+#define sao_edge_restore_0_12 sao_edge_restore_0_9 -+#define sao_edge_restore_1_12 sao_edge_restore_1_9 -+#define sao_edge_restore_0_13 sao_edge_restore_0_9 -+#define sao_edge_restore_1_13 sao_edge_restore_1_9 -+#define sao_edge_restore_0_14 sao_edge_restore_0_9 -+#define sao_edge_restore_1_14 sao_edge_restore_1_9 -+#define sao_edge_restore_0_15 sao_edge_restore_0_9 -+#define sao_edge_restore_1_15 sao_edge_restore_1_9 -+#define sao_edge_restore_0_16 sao_edge_restore_0_9 -+#define sao_edge_restore_1_16 sao_edge_restore_1_9 -+#endif -+#if BIT_DEPTH <= 9 || BIT_DEPTH == 32 - static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, - ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, - int *borders, int _width, int _height, -@@ -361,7 +639,6 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, - int x, y; - pixel *dst = (pixel *)_dst; - pixel *src = (pixel *)_src; -- int16_t *sao_offset_val = sao->offset_val[c_idx]; - int sao_eo_class = sao->eo_class[c_idx]; - int init_x = 0, width = _width, height = _height; - -@@ -370,33 +647,29 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, - - if (sao_eo_class != SAO_EO_VERT) { - if (borders[0]) { -- int offset_val = sao_offset_val[0]; - for (y = 0; y < height; y++) { -- dst[y * stride_dst] = av_clip_pixel(src[y * 
stride_src] + offset_val); -+ dst[y * stride_dst] = src[y * stride_src]; - } - init_x = 1; - } - if (borders[2]) { -- int offset_val = sao_offset_val[0]; - int offset = width - 1; - for (x = 0; x < height; x++) { -- dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val); -+ dst[x * stride_dst + offset] = src[x * stride_src + offset]; - } - width--; - } - } - if (sao_eo_class != SAO_EO_HORIZ) { - if (borders[1]) { -- int offset_val = sao_offset_val[0]; - for (x = init_x; x < width; x++) -- dst[x] = av_clip_pixel(src[x] + offset_val); -+ dst[x] = src[x]; - } - if (borders[3]) { -- int offset_val = sao_offset_val[0]; - ptrdiff_t y_stride_dst = stride_dst * (height - 1); - ptrdiff_t y_stride_src = stride_src * (height - 1); - for (x = init_x; x < width; x++) -- dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val); -+ dst[x + y_stride_dst] = src[x + y_stride_src]; - height--; - } - } -@@ -411,7 +684,6 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, - int x, y; - pixel *dst = (pixel *)_dst; - pixel *src = (pixel *)_src; -- int16_t *sao_offset_val = sao->offset_val[c_idx]; - int sao_eo_class = sao->eo_class[c_idx]; - int init_x = 0, init_y = 0, width = _width, height = _height; - -@@ -420,34 +692,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, - - if (sao_eo_class != SAO_EO_VERT) { - if (borders[0]) { -- int offset_val = sao_offset_val[0]; - for (y = 0; y < height; y++) { -- dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val); -+ dst[y * stride_dst] = src[y * stride_src]; - } - init_x = 1; - } - if (borders[2]) { -- int offset_val = sao_offset_val[0]; - int offset = width - 1; - for (x = 0; x < height; x++) { -- dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val); -+ dst[x * stride_dst + offset] = src[x * stride_src + offset]; - } - width--; - } - } - if (sao_eo_class != SAO_EO_HORIZ) { - if (borders[1]) { -- int offset_val = sao_offset_val[0]; - for (x = init_x; x < width; x++) -- dst[x] = av_clip_pixel(src[x] + offset_val); -+ dst[x] = src[x]; - init_y = 1; - } - if (borders[3]) { -- int offset_val = sao_offset_val[0]; - ptrdiff_t y_stride_dst = stride_dst * (height - 1); - ptrdiff_t y_stride_src = stride_src * (height - 1); - for (x = init_x; x < width; x++) -- dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val); -+ dst[x + y_stride_dst] = src[x + y_stride_src]; - height--; - } - } -@@ -487,6 +755,121 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, - - } - } -+#endif -+#if BIT_DEPTH == 32 -+#undef BIT_DEPTH -+#undef pixel -+#define BIT_DEPTH 10 -+#define pixel uint16_t -+#endif -+ -+// --- Plaited chroma versions -+ -+#if RPI_HEVC_SAND -+ -+static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height) -+{ -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int offset_table_u[32] = { 0 }; -+ int offset_table_v[32] = { 0 }; -+ int k, y, x; -+ int shift = BIT_DEPTH - 5; -+ -+ stride_dst /= sizeof(pixel); -+ stride_src /= sizeof(pixel); -+ width *= 2; -+ -+ for (k = 0; k < 4; k++) -+ { -+ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; -+ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; -+ } -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; 
x += 2) -+ { -+// printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift); -+// printf("offsets=%x,%x\n", src[x + 0], src[x + 1]); -+ // *** & 31 shouldn't be wanted but just now we generate broken input that -+ // crashes us in 10-bit world -+ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]); -+ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]); -+ } -+ dst += stride_dst; -+ src += stride_src; -+ } -+} -+ -+static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, -+ int eo, int width, int height) { -+ -+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; -+ static const int8_t pos[4][2][2] = { -+ { { -1, 0 }, { 1, 0 } }, // horizontal -+ { { 0, -1 }, { 0, 1 } }, // vertical -+ { { -1, -1 }, { 1, 1 } }, // 45 degree -+ { { 1, -1 }, { -1, 1 } }, // 135 degree -+ }; -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int a_stride, b_stride; -+ int x, y; -+ ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); -+ -+ stride_dst /= sizeof(pixel); -+ width *= 2; -+ -+ av_assert0(width <= 64); -+ -+ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; -+ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x += 2) { -+ int diff0u = CMP(src[x], src[x + a_stride]); -+ int diff1u = CMP(src[x], src[x + b_stride]); -+ int offset_valu = edge_idx[2 + diff0u + diff1u]; -+ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); -+ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); -+ int offset_valv = edge_idx[2 + diff0v + diff1v]; -+ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]); -+ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]); -+ } -+ src += stride_src; -+ dst += stride_dst; -+ } -+} -+ -+// Do once -+#if BIT_DEPTH == 8 -+// Any old 2 byte 'normal' restore will work for these -+#define sao_edge_restore_c_0_8 sao_edge_restore_0_16 -+#define sao_edge_restore_c_1_8 sao_edge_restore_1_16 -+// We need 32 bit for 9 bit+ -+#define sao_edge_restore_c_0_9 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_9 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_10 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_10 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_11 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_11 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_12 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_12 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_13 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_13 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_14 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_14 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_15 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_15 sao_edge_restore_1_32 -+#define sao_edge_restore_c_0_16 sao_edge_restore_0_32 -+#define sao_edge_restore_c_1_16 sao_edge_restore_1_32 -+#endif -+ -+#endif // RPI_HEVC_SAND -+ - - #undef CMP - -@@ -1690,3 +2073,217 @@ static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, - #undef TQ1 - #undef TQ2 - #undef TQ3 -+ -+#if RPI_HEVC_SAND -+ -+// line zero -+#define P3 pix_l[0 * xstride] -+#define P2 pix_l[1 * xstride] -+#define P1 pix_l[2 * xstride] -+#define P0 pix_l[3 * xstride] -+#define Q0 pix_r[0 * xstride] -+#define Q1 pix_r[1 * xstride] -+#define Q2 pix_r[2 * xstride] -+#define Q3 pix_r[3 * xstride] -+ 
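The eight macros above address the P-side samples through pix_l and the Q-side samples through pix_r, which is what lets the same luma filter body run when the two sides of a vertical edge sit in different sand stripes. As a minimal standalone sketch of the row-0/row-3 on/off decision these macros feed (illustrative C only; deblock_seg_active is an invented name, not part of the patch):

    #include <stdint.h>
    #include <stdlib.h>
    #include <stddef.h>

    /* Hypothetical helper: the HEVC deblock decision samples only rows 0 and 3
     * of each 4-row segment. P2..P0 sit at pix_l[1..3] and Q0..Q2 at
     * pix_r[0..2], matching the macros above with xstride == 1. */
    static int deblock_seg_active(const uint8_t *pix_l, const uint8_t *pix_r,
                                  ptrdiff_t ystride, int beta)
    {
        const int dp0 = abs(pix_l[1] - 2 * pix_l[2] + pix_l[3]);
        const int dq0 = abs(pix_r[2] - 2 * pix_r[1] + pix_r[0]);
        const int dp3 = abs(pix_l[1 + 3 * ystride] - 2 * pix_l[2 + 3 * ystride]
                            + pix_l[3 + 3 * ystride]);
        const int dq3 = abs(pix_r[2 + 3 * ystride] - 2 * pix_r[1 + 3 * ystride]
                            + pix_r[0 + 3 * ystride]);
        return dp0 + dq0 + dp3 + dq3 < beta; /* nonzero: filter this segment */
    }

This is the same d0 + d3 >= beta early-out that hevc_v_loop_filter_luma2 performs below before choosing between strong and normal filtering.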
-+// line three. used only for deblocking decision -+#define TP3 pix_l[0 * xstride + 3 * ystride] -+#define TP2 pix_l[1 * xstride + 3 * ystride] -+#define TP1 pix_l[2 * xstride + 3 * ystride] -+#define TP0 pix_l[3 * xstride + 3 * ystride] -+#define TQ0 pix_r[0 * xstride + 3 * ystride] -+#define TQ1 pix_r[1 * xstride + 3 * ystride] -+#define TQ2 pix_r[2 * xstride + 3 * ystride] -+#define TQ3 pix_r[3 * xstride + 3 * ystride] -+ -+// This is identical to hevc_loop_filter_luma except that the P/Q -+// components are on separate pointers -+static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r, -+ unsigned int _stride, unsigned int beta, const int32_t _tc[2], -+ const uint8_t _no_p[2], const uint8_t _no_q[2], -+ uint8_t * _pix_l) -+{ -+ int d, j; -+ pixel *pix_l = (pixel *)_pix_l; -+ pixel *pix_r = (pixel *)_pix_r; -+ const ptrdiff_t xstride = 1; -+ const ptrdiff_t ystride = _stride / sizeof(pixel); -+ -+ beta <<= BIT_DEPTH - 8; -+ -+ for (j = 0; j < 2; j++) { -+ const int dp0 = abs(P2 - 2 * P1 + P0); -+ const int dq0 = abs(Q2 - 2 * Q1 + Q0); -+ const int dp3 = abs(TP2 - 2 * TP1 + TP0); -+ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); -+ const int d0 = dp0 + dq0; -+ const int d3 = dp3 + dq3; -+ const int tc = _tc[j] << (BIT_DEPTH - 8); -+ const int no_p = _no_p[j]; -+ const int no_q = _no_q[j]; -+ -+ if (d0 + d3 >= beta) { -+ pix_l += 4 * ystride; -+ pix_r += 4 * ystride; -+ continue; -+ } else { -+ const int beta_3 = beta >> 3; -+ const int beta_2 = beta >> 2; -+ const int tc25 = ((tc * 5 + 1) >> 1); -+ -+ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && -+ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && -+ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { -+ // strong filtering -+ const int tc2 = tc << 1; -+ for (d = 0; d < 4; d++) { -+ const int p3 = P3; -+ const int p2 = P2; -+ const int p1 = P1; -+ const int p0 = P0; -+ const int q0 = Q0; -+ const int q1 = Q1; -+ const int q2 = Q2; -+ const int q3 = Q3; -+ if (!no_p) { -+ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); -+ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); -+ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); -+ } -+ if (!no_q) { -+ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2); -+ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); -+ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); -+ } -+ pix_l += ystride; -+ pix_r += ystride; -+ } -+ } else { // normal filtering -+ int nd_p = 1; -+ int nd_q = 1; -+ const int tc_2 = tc >> 1; -+ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) -+ nd_p = 2; -+ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) -+ nd_q = 2; -+ -+ for (d = 0; d < 4; d++) { -+ const int p2 = P2; -+ const int p1 = P1; -+ const int p0 = P0; -+ const int q0 = Q0; -+ const int q1 = Q1; -+ const int q2 = Q2; -+ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; -+ if (abs(delta0) < 10 * tc) { -+ delta0 = av_clip(delta0, -tc, tc); -+ if (!no_p) -+ P0 = av_clip_pixel(p0 + delta0); -+ if (!no_q) -+ Q0 = av_clip_pixel(q0 - delta0); -+ if (!no_p && nd_p > 1) { -+ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); -+ P1 = av_clip_pixel(p1 + deltap1); -+ } -+ if (!no_q && nd_q > 1) { -+ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); -+ Q1 = av_clip_pixel(q1 + deltaq1); -+ } -+ } -+ pix_l += ystride; -+ pix_r += ystride; -+ } -+ } -+ } -+ } -+} -+ -+#undef TP3 
-+#undef TP2 -+#undef TP1 -+#undef TP0 -+#undef TQ0 -+#undef TQ1 -+#undef TQ2 -+#undef TQ3 -+ -+#undef P3 -+#undef P2 -+#undef P1 -+#undef P0 -+#undef Q0 -+#undef Q1 -+#undef Q2 -+#undef Q3 -+ -+#define P1 pix_l[0 * xstride] -+#define P0 pix_l[1 * xstride] -+#define Q0 pix_r[0 * xstride] -+#define Q1 pix_r[1 * xstride] -+ -+static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride, -+ ptrdiff_t _ystride, const int32_t *_tc, -+ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r) -+{ -+ int d, j, no_p, no_q; -+ pixel *pix_l = (pixel *)_pix_l; -+ pixel *pix_r = (pixel *)_pix_r; -+ ptrdiff_t xstride = _xstride / sizeof(pixel); -+ ptrdiff_t ystride = _ystride / sizeof(pixel); -+ -+ for (j = 0; j < 2; j++) { -+ const int tc = _tc[j] << (BIT_DEPTH - 8); -+ if (tc <= 0) { -+ pix_l += 4 * ystride; -+ pix_r += 4 * ystride; -+ continue; -+ } -+ no_p = _no_p[j]; -+ no_q = _no_q[j]; -+ -+ for (d = 0; d < 4; d++) { -+ int delta0; -+ const int p1 = P1; -+ const int p0 = P0; -+ const int q0 = Q0; -+ const int q1 = Q1; -+ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); -+ if (!no_p) -+ P0 = av_clip_pixel(p0 + delta0); -+ if (!no_q) -+ Q0 = av_clip_pixel(q0 - delta0); -+ pix_l += ystride; -+ pix_r += ystride; -+ } -+ } -+} -+ -+static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4, -+ unsigned int no_f) -+{ -+ uint8_t no_p[2] = {no_f & 1, no_f & 2}; -+ uint8_t no_q[2] = {no_f & 4, no_f & 8}; -+ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; -+ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q); -+ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q); -+} -+ -+static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4, -+ uint8_t * src_l, -+ unsigned int no_f) -+{ -+ uint8_t no_p[2] = {no_f & 1, no_f & 2}; -+ uint8_t no_q[2] = {no_f & 4, no_f & 8}; -+ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; -+ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r); -+ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel)); -+} -+ -+#undef P1 -+#undef P0 -+#undef Q0 -+#undef Q1 -+ -+ -+#endif -+ -diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c -index 7a86ed3d31..7d32c4ab14 100644 ---- a/libavcodec/hevcpred.c -+++ b/libavcodec/hevcpred.c -@@ -24,6 +24,7 @@ - - #include "hevcpred.h" - -+#define PRED_C 0 - #define BIT_DEPTH 8 - #include "hevcpred_template.c" - #undef BIT_DEPTH -@@ -39,13 +40,37 @@ - #define BIT_DEPTH 12 - #include "hevcpred_template.c" - #undef BIT_DEPTH -+#undef PRED_C -+ -+#ifdef RPI -+#define PRED_C 1 -+#define BIT_DEPTH 8 -+#include "hevcpred_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 9 -+#include "hevcpred_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 10 -+#include "hevcpred_template.c" -+#undef BIT_DEPTH -+ -+#define BIT_DEPTH 12 -+#include "hevcpred_template.c" -+#undef BIT_DEPTH -+#undef PRED_C -+#endif - - void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth) - { - #undef FUNC - #define FUNC(a, depth) a ## _ ## depth - --#define HEVC_PRED(depth) \ -+#undef FUNCC -+#define FUNCC(a, depth) a ## _ ## depth ## _c -+ -+#define HEVC_PRED_Y(depth) \ - hpc->intra_pred[0] = FUNC(intra_pred_2, depth); \ - hpc->intra_pred[1] = FUNC(intra_pred_3, depth); \ - hpc->intra_pred[2] = FUNC(intra_pred_4, depth); \ -@@ -60,6 +85,30 @@ void 
ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth)
- hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \
- hpc->pred_angular[3] = FUNC(pred_angular_3, depth);
-+
-+#define HEVC_PRED_C(depth) \
-+ hpc->intra_pred_c[0] = FUNCC(intra_pred_2, depth); \
-+ hpc->intra_pred_c[1] = FUNCC(intra_pred_3, depth); \
-+ hpc->intra_pred_c[2] = FUNCC(intra_pred_4, depth); \
-+ hpc->intra_pred_c[3] = FUNCC(intra_pred_5, depth); \
-+ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \
-+ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \
-+ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \
-+ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \
-+ hpc->pred_dc_c = FUNCC(pred_dc, depth); \
-+ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \
-+ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \
-+ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \
-+ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth);
-+
-+#ifdef RPI
-+#define HEVC_PRED(depth) \
-+ HEVC_PRED_Y(depth); \
-+ HEVC_PRED_C(depth);
-+#else
-+#define HEVC_PRED(depth) \
-+ HEVC_PRED_Y(depth);
-+#endif
-+
- switch (bit_depth) {
- case 9:
- HEVC_PRED(9);
-diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h
-index eb17663683..6711cfc06f 100644
---- a/libavcodec/hevcpred.h
-+++ b/libavcodec/hevcpred.h
-@@ -27,9 +27,10 @@
- #include <stdint.h>
-
- struct HEVCContext;
-+struct HEVCLocalContext;
-
- typedef struct HEVCPredContext {
-- void (*intra_pred[4])(struct HEVCContext *s, int x0, int y0, int c_idx);
-+ void (*intra_pred[4])(const struct HEVCContext * const s, struct HEVCLocalContext * const lc, int x0, int y0, int c_idx);
-
- void (*pred_planar[4])(uint8_t *src, const uint8_t *top,
- const uint8_t *left, ptrdiff_t stride);
-@@ -38,6 +39,17 @@ typedef struct HEVCPredContext {
- void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
- const uint8_t *left, ptrdiff_t stride,
- int c_idx, int mode);
-+#ifdef RPI
-+ void (*intra_pred_c[4])(const struct HEVCContext * const s, struct HEVCLocalContext * const lc, int x0, int y0, int c_idx);
-+
-+ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top,
-+ const uint8_t *left, ptrdiff_t stride);
-+ void (*pred_dc_c)(uint8_t *src, const uint8_t *top, const uint8_t *left,
-+ ptrdiff_t stride, int log2_size, int c_idx);
-+ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top,
-+ const uint8_t *left, ptrdiff_t stride,
-+ int c_idx, int mode);
-+#endif
- } HEVCPredContext;
-
- void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth);
-diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c
-index 6fe33546b1..999ef369fe 100644
---- a/libavcodec/hevcpred_template.c
-+++ b/libavcodec/hevcpred_template.c
-@@ -20,14 +20,111 @@
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-+//#define DISABLE_INTRA
-+
- #include "libavutil/pixdesc.h"
-
- #include "bit_depth_template.c"
- #include "hevcpred.h"
-
-+#ifdef RPI
-+#include "libavutil/rpi_sand_fns.h"
-+#endif
-+
-+#define DUMP_PRED 0
-+
- #define POS(x, y) src[(x) + stride * (y)]
-
--static av_always_inline void FUNC(intra_pred)(HEVCContext *s, int x0, int y0,
-+// INCLUDED_ONCE defined at EOF
-+#if defined(RPI) && !defined(INCLUDED_ONCE)
-+typedef uint8_t (* c8_dst_ptr_t)[2];
-+typedef const uint8_t (* c8_src_ptr_t)[2];
-+typedef uint16_t (* c16_dst_ptr_t)[2];
-+typedef const uint16_t (* c16_src_ptr_t)[2];
-+
-+// *** On ARM make these NEON registers
-+typedef struct pixel4_16 {
-+ uint16_t x[4];
-+} pixel4_16;
-+typedef struct pixel4_32 {
-+ uint32_t x[4];
-+} pixel4_32;
-+static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x)
-+{
-+ pixel4_16 t = {{x, x, x, x}};
-+ return t;
-+}
-+static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x)
-+{
-+ pixel4_32 t = {{x, x, x, x}};
-+ return t;
-+}
-+#endif
-+
-+#if PRED_C
-+// For chroma we double pixel size so we copy pairs
-+#undef pixel
-+#undef pixel2
-+#undef pixel4
-+#undef dctcoef
-+#undef INIT_CLIP
-+#undef no_rnd_avg_pixel4
-+#undef rnd_avg_pixel4
-+#undef AV_RN2P
-+#undef AV_RN4P
-+#undef AV_RN4PA
-+#undef AV_WN2P
-+#undef AV_WN4P
-+#undef AV_WN4PA
-+#undef CLIP
-+#undef FUNC
-+#undef FUNCC
-+#undef av_clip_pixel
-+#undef PIXEL_SPLAT_X4
-+
-+#if BIT_DEPTH == 8
-+#define pixel uint16_t
-+#define pixel4 pixel4_16
-+#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16
-+#define cpel uint8_t
-+#define c_src_ptr_t c8_src_ptr_t
-+#define c_dst_ptr_t c8_dst_ptr_t
-+#else
-+#define pixel uint32_t
-+#define pixel4 pixel4_32
-+#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32
-+#define cpel uint16_t
-+#define c_src_ptr_t c16_src_ptr_t
-+#define c_dst_ptr_t c16_dst_ptr_t
-+#endif
-+#define AV_RN4P(p) (*(pixel4*)(p))
-+#define AV_WN4P(p,x) (*(pixel4*)(p) = (x))
-+#define FUNC(a) FUNC2(a, BIT_DEPTH, _c)
-+#endif
-+
-+
-+// Get PW prior to horrid PRED_C trickery
-+#if BIT_DEPTH == 8
-+#define PW 1
-+#else
-+#define PW 2
-+#endif
-+
-+
-+#if DUMP_PRED && !defined(INCLUDED_ONCE)
-+static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size)
-+{
-+ for (unsigned int y = 0; y != size; y++, data += stride * 2) {
-+ for (unsigned int x = 0; x != size; x++) {
-+ printf("%4d", data[x * 2]);
-+ }
-+ printf("\n");
-+ }
-+ printf("\n");
-+}
-+#endif
-+
-+static av_always_inline void FUNC(intra_pred)(const HEVCContext * const s, HEVCLocalContext * const lc, int x0, int y0,
- int log2_size, int c_idx)
- {
- #define PU(x) \
-@@ -69,8 +166,6 @@ do { \
- AV_WN4P(&ptr[i], a); \
- else \
- a = PIXEL_SPLAT_X4(ptr[i + 3])
--
-- HEVCLocalContext *lc = s->HEVClc;
- int i;
- int hshift = s->ps.sps->hshift[c_idx];
- int vshift = s->ps.sps->vshift[c_idx];
-@@ -79,15 +174,23 @@ do { \
- int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
- int size_in_luma_v = size << vshift;
- int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
-- int x = x0 >> hshift;
-- int y = y0 >> vshift;
-+ const int x = x0 >> hshift;
-+ const int y = y0 >> vshift;
- int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
- int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
-
- int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb);
-
-- ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel);
-+ const ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel);
-+#if defined(RPI)
-+ pixel *const src = !av_rpi_is_sand_frame(s->frame) ?
-+ (pixel*)s->frame->data[c_idx] + x + y * stride :
-+ c_idx == 0 ? 
-+ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) : -+ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y); -+#else - pixel *src = (pixel*)s->frame->data[c_idx] + x + y * stride; -+#endif - - int min_pu_width = s->ps.sps->min_pu_width; - -@@ -95,14 +198,20 @@ do { \ - lc->tu.intra_pred_mode; - pixel4 a; - pixel left_array[2 * MAX_TB_SIZE + 1]; -+#if !PRED_C - pixel filtered_left_array[2 * MAX_TB_SIZE + 1]; -+#endif - pixel top_array[2 * MAX_TB_SIZE + 1]; -+#if !PRED_C - pixel filtered_top_array[2 * MAX_TB_SIZE + 1]; -+#endif - - pixel *left = left_array + 1; - pixel *top = top_array + 1; -+#if !PRED_C - pixel *filtered_left = filtered_left_array + 1; - pixel *filtered_top = filtered_top_array + 1; -+#endif - int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask); - int cand_left = lc->na.cand_left; - int cand_up_left = lc->na.cand_up_left; -@@ -114,6 +223,27 @@ do { \ - int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) - - (x0 + size_in_luma_h)) >> hshift; - -+ pixel * src_l = src - 1; -+ pixel * src_u = src - stride; -+ pixel * src_ur = src_u + size; -+ -+#ifdef DISABLE_INTRA -+ return; -+#endif -+ -+#if defined(RPI) -+ if (av_rpi_is_sand_frame(s->frame)) { -+ // N.B. stride is in pixels (not bytes) or in the case of chroma pixel-pairs -+ const AVFrame * const frame = s->frame; -+ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2 -+ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride; -+ if ((x & mask) == 0) -+ src_l -= stripe_adj; -+ if (((x + size) & mask) == 0) -+ src_ur += stripe_adj; -+ } -+#endif -+ - if (s->ps.pps->constrained_intra_pred_flag == 1) { - int size_in_luma_pu_v = PU(size_in_luma_v); - int size_in_luma_pu_h = PU(size_in_luma_h); -@@ -163,23 +293,24 @@ do { \ - top[-1] = 128; - } - if (cand_up_left) { -- left[-1] = POS(-1, -1); -+ left[-1] = src_l[-stride]; - top[-1] = left[-1]; - } - if (cand_up) -- memcpy(top, src - stride, size * sizeof(pixel)); -+ // Always good - even with sand -+ memcpy(top, src_u, size * sizeof(pixel)); - if (cand_up_right) { -- memcpy(top + size, src - stride + size, size * sizeof(pixel)); -- EXTEND(top + size + top_right_size, POS(size + top_right_size - 1, -1), -+ memcpy(top + size, src_ur, top_right_size * sizeof(pixel)); -+ EXTEND(top + size + top_right_size, top[size + top_right_size - 1], - size - top_right_size); - } - if (cand_left) - for (i = 0; i < size; i++) -- left[i] = POS(-1, i); -+ left[i] = src_l[stride * i]; - if (cand_bottom_left) { - for (i = size; i < size + bottom_left_size; i++) -- left[i] = POS(-1, i); -- EXTEND(left + size + bottom_left_size, POS(-1, size + bottom_left_size - 1), -+ left[i] = src_l[stride * i]; -+ EXTEND(left + size + bottom_left_size, left[size + bottom_left_size - 1], - size - bottom_left_size); - } - -@@ -268,7 +399,11 @@ do { \ - cand_up_left = 1; - cand_left = 1; - } else { // No samples available -+#if PRED_C -+ left[-1] = (1 << (BIT_DEPTH - 1)) | (1 << (BIT_DEPTH - 1 + PW * 8)); -+#else - left[-1] = (1 << (BIT_DEPTH - 1)); -+#endif - EXTEND(top, left[-1], 2 * size); - EXTEND(left, left[-1], 2 * size); - } -@@ -287,6 +422,9 @@ do { \ - top[-1] = left[-1]; - - // Filtering process -+ // Sand can only apply to chroma_format_idc == 1 so we don't need to -+ // worry about chroma smoothing for that case -+#if !PRED_C - if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) { - if (mode != 
INTRA_DC && size != 4){ - int intra_hor_ver_dist_thresh[] = { 7, 1, 0 }; -@@ -342,12 +480,36 @@ do { \ - mode); - break; - } -+#else -+ switch (mode) { -+ case INTRA_PLANAR: -+ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, -+ (uint8_t *)left, stride); -+ break; -+ case INTRA_DC: -+ s->hpc.pred_dc_c((uint8_t *)src, (uint8_t *)top, -+ (uint8_t *)left, stride, log2_size, c_idx); -+ break; -+ default: -+ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, -+ (uint8_t *)left, stride, c_idx, -+ mode); -+ break; -+ } -+ -+#if DUMP_PRED -+ printf("U pred @ %d, %d: mode=%d\n", x, y, mode); -+ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size); -+ printf("V pred @ %d, %d: mode=%d\n", x, y, mode); -+ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size); -+#endif -+#endif - } - - #define INTRA_PRED(size) \ --static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ -+static void FUNC(intra_pred_ ## size)(const HEVCContext * const s, HEVCLocalContext * const lc, int x0, int y0, int c_idx) \ - { \ -- FUNC(intra_pred)(s, x0, y0, size, c_idx); \ -+ FUNC(intra_pred)(s, lc, x0, y0, size, c_idx); \ - } - - INTRA_PRED(2) -@@ -357,6 +519,7 @@ INTRA_PRED(5) - - #undef INTRA_PRED - -+#if !PRED_C - static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top, - const uint8_t *_left, ptrdiff_t stride, - int trafo_size) -@@ -371,6 +534,29 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to - POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + - (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); - } -+#else -+static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top, -+ const uint8_t * _left, ptrdiff_t stride, -+ int trafo_size) -+{ -+ int x, y; -+ int size = 1 << trafo_size; -+ c_dst_ptr_t src = (c_dst_ptr_t)_src; -+ const c_src_ptr_t top = (c_src_ptr_t)_top; -+ const c_src_ptr_t left = (c_src_ptr_t)_left; -+ -+ for (y = 0; y < size; y++, src += stride) -+ { -+ for (x = 0; x < size; x++) -+ { -+ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] + -+ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1); -+ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] + -+ (size - 1 - y) * top[x][1] + (y + 1) * left[size][1] + size) >> (trafo_size + 1); -+ } -+ } -+} -+#endif - - #define PRED_PLANAR(size)\ - static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ -@@ -386,6 +572,7 @@ PRED_PLANAR(3) - - #undef PRED_PLANAR - -+#if !PRED_C - static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, - const uint8_t *_left, - ptrdiff_t stride, int log2_size, int c_idx) -@@ -416,7 +603,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, - POS(0, y) = (left[y] + 3 * dc + 2) >> 2; - } - } -+#else -+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, -+ const uint8_t *_left, -+ ptrdiff_t stride, int log2_size, int c_idx) -+{ -+ unsigned int i, j; -+ const unsigned int size = (1 << log2_size); -+ c_dst_ptr_t src = (c_dst_ptr_t)_src; -+ const c_src_ptr_t top = (c_src_ptr_t)_top; -+ const c_src_ptr_t left = (c_src_ptr_t)_left; -+ unsigned int dc0 = size; -+ unsigned int dc1 = size; -+ -+ for (i = 0; i < size; i++) -+ { -+ dc0 += left[i][0] + top[i][0]; -+ dc1 += left[i][1] + top[i][1]; -+ } -+ -+ dc0 >>= log2_size + 1; -+ dc1 >>= log2_size + 1; -+ -+ for (i = 0; i < size; i++, src += stride) -+ { -+ for (j = 0; j < size; ++j) -+ { -+ src[j][0] = dc0; -+ 
src[j][1] = dc1; - -+ } -+ } -+} -+#endif -+ -+#ifndef ANGLE_CONSTS -+#define ANGLE_CONSTS -+static const int intra_pred_angle[] = { -+ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, -+ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 -+}; -+static const int inv_angle[] = { -+ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, -+ -630, -910, -1638, -4096 -+}; -+#endif -+ -+#if !PRED_C - static av_always_inline void FUNC(pred_angular)(uint8_t *_src, - const uint8_t *_top, - const uint8_t *_left, -@@ -428,15 +661,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, - const pixel *top = (const pixel *)_top; - const pixel *left = (const pixel *)_left; - -- static const int intra_pred_angle[] = { -- 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, -- -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 -- }; -- static const int inv_angle[] = { -- -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, -- -630, -910, -1638, -4096 -- }; -- - int angle = intra_pred_angle[mode - 2]; - pixel ref_array[3 * MAX_TB_SIZE + 4]; - pixel *ref_tmp = ref_array + size; -@@ -509,6 +733,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, - } - } - } -+#else -+static av_always_inline void FUNC(pred_angular)(uint8_t *_src, -+ const uint8_t *_top, -+ const uint8_t *_left, -+ ptrdiff_t stride, int c_idx, -+ int mode, int size) -+{ -+ int x, y; -+ c_dst_ptr_t src = (c_dst_ptr_t)_src; -+ c_src_ptr_t top = (c_src_ptr_t)_top; -+ c_src_ptr_t left = (c_src_ptr_t)_left; -+ -+ const int angle = intra_pred_angle[mode - 2]; -+ cpel ref_array[3 * MAX_TB_SIZE + 4][2]; -+ c_dst_ptr_t ref_tmp = ref_array + size; -+ c_src_ptr_t ref; -+ const int last = (size * angle) >> 5; -+ -+ if (mode >= 18) { -+ ref = top - 1; -+ if (angle < 0 && last < -1) { -+ memcpy(ref_tmp, top - 1, (size + 1) * 2 * PW); -+ for (x = last; x <= -1; x++) -+ { -+ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; -+ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; -+ } -+ ref = (c_src_ptr_t)ref_tmp; -+ } -+ -+ for (y = 0; y < size; y++, src += stride) { -+ const int idx = ((y + 1) * angle) >> 5; -+ const int fact = ((y + 1) * angle) & 31; -+ if (fact) { -+ for (x = 0; x < size; ++x) { -+ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] + -+ fact * ref[x + idx + 2][0] + 16) >> 5; -+ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] + -+ fact * ref[x + idx + 2][1] + 16) >> 5; -+ } -+ } else { -+ memcpy(src, ref + idx + 1, size * 2 * PW); -+ } -+ } -+ } else { -+ ref = left - 1; -+ if (angle < 0 && last < -1) { -+ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW); -+ for (x = last; x <= -1; x++) -+ { -+ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; -+ ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; -+ } -+ ref = (c_src_ptr_t)ref_tmp; -+ } -+ -+ for (x = 0; x < size; x++, src++) { -+ const int idx = ((x + 1) * angle) >> 5; -+ const int fact = ((x + 1) * angle) & 31; -+ if (fact) { -+ for (y = 0; y < size; y++) { -+ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] + -+ fact * ref[y + idx + 2][0] + 16) >> 5; -+ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] + -+ fact * ref[y + idx + 2][1] + 16) >> 5; -+ } -+ } else { -+ for (y = 0; y < size; y++) -+ { -+ src[y * stride][0] = ref[y + idx + 1][0]; -+ src[y * stride][1] = ref[y + idx + 1][1]; -+ } -+ } -+ } -+ } -+} -+#endif - - static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top, - 
const uint8_t *left,
-@@ -538,6 +839,10 @@ static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top,
- FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 5);
- }
-
-+#undef cpel
-+#undef c_src_ptr_t
-+#undef c_dst_ptr_t
-+
- #undef EXTEND_LEFT_CIP
- #undef EXTEND_RIGHT_CIP
- #undef EXTEND_UP_CIP
-@@ -549,3 +854,9 @@ static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top,
- #undef EXTEND
- #undef MIN_TB_ADDR_ZS
- #undef POS
-+#undef PW
-+
-+#ifndef INCLUDED_ONCE
-+#define INCLUDED_ONCE
-+#endif
-+
-diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c
-index 0b1195dc3e..5ef81fa739 100644
---- a/libavcodec/mmaldec.c
-+++ b/libavcodec/mmaldec.c
-@@ -24,6 +24,9 @@
- * MMAL Video Decoder
- */
-
-+#pragma GCC diagnostic push
-+// Many many redundant decls in the header files
-+#pragma GCC diagnostic ignored "-Wredundant-decls"
- #include <bcm_host.h>
- #include <interface/mmal/mmal.h>
- #include <interface/mmal/mmal_parameters_video.h>
-@@ -31,6 +34,7 @@
- #include <interface/mmal/util/mmal_util.h>
- #include <interface/mmal/util/mmal_util_params.h>
- #include <interface/mmal/vc/mmal_vc_api.h>
-+#pragma GCC diagnostic pop
- #include <stdatomic.h>
-
- #include "avcodec.h"
-diff --git a/libavcodec/raw.c b/libavcodec/raw.c
-index 8da2a9735e..0ff0e421fc 100644
---- a/libavcodec/raw.c
-+++ b/libavcodec/raw.c
-@@ -283,6 +283,12 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
- { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') },
- { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') },
-
-+ /* RPI */
-+#ifdef RPI
-+ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') },
-+ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') },
-+#endif
-+
- /* special */
- { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */
- { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */
-diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c
-index d181b74570..7d89af32a2 100644
---- a/libavcodec/rawenc.c
-+++ b/libavcodec/rawenc.c
-@@ -31,6 +31,8 @@
- #include "libavutil/intreadwrite.h"
- #include "libavutil/imgutils.h"
- #include "libavutil/internal.h"
-+#include "libavutil/avassert.h"
-+#include "libavutil/rpi_sand_fns.h"
-
- static av_cold int raw_encode_init(AVCodecContext *avctx)
- {
-@@ -49,6 +51,55 @@ FF_ENABLE_DEPRECATION_WARNINGS
- return 0;
- }
-
-+#ifdef RPI
-+static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
-+ const AVFrame *frame)
-+{
-+ const int width = av_frame_cropped_width(frame);
-+ const int height = av_frame_cropped_height(frame);
-+ const int x0 = frame->crop_left;
-+ const int y0 = frame->crop_top;
-+ const int size = width * height * 3 / 2;
-+ uint8_t * dst;
-+ int ret;
-+
-+ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
-+ return ret;
-+
-+ dst = pkt->data;
-+
-+ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height);
-+ dst += width * height;
-+ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2,
-+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2);
-+ return 0;
-+}
-+
-+static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
-+ const AVFrame *frame)
-+{
-+ const int width = av_frame_cropped_width(frame);
-+ const int height = av_frame_cropped_height(frame);
-+ const int x0 = frame->crop_left;
-+ const int y0 = frame->crop_top;
-+ const int size = width * height * 3;
-+ uint8_t * dst;
-+ int ret;
-+
-+ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
-+ return ret;
-+
-+ dst = pkt->data;
-+
-+ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height);
-+ dst 
+= width * height * 2; -+ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width, -+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2); -+ return 0; -+} -+#endif -+ -+ - static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, - const AVFrame *frame, int *got_packet) - { -@@ -58,6 +109,14 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, - if (ret < 0) - return ret; - -+#ifdef RPI -+ if (av_rpi_is_sand_frame(frame)) { -+ ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : raw_sand16_as_yuv420(avctx, pkt, frame); -+ *got_packet = (ret == 0); -+ return ret; -+ } -+#endif -+ - if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0) - return ret; - if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, -diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s -new file mode 100644 -index 0000000000..391f761df9 ---- /dev/null -+++ b/libavcodec/rpi_hevc_transform.s -@@ -0,0 +1,923 @@ -+# ****************************************************************************** -+# Argon Design Ltd. -+# (c) Copyright 2015 Argon Design Ltd. All rights reserved. -+# -+# Module : HEVC -+# Author : Peter de Rivaz -+# ****************************************************************************** -+ -+# HEVC VPU Transform -+# fe -+# Transform matrix can be thought of as -+# output row vector = input row vector * transMatrix2 -+# -+# The even rows of the matrix are symmetric -+# The odd rows of the matrix are antisymmetric -+# -+# So only need to compute the first half of the results, then can compute the remainder with a butterfly -+# -+# EXAMPLE -+# (a b c d) (1 2 2 1) -+# (3 4 -4 -3) -+# (5 6 6 5) -+# (7 8 -8 -7) -+# -+# x=(a c)(1 2) = 1a+5c 2a+6c -+# (5 6) -+# -+# y=(b d)(3 4) = 3b+7d 4b+8d -+# (7 8) -+# -+# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d -+# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d -+# -+# Final results are (u , v[::-1]) -+# -+# -+# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0) -+# Apply the even matrix first and stop before rounding -+# Then apply the odd matrix in a full manner: -+# -+# First step is to compute partial products with the first input (16 cycles) -+# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output -+# 2a 4b 6c 8d -+# 2a -4b 6c -8d -+# 1a -3b 5c -7d -+# -+# Second step is to sum partial products into final position (8 cycles) -+# 1a+3b+5c+7d -+# 2a+4b+6c+8d -+# 2a-4b+6c-8d -+# 1a-3b+5c-7d -+# -+# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format) -+# -+# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds) -+# -+# For 8x8 we could compute two in parallel. -+# -+# -+ -+# Columns are transformed first -+# -+# Store top left half of transMatrix2 in -+# Store bottom left half of transMatrix2 in HX(32,32) -+# -+# For 16x16 -+# HX(0:15,0) contains input data before transform -+# HY(0:15,0) contains 32bit output data after transform -+# HX(32,0) contains even rows of left half of transMatrix2 -+# HX(32,32) contains odd rows of left half of transMatrix2 -+# HY(48,0) contains partial products ready for summing -+# -+ -+ -+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!) 
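The symmetry argument above means an N-point transform costs two N/2-point matrix multiplies plus one butterfly. A minimal scalar model of that decomposition (illustrative C sketch, not VPU code; trans_even_odd is an invented name, and m_even/m_odd are assumed to hold the left halves of the even and odd rows of transMatrix2, packed row-major):

    #include <stdint.h>

    /* out = in * M, where even rows of M are symmetric and odd rows are
     * antisymmetric: even-indexed inputs hit m_even, odd-indexed inputs hit
     * m_odd, and one add/sub butterfly yields both halves of the output,
     * with the second half stored in reverse order (the derivation above). */
    static void trans_even_odd(const int16_t *in, int n,
                               const int16_t *m_even, const int16_t *m_odd,
                               int32_t *out)
    {
        for (int j = 0; j < n / 2; j++) {
            int32_t x = 0, y = 0;              /* even / odd partial sums */
            for (int i = 0; i < n / 2; i++) {
                x += in[2 * i]     * m_even[i * (n / 2) + j];
                y += in[2 * i + 1] * m_odd[i * (n / 2) + j];
            }
            out[j]         = x + y;            /* u */
            out[n - 1 - j] = x - y;            /* v, reversed */
        }
    }

col_trans_16 and col_trans_odd_16 below are the vectorised forms of the two inner products, and the explicit vadd/vsub pairs in trans32 are the butterfly.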
-+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) -+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) -+# num: number of 16x16 transforms to be done -+# coeffs32 -+# num32: number of 32x32 transforms -+# command 0 for transform, 1 for memclear16(int16_t *dst,num16) -+# -+ -+.equ TRANS_SHIFT, 20 - BIT_DEPTH -+.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1) -+.equ TRANS_ASL2, 16 - TRANS_SHIFT -+ -+ -+hevc_trans_16x16: -+ cmp r5,1 -+ beq memclear16 -+ cmp r5,2 -+ beq hevc_deblock_16x16 -+ cmp r5,3 -+ beq hevc_uv_deblock_16x16 -+ cmp r5,4 -+ beq hevc_uv_deblock_16x16_with_clear -+ cmp r5,5 -+ beq hevc_run_command_list -+ -+ push r6-r15, lr # TODO cut down number of used registers -+ mov r14,r3 # coeffs32 -+ mov r15,r4 # num32 -+ mov r3, 16*2 # Stride of transMatrix2 in bytes -+ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix -+ -+ add r0, 16*16*2 # For 32x32 transforms we also need this matrix -+ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix -+ -+ # Now use r0 to describe which matrix we are working on. -+ # Allows us to prefetch the next block of coefficients for efficiency. -+ mov r0,0 # This describes the location where we read our coefficients from -+ mov r3,16*2 # Stride of coefficients in bytes (TODO remove) -+ mov r7,16*16*2 # Total block size -+ mov r8,64*16 # Value used to swap from current to next VRF location -+ vldh HX(0++,0)+r0,(r1 += r3) REP 16 -+ mov r4,64 # Constant used for rounding first pass -+ mov r5,TRANS_RND2 # Constant used for rounding second pass -+ -+ # At start of block r0,r1 point to the current block (that has already been loaded) -+block_loop: -+ eor r0,r8 -+ add r1,r7 -+ # Prefetch the next block -+ vldh HX(0++,0)+r0,(r1 += r3) REP 16 -+ eor r0,r8 -+ sub r1,r7 -+ -+ # Transform the current block -+ bl col_trans_16 -+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate -+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word. -+ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble? -+ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position -+ -+ bl col_trans_16 -+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate -+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word. -+ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? 
(Probably because it ends with ls which is interpreted as a condition flag) -+ -+ # Save results - note there has been a transposition during the processing so we save columns -+ vsth VX(0,32++)+r0, (r1 += r3) REP 16 -+ -+ # Move onto next block -+ eor r0,r8 -+ add r1,r7 -+ -+ addcmpbgt r2,-1,0,block_loop -+ -+ # Now go and do any 32x32 transforms -+ b hevc_trans_32x32 -+ -+ pop r6-r15, pc -+ -+# r1,r2,r3 r7,r8 should be preserved -+# HX(0++,0)+r0 is the block to be transformed -+# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients -+# Use HY(48,0) for intermediate results -+# r0 can be used, but should be returned to its original value at the end -+col_trans_16: -+ add r6,r0,16 # Final value for this loop -+col_trans_16_loop: -+ # First compute partial products for a single column -+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16 -+ # Then sum up the results and place back -+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC -+ addcmpblt r0,1,r6,col_trans_16_loop -+ sub r0,16 # put r0 back to its original value -+ b lr -+ -+col_trans_odd_16: -+ add r6,r0,16 # Final value for this loop -+col_trans_odd_16_loop: -+ # First compute partial products for a single column -+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16 -+ # Then sum up the results and place back -+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC -+ addcmpblt r0,1,r6,col_trans_odd_16_loop -+ sub r0,16 # put r0 back to its original value -+ b lr -+ -+# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num) -+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd -+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) -+# num: number of 16x16 transforms to be done -+# -+hevc_trans_32x32: -+ mov r1,r14 # coeffs -+ mov r2,r15 # num -+ -+ # Fetch odd transform matrix -+ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients) -+ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix -+ #add r0, 16*16*2 -+ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix -+ -+ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer -+ mov r7, 16*16*2 # Total block size -+ sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) -+ # set r8 to 32byte aligned stack pointer -+ add r8,sp,31 -+ lsr r8,5 -+ lsl r8,5 -+ mov r9,r8 # Backup of the temporary storage -+ mov r10,r1 # Backup of the coefficient buffer -+block_loop32: -+ -+ # COLUMN TRANSFORM -+ mov r4, 64 # Constant used for rounding first pass -+ mov r5, 9 # left shift used for rounding first pass -+ -+ # Transform the first 16 columns -+ mov r1,r10 # Input Coefficient buffer -+ mov r8,r9 # Output temporary storage -+ bl trans32 -+ # Transform the second 16 columns -+ add r8,32*16*2 -+ add r1,32 -+ bl trans32 -+ -+ # ROW TRANSFORM -+ mov r4, TRANS_RND2 # Constant used for rounding second pass -+ mov r5, TRANS_ASL2 # left shift used for rounding second pass -+ -+ mov r1,r9 # Input temporary storage -+ mov r8,r10 # Output Coefficient buffer -+ bl trans32 -+ # Transform the second 16 columns -+ add r8,32*16*2 -+ add r1,32 -+ bl trans32 -+ -+ add r10, 32*32*2 # move onto next block of coefficients -+ addcmpbgt r2,-1,0,block_loop32 -+ -+ add sp,sp,32*32*2+32 # Restore stack -+ -+ pop r6-r15, pc -+ -+trans32: -+ push lr -+ # We can no longer afford the VRF space to do prefetching when doing 32x32 -+ # Fetch 
the even rows -+ vldh HX(0++,0),(r1 += r3) REP 16 -+ # Fetch the odd rows -+ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1 -+ -+ # Transform the even rows using even matrix -+ mov r0, 0 # Even rows -+ bl col_trans_16 -+ -+ # Now transform the odd rows using odd matrix -+ mov r0, 64*16 # Odd rows -+ bl col_trans_odd_16 -+ -+ # Now apply butterfly to compute the first 16 results -+ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16 -+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, -+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate -+ # 16bit results now in HX(48,32) -+ mov r0,r8 -+ mov r6,32*2 -+ vsth VX(48,32++),(r0+=r6) REP 16 -+ -+ # Now apply butterfly to compute the second 16 results (in reverse order) -+ vsub HY(63,0),HY(0 ,0),HY(16,0) -+ vsub HY(62,0),HY(1 ,0),HY(17,0) -+ vsub HY(61,0),HY(2 ,0),HY(18,0) -+ vsub HY(60,0),HY(3 ,0),HY(19,0) -+ vsub HY(59,0),HY(4 ,0),HY(20,0) -+ vsub HY(58,0),HY(5 ,0),HY(21,0) -+ vsub HY(57,0),HY(6 ,0),HY(22,0) -+ vsub HY(56,0),HY(7 ,0),HY(23,0) -+ vsub HY(55,0),HY(8 ,0),HY(24,0) -+ vsub HY(54,0),HY(9 ,0),HY(25,0) -+ vsub HY(53,0),HY(10,0),HY(26,0) -+ vsub HY(52,0),HY(11,0),HY(27,0) -+ vsub HY(51,0),HY(12,0),HY(28,0) -+ vsub HY(50,0),HY(13,0),HY(29,0) -+ vsub HY(49,0),HY(14,0),HY(30,0) -+ vsub HY(48,0),HY(15,0),HY(31,0) -+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, -+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate -+ add r0,r8,32 -+ vsth VX(48,32++),(r0+=r6) REP 16 -+ pop pc -+ -+memclear16: -+ # r0 is address -+ # r1 is number of 16bits values to set to 0 (may overrun past end and clear more than specified) -+ vmov HX(0++,0),0 REP 16 -+ mov r2,32 -+loop: -+ vsth HX(0++,0),(r0+=r2) REP 16 -+ add r0,16*16*2 -+ sub r1,16*16 -+ cmp r1,0 -+ bgt loop -+ b lr -+ -+ -+################################################################################ -+# HEVC VPU Deblock -+# -+# Vertical edges before horizontal -+# Decision can change every 4 pixels, but only 8 pixel boundaries are deblocked -+# -+# ARM is responsible for storing beta and tc for each 4 pixels horiz and vert edge. -+# The VPU code works in units of 16x16 blocks. -+# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time). -+# One final horizontal filter is required at the end. -+# PCM is not allowed in this code. -+# -+# -+# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering) -+# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering. -+ -+.set P0,63 -+.set P1,62 -+.set P2,61 -+.set P3,60 -+.set Q0,59 -+.set Q1,58 -+.set Q2,57 -+.set Q3,56 -+ -+.set dp,32 -+.set dq,33 -+.set d,34 -+.set decision,35 -+.set beta,36 -+.set beta2,37 -+.set beta3,38 -+.set ptest,39 -+.set qtest,40 -+.set pqtest,41 -+.set thresh,42 -+.set deltatest, 44 -+.set deltap1, 45 -+.set tc25, 46 -+.set setup,47 -+.set tc,48 -+.set tc25,49 -+.set tc2, 50 -+.set do_filter, 51 -+.set delta, 52 -+.set tc10, 53 -+.set delta0, 54 -+.set delta1, 55 -+.set zeros, 0 -+.set setup_input, 1 -+.set deltaq1, 2 -+ -+ -+ -+# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image. 
-+# Row has num16 16x16 blocks across -+# Beta goes from 0 to 64 -+# tc goes from 0 to 24 -+# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number] -+# has 8 bytes per edge -+# has 16 bytes per direction -+# has 32 bytes per 16x16 block -+# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4)) -+hevc_deblock_16x16: -+ push r6-r15, lr -+ mov r9,r4 -+ mov r4,r3 -+ mov r13,r2 -+ mov r2,r0 -+ mov r10,r0 -+ subscale4 r0,r1 -+ mov r8,63 -+ mov r6,-3 -+ vmov H(zeros,0),0 -+# r7 is number of blocks still to load -+# r0 is location of current block - 4 * stride -+# r1 is stride -+# r2 is location of current block -+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical -+# r4 is setup -+# r5 is for temporary calculations -+# r8 holds 63 -+# r6 holds -3 -+# r9 holds the number of 16 high rows to process -+# r10 holds the original img base -+# r11 returns 0 if no filtering was done on the edge -+# r12 saves a copy of this -+# r13 is copy of width -+ -+process_row: -+ # First iteration does not do horizontal filtering on previous -+ mov r7, r13 -+ mov r3,0 -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) # We may wish to prefetch these -+ vstb H(zeros,0),(r4) -+ bl vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 -+ bl vert_filter -+ sub r3,8 -+ b start_deblock_loop -+deblock_loop: -+ # Middle iterations do vertical on current block and horizontal on preceding -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) -+ vstb H(zeros,0),(r4) -+ bl vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl vert_filter -+ sub r3,8 -+ vldb H(setup_input,0), -16(r4) -+ vstb H(zeros,0),-16(r4) -+ bl horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl horz_filter -+ sub r3,8*64 -+ addcmpbeq r12,0,0,skip_save_top -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+skip_save_top: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 -+start_deblock_loop: -+ # move onto next 16x16 (could do this with circular buffer support instead) -+ add r3,16 -+ and r3,r8 -+ add r4,32 -+ # Perform loop counter operations (may work with an addcmpbgt as well?) 
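
The walk that process_row implements above is a one-block pipeline: vertical edges are filtered as soon as a 16x16 block is loaded, while horizontal edges (which need final pixels above) lag one block behind, with a final horizontal pass after the loop. A scalar C sketch of that control flow - load_16x16/vert_filter_16x16/horz_filter_16x16/store_16x16 are illustrative names for the vldb / bl vert_filter / bl horz_filter / vstb sequences, not functions from this patch:

    extern void load_16x16(int x), vert_filter_16x16(int x),
                horz_filter_16x16(int x), store_16x16(int x);  // stubs

    static void process_row_sketch(int num16w) {
        for (int x = 0; x < num16w; x++) {
            load_16x16(x);                  // current block plus 4 lines of context above
            vert_filter_16x16(x);           // vertical edges as soon as the block arrives
            if (x > 0) {
                horz_filter_16x16(x - 1);   // horizontal edges run one block late
                store_16x16(x - 1);
            }
        }
        horz_filter_16x16(num16w - 1);      // final horizontal pass after the loop
        store_16x16(num16w - 1);
    }
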
-+ add r0,16 -+ add r2,16 -+ sub r7,1 -+ cmp r7,0 # Are there still more blocks to load -+ bgt deblock_loop -+ -+ # Final iteration needs to just do horizontal filtering -+ vldb H(setup_input,0), -16(r4) -+ vstb H(zeros,0),-16(r4) -+ bl horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl horz_filter -+ sub r3,64*8 -+ addcmpbeq r12,0,0,skip_save_top2 -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+skip_save_top2: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 -+ -+# Now look to see if we should do another row -+ sub r9,1 -+ cmp r9,0 -+ bgt start_again -+ pop r6-r15, pc -+start_again: -+ # Need to sort out r0,r2 to point to next row down -+ addscale16 r10,r1 -+ mov r2,r10 -+ subscale4 r0,r2,r1 -+ b process_row -+ -+ -+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered -+# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations -+ -+vert_filter: -+ push lr -+ -+ vmov HX(P3,0), V(16,12)+r3 -+ vmov HX(P2,0), V(16,13)+r3 -+ vmov HX(P1,0), V(16,14)+r3 -+ vmov HX(P0,0), V(16,15)+r3 -+ vmov HX(Q0,0), V(16,16)+r3 -+ vmov HX(Q1,0), V(16,17)+r3 -+ vmov HX(Q2,0), V(16,18)+r3 -+ vmov HX(Q3,0), V(16,19)+r3 -+ -+ bl do_luma_filter -+ -+ vadds V(16,13)+r3, HX(P2,0), 0 -+ vadds V(16,14)+r3, HX(P1,0), 0 -+ vadds V(16,15)+r3, HX(P0,0), 0 -+ # P3 and Q3 never change so don't bother saving back -+ vadds V(16,16)+r3, HX(Q0,0), 0 -+ vadds V(16,17)+r3, HX(Q1,0), 0 -+ vadds V(16,18)+r3, HX(Q2,0), 0 -+ -+ pop pc -+ -+# Filter edge at H(16,0)+r3 -+horz_filter: -+ push lr -+ -+ vmov HX(P3,0), H(12,0)+r3 -+ vmov HX(P2,0), H(13,0)+r3 -+ vmov HX(P1,0), H(14,0)+r3 -+ vmov HX(P0,0), H(15,0)+r3 -+ vmov HX(Q0,0), H(16,0)+r3 -+ vmov HX(Q1,0), H(17,0)+r3 -+ vmov HX(Q2,0), H(18,0)+r3 -+ vmov HX(Q3,0), H(19,0)+r3 -+ -+ bl do_luma_filter -+ -+ vadds H(13,0)+r3, HX(P2,0), 0 -+ vadds H(14,0)+r3, HX(P1,0), 0 -+ vadds H(15,0)+r3, HX(P0,0), 0 -+ # P3 and Q3 never change so don't bother saving back -+ vadds H(16,0)+r3, HX(Q0,0), 0 -+ vadds H(17,0)+r3, HX(Q1,0), 0 -+ vadds H(18,0)+r3, HX(Q2,0), 0 -+ -+ pop pc -+ -+# r4 points to array of beta/tc for each 4 length edge -+do_luma_filter: -+ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8 -+ valtl HX(beta,0),H(setup,0),H(setup,0) -+ valtu HX(tc,0),H(setup,0),H(setup,0) -+ vmul HX(tc25,0), HX(tc,0), 5 -+ vadd HX(tc25,0),HX(tc25,0), 1 -+ vasr HX(tc25,0), HX(tc25,0), 1 -+ -+ # Compute decision -+ vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1 -+ vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1 -+ vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0 -+ vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0 -+ -+ vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1 -+ vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1 -+ vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0 -+ vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0 -+ -+ vadd HX(d,0), HX(dp,0), HX(dq,0) -+ vasr HX(beta2,0),HX(beta,0),2 -+ vasr HX(beta3,0),HX(beta,0),3 -+ -+ # Compute flags that are negative if all conditions pass -+ vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC -+ vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC -+ vsub HX(decision,0), HX(decision,0), HX(beta3,0) SETF -+ -+ vdist HX(decision,0), HX(P0,0), HX(Q0,0) IFN -+ vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF -+ vadd HX(decision,0), HX(d,0), HX(d,0) IFN -+ vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF -+ vmov HX(decision,0), 1 IFNN -+ vadd 
H(decision,0),H(decision,3),0 IFN -+ vadd H(decision,16),H(decision,19),0 IFN -+ vmov -,HX(decision,0) SETF # N marks strong filter -+ vmov HX(decision,0), 1 IFNN # NN marks normal filter -+ -+ vadd HX(do_filter,0), HX(d,3), HX(d,0) -+ vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter -+ vmov HX(decision,0),0 IFNN # Z marks no filter -+ -+ # Expand out decision (currently valid one every 4 pixels) 0...1...2...3 -+ # First extract out even terms -+ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0.1.2.3 -+ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0123 -+ # Now expand back -+ valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233 -+ valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333 -+ -+ # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering -+ -+ # Do a quick check to see if there is anything to do -+ mov r11, 0 # Signal no filtering -+ vmov -,1 IFNZ SUMS r5 -+ cmp r5,0 -+ beq filtering_done -+ mov r11, 1 # Signal some filtering -+ # And whether there is any strong filtering -+ vmov -,1 IFN SUMS r5 -+ cmp r5,0 -+ beq normal_filtering -+ -+ ############################################################################## -+ # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!) -+ vshl HX(tc2,0), HX(tc,0), 1 # Note that in normal filtering tx2 is tc/2, while here it is tc*2 -+ -+ # Take a copy of the original pixels for use in decision calculation -+ vmov HX(P0,32),HX(P0,0) -+ vmov HX(Q0,32),HX(Q0,0) -+ vmov HX(P1,32),HX(P1,0) -+ vmov HX(Q1,32),HX(Q1,0) -+ vmov HX(P2,32),HX(P2,0) -+ vmov HX(Q2,32),HX(Q2,0) -+ -+ vadd -,HX(P2,32),4 CLRA SACC -+ vshl -,HX(P1,32),1 SACC -+ vshl -,HX(P0,32),1 SACC -+ vshl -,HX(Q0,32),1 SACC -+ vshl HX(delta,0),HX(Q1,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(P0,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN -+ -+ vadd -,HX(P2,32),2 CLRA SACC -+ vadd -,HX(P1,32),HX(P0,32) SACC -+ vshl HX(delta,0),HX(Q0,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 2 -+ vsub HX(delta,0),HX(delta,0),HX(P1,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN -+ -+ vadd -,HX(Q0,32),4 CLRA SACC -+ vadd -,HX(P1,32),HX(P0,32) SACC -+ vmul -,HX(P2,32),3 SACC -+ vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(P2,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN -+ #vmov HX(P2,0),3 IFN -+ -+ # Now reverse all P/Qs -+ -+ vadd -,HX(Q2,32),4 CLRA SACC -+ vshl -,HX(Q1,32),1 SACC -+ vshl -,HX(Q0,32),1 SACC -+ vshl -,HX(P0,32),1 SACC -+ vshl HX(delta,0),HX(P1,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(Q0,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN -+ -+ vadd -,HX(Q2,32),2 CLRA SACC -+ vadd -,HX(Q1,32),HX(Q0,32) SACC -+ vshl HX(delta,0),HX(P0,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 2 -+ vsub HX(delta,0),HX(delta,0),HX(Q1,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN -+ -+ vadd -,HX(P0,32),4 CLRA SACC -+ vadd -,HX(Q1,32),HX(Q0,32) SACC -+ vmul -,HX(Q2,32),3 SACC -+ vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub 
HX(delta,0),HX(delta,0),HX(Q2,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN -+ -+ ############################################################################## -+ # Normal filtering -+normal_filtering: -+ # Invert the decision flags -+ # make instruction more complicated as assembler has error and loses SETF -+ vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering -+ vmov -, HX(tc10,0) SETF # IFN means normal filtering -+ -+ vmov -,1 IFN SUMS r5 -+ cmp r5,0 -+ beq filtering_done -+ -+ vasr HX(tc2,0), HX(tc,0), 1 -+ vmul HX(tc10,0), HX(tc,0), 10 -+ -+ vasr HX(thresh,0), HX(beta,0), 1 -+ vadd HX(thresh,0), HX(thresh,0), HX(beta,0) -+ vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC -+ -+ vadd HX(ptest,0),HX(dp,3),HX(dp,0) -+ vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel -+ vadd HX(qtest,0),HX(dq,3),HX(dq,0) -+ vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel -+ # Expand ptest and qtest together -+ vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0) # p.p.p.p.q.q.q.q -+ vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........ -+ valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq -+ valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0) -+ valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0) -+ -+ vsub HX(delta0,0), HX(Q0,0), HX(P0,0) -+ vsub HX(delta1,0), HX(Q1,0), HX(P1,0) -+ vmov -,8 CLRA SACC -+ vmul -,HX(delta0,0), 9 SACC -+ vmul HX(delta0,0),HX(delta1,0), r6 SACC -+ vasr HX(delta0,0), HX(delta0,0), 4 -+ vdist HX(deltatest,0), HX(delta0,0), 0 -+ vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something -+ vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later -+ -+ vclamps HX(delta0,0), HX(delta0,0), HX(tc,0) -+ -+ vadd HX(deltap1,0), HX(P2,0), HX(P0,0) -+ vadd HX(deltap1,0), HX(deltap1,0), 1 -+ vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC -+ vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC -+ vasr HX(deltap1,0), HX(deltap1,0), 1 -+ vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0) -+ -+ vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0) -+ vadd HX(deltaq1,0), HX(deltaq1,0), 1 -+ vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC -+ vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0) -+ vrsub -, HX(delta0,0), 0 SACC -+ vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC -+ vasr HX(deltaq1,0), HX(deltaq1,0), 1 -+ vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0) -+ -+ vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN -+ vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN -+ -+ vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1 -+ vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN -+ -+ vmov -,HX(deltatest,0) SETF -+ vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1 -+ vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN -+ -+ #vmov HX(P2,0),1 IFN -+ -+filtering_done: -+ b lr -+ -+ -+hevc_uv_deblock_16x16: -+ push r6-r15, lr -+ mov r14,0 -+ b hevc_uv_start -+hevc_uv_deblock_16x16_with_clear: -+ push r6-r15, lr -+ mov r14,1 -+ b hevc_uv_start -+ -+hevc_uv_start: -+ mov r9,r4 -+ mov r4,r3 -+ mov r13,r2 -+ mov r2,r0 -+ mov r10,r0 -+ subscale4 r0,r1 -+ mov r8,63 -+ mov r6,-3 -+ vmov H(zeros,0),0 -+# r7 is number of blocks still to load -+# r0 is location of current block - 4 * stride -+# r1 is stride -+# r2 is location of current block -+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical -+# r4 is setup -+# r5 is for temporary calculations -+# r8 holds 63 -+# r6 holds -3 -+# r9 holds 
the number of 16 high rows to process -+# r10 holds the original img base -+# r11 returns 0 if no filtering was done on the edge -+# r12 saves a copy of this -+# r13 is copy of width -+# r14 is 1 if we should clear the old contents, or 0 if not -+ -+uv_process_row: -+ # First iteration does not do horizontal filtering on previous -+ mov r7, r13 -+ mov r3,0 -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) # We may wish to prefetch these -+ cmp r14,1 -+ bne uv_skip0 -+ vstb H(zeros,0),(r4) -+uv_skip0: -+ bl uv_vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 -+ bl uv_vert_filter -+ sub r3,8 -+ b uv_start_deblock_loop -+uv_deblock_loop: -+ # Middle iterations do vertical on current block and horizontal on preceding -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) -+ cmp r14,1 -+ bne uv_skip1 -+ vstb H(zeros,0),(r4) -+uv_skip1: -+ bl uv_vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl uv_vert_filter -+ sub r3,8 -+ vldb H(setup_input,0), -16(r4) -+ cmp r14,1 -+ bne uv_skip3 -+ vstb H(zeros,0),-16(r4) -+uv_skip3: -+ bl uv_horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl uv_horz_filter -+ sub r3,8*64 -+ addcmpbeq r12,0,0,uv_skip_save_top -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+uv_skip_save_top: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 -+uv_start_deblock_loop: -+ # move onto next 16x16 (could do this with circular buffer support instead) -+ add r3,16 -+ and r3,r8 -+ add r4,32 -+ # Perform loop counter operations (may work with an addcmpbgt as well?) 
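
Both uv_vert_filter and uv_horz_filter below funnel into do_chroma_filter, which is the standard HEVC chroma deblocking kernel: only P0 and Q0 move, by a tc-clamped delta. A per-pixel C sketch of that arithmetic (clip3 is an assumed helper standing in for the vclamps clamp and the vadds/vsubs saturation in the code):

    #include <stdint.h>

    static inline int clip3(int lo, int hi, int v) { return v < lo ? lo : (v > hi ? hi : v); }

    // p1,p0 | q0,q1 straddle the edge; p1/q1 are read but never written for chroma.
    static void chroma_filter_pixel(uint8_t *p0, uint8_t *q0, int p1, int q1, int tc) {
        int delta = clip3(-tc, tc, ((((*q0 - *p0) << 2) + p1 - q1 + 4) >> 3));
        *p0 = (uint8_t)clip3(0, 255, *p0 + delta);   // vadds saturates on write-back
        *q0 = (uint8_t)clip3(0, 255, *q0 - delta);   // vsubs likewise
    }
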
-+ add r0,16 -+ add r2,16 -+ sub r7,1 -+ cmp r7,0 # Are there still more blocks to load -+ bgt uv_deblock_loop -+ -+ # Final iteration needs to just do horizontal filtering -+ vldb H(setup_input,0), -16(r4) -+ cmp r14,1 -+ bne uv_skip2 -+ vstb H(zeros,0),-16(r4) -+uv_skip2: -+ bl uv_horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl uv_horz_filter -+ sub r3,64*8 -+ addcmpbeq r12,0,0,uv_skip_save_top2 -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+uv_skip_save_top2: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 -+ -+# Now look to see if we should do another row -+ sub r9,1 -+ cmp r9,0 -+ bgt uv_start_again -+ pop r6-r15, pc -+uv_start_again: -+ # Need to sort out r0,r2 to point to next row down -+ addscale16 r10,r1 -+ mov r2,r10 -+ subscale4 r0,r2,r1 -+ b uv_process_row -+ -+ -+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered -+# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations -+ -+uv_vert_filter: -+ push lr -+ -+ vmov HX(P1,0), V(16,14)+r3 -+ vmov HX(P0,0), V(16,15)+r3 -+ vmov HX(Q0,0), V(16,16)+r3 -+ vmov HX(Q1,0), V(16,17)+r3 -+ -+ bl do_chroma_filter -+ -+ vadds V(16,15)+r3, HX(P0,0), 0 -+ vadds V(16,16)+r3, HX(Q0,0), 0 -+ -+ pop pc -+ -+# Filter edge at H(16,0)+r3 -+uv_horz_filter: -+ push lr -+ -+ vmov HX(P1,0), H(14,0)+r3 -+ vmov HX(P0,0), H(15,0)+r3 -+ vmov HX(Q0,0), H(16,0)+r3 -+ vmov HX(Q1,0), H(17,0)+r3 -+ -+ bl do_chroma_filter -+ -+ vadds H(15,0)+r3, HX(P0,0), 0 -+ # P3 and Q3 never change so don't bother saving back -+ vadds H(16,0)+r3, HX(Q0,0), 0 -+ -+ pop pc -+ -+# r4 points to array of beta/tc for each 4 length edge -+do_chroma_filter: -+ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8 -+ valtl HX(tc,0),H(setup,0),H(setup,0) -+ -+ vsub HX(delta,0),HX(Q0,0),HX(P0,0) -+ vshl HX(delta,0),HX(delta,0),2 CLRA SACC -+ vsub -,HX(P1,0),HX(Q1,0) SACC -+ vmov HX(delta,0),4 SACC -+ vasr HX(delta,0),HX(delta,0),3 -+ vclamps HX(delta,0), HX(delta,0), HX(tc,0) -+ vadd HX(P0,0),HX(P0,0),HX(delta,0) -+ vsub HX(Q0,0),HX(Q0,0),HX(delta,0) -+ b lr -+ -+# r0 = list -+# r1 = number -+hevc_run_command_list: -+ push r6-r7, lr -+ mov r6, r0 -+ mov r7, r1 -+loop_cmds: -+ ld r0,(r6) # How to encode r6++? 
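
Each command in the list is six consecutive 32-bit words, loaded into r0-r5 and passed straight to hevc_trans_16x16, so the list is a packed array of six-word records. An equivalent C view - the struct and names are illustrative only:

    #include <stdint.h>

    typedef struct vpu_cmd_s { uint32_t arg[6]; } vpu_cmd_t;   // one call's r0..r5

    extern void hevc_trans_16x16_c(uint32_t r0, uint32_t r1, uint32_t r2,
                                   uint32_t r3, uint32_t r4, uint32_t r5);  // stub

    static void run_command_list_sketch(const vpu_cmd_t *list, int num) {
        for (int i = 0; i < num; i++)   // r7 counts down in the assembly
            hevc_trans_16x16_c(list[i].arg[0], list[i].arg[1], list[i].arg[2],
                               list[i].arg[3], list[i].arg[4], list[i].arg[5]);
    }
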
-+ add r6,4 -+ ld r1,(r6) -+ add r6,4 -+ ld r2,(r6) -+ add r6,4 -+ ld r3,(r6) -+ add r6,4 -+ ld r4,(r6) -+ add r6,4 -+ ld r5,(r6) -+ add r6,4 -+ bl hevc_trans_16x16 -+ sub r7,1 -+ cmp r7,0 -+ bgt loop_cmds -+ -+ pop r6-r7, pc -diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h -new file mode 100644 -index 0000000000..b0e9902d82 ---- /dev/null -+++ b/libavcodec/rpi_hevc_transform10.h -@@ -0,0 +1,3070 @@ -+static const unsigned char rpi_hevc_transform10 [] = { -+21, -+106, -+0, -+144, -+47, -+1, -+37, -+106, -+0, -+144, -+66, -+1, -+53, -+106, -+0, -+144, -+192, -+4, -+69, -+106, -+0, -+144, -+192, -+4, -+85, -+106, -+0, -+144, -+220, -+5, -+169, -+3, -+62, -+64, -+79, -+64, -+3, -+232, -+32, -+0, -+0, -+0, -+12, -+248, -+0, -+136, -+0, -+0, -+192, -+248, -+0, -+0, -+64, -+232, -+0, -+2, -+0, -+0, -+12, -+248, -+0, -+168, -+0, -+0, -+192, -+248, -+0, -+0, -+0, -+96, -+3, -+232, -+32, -+0, -+0, -+0, -+7, -+232, -+0, -+2, -+0, -+0, -+8, -+232, -+0, -+4, -+0, -+0, -+12, -+248, -+0, -+128, -+0, -+0, -+192, -+8, -+4, -+0, -+4, -+232, -+64, -+0, -+0, -+0, -+5, -+232, -+0, -+2, -+0, -+0, -+128, -+69, -+113, -+66, -+12, -+248, -+0, -+128, -+0, -+0, -+192, -+8, -+4, -+0, -+128, -+69, -+113, -+70, -+128, -+144, -+40, -+0, -+4, -+255, -+48, -+192, -+128, -+3, -+32, -+8, -+16, -+0, -+76, -+254, -+48, -+192, -+9, -+4, -+32, -+8, -+0, -+0, -+4, -+254, -+0, -+144, -+128, -+2, -+0, -+8, -+2, -+0, -+128, -+144, -+23, -+0, -+4, -+255, -+48, -+192, -+128, -+3, -+32, -+8, -+20, -+0, -+76, -+254, -+48, -+192, -+6, -+4, -+32, -+8, -+0, -+0, -+140, -+248, -+44, -+0, -+0, -+0, -+32, -+48, -+4, -+0, -+128, -+69, -+113, -+66, -+242, -+140, -+211, -+192, -+34, -+31, -+41, -+3, -+70, -+192, -+80, -+7, -+164, -+255, -+36, -+204, -+96, -+2, -+0, -+248, -+62, -+0, -+3, -+255, -+55, -+208, -+120, -+3, -+224, -+3, -+190, -+11, -+16, -+139, -+246, -+91, -+0, -+103, -+90, -+0, -+70, -+192, -+80, -+7, -+164, -+255, -+36, -+204, -+224, -+2, -+0, -+248, -+62, -+0, -+3, -+255, -+55, -+208, -+120, -+3, -+224, -+3, -+190, -+11, -+16, -+139, -+246, -+91, -+0, -+103, -+90, -+0, -+225, -+64, -+242, -+64, -+3, -+232, -+128, -+0, -+0, -+0, -+7, -+232, -+0, -+2, -+0, -+0, -+57, -+239, -+224, -+247, -+255, -+255, -+72, -+192, -+95, -+207, -+88, -+122, -+88, -+124, -+137, -+64, -+26, -+64, -+4, -+232, -+64, -+0, -+0, -+0, -+149, -+96, -+161, -+64, -+152, -+64, -+128, -+144, -+35, -+0, -+72, -+232, -+0, -+4, -+0, -+0, -+65, -+232, -+32, -+0, -+0, -+0, -+128, -+144, -+27, -+0, -+4, -+232, -+0, -+2, -+0, -+0, -+101, -+96, -+145, -+64, -+168, -+64, -+128, -+144, -+19, -+0, -+72, -+232, -+0, -+4, -+0, -+0, -+65, -+232, -+32, -+0, -+0, -+0, -+128, -+144, -+11, -+0, -+74, -+232, -+0, -+8, -+0, -+0, -+242, -+140, -+221, -+192, -+57, -+239, -+32, -+8, -+0, -+0, -+41, -+3, -+239, -+3, -+12, -+248, -+0, -+128, -+0, -+0, -+192, -+248, -+4, -+0, -+12, -+248, -+0, -+132, -+64, -+0, -+192, -+248, -+4, -+0, -+0, -+96, -+255, -+159, -+154, -+255, -+0, -+232, -+0, -+4, -+0, -+0, -+255, -+159, -+165, -+255, -+4, -+255, -+48, -+204, -+16, -+3, -+224, -+251, -+62, -+0, -+4, -+255, -+51, -+204, -+128, -+3, -+224, -+251, -+16, -+0, -+76, -+254, -+51, -+204, -+128, -+3, -+224, -+251, -+20, -+0, -+128, -+64, -+6, -+232, -+64, -+0, -+0, -+0, -+140, -+248, -+47, -+0, -+0, -+0, -+224, -+99, -+0, -+0, -+32, -+247, -+240, -+207, -+16, -+3, -+32, -+247, -+176, -+207, -+17, -+19, -+32, -+247, -+112, -+207, -+18, -+35, -+32, -+247, -+48, -+207, -+19, -+51, -+32, -+247, -+240, -+206, -+20, -+67, -+32, -+247, -+176, -+206, -+21, -+83, -+32, 
-+247, -+112, -+206, -+22, -+99, -+32, -+247, -+48, -+206, -+23, -+115, -+32, -+247, -+240, -+205, -+24, -+131, -+32, -+247, -+176, -+205, -+25, -+147, -+32, -+247, -+112, -+205, -+26, -+163, -+32, -+247, -+48, -+205, -+27, -+179, -+32, -+247, -+240, -+204, -+28, -+195, -+32, -+247, -+176, -+204, -+29, -+211, -+32, -+247, -+112, -+204, -+30, -+227, -+32, -+247, -+48, -+204, -+31, -+243, -+4, -+255, -+51, -+204, -+128, -+3, -+224, -+251, -+16, -+0, -+76, -+254, -+51, -+204, -+128, -+3, -+224, -+251, -+20, -+0, -+0, -+237, -+32, -+0, -+0, -+0, -+140, -+248, -+47, -+0, -+0, -+0, -+224, -+99, -+0, -+0, -+111, -+3, -+4, -+254, -+0, -+128, -+0, -+4, -+0, -+248, -+0, -+0, -+2, -+232, -+32, -+0, -+0, -+0, -+140, -+248, -+32, -+0, -+0, -+0, -+224, -+35, -+0, -+0, -+64, -+232, -+0, -+2, -+0, -+0, -+193, -+232, -+0, -+1, -+0, -+0, -+1, -+106, -+116, -+30, -+90, -+0, -+169, -+3, -+73, -+64, -+52, -+64, -+45, -+64, -+2, -+64, -+10, -+64, -+64, -+198, -+1, -+7, -+8, -+232, -+63, -+0, -+0, -+0, -+6, -+232, -+253, -+255, -+255, -+255, -+0, -+246, -+0, -+0, -+0, -+4, -+215, -+64, -+3, -+96, -+2, -+248, -+0, -+35, -+0, -+0, -+64, -+56, -+0, -+0, -+4, -+248, -+0, -+36, -+0, -+0, -+64, -+56, -+8, -+0, -+0, -+240, -+64, -+0, -+132, -+3, -+128, -+240, -+0, -+0, -+132, -+3, -+128, -+144, -+137, -+0, -+131, -+98, -+0, -+255, -+64, -+0, -+0, -+20, -+200, -+243, -+0, -+0, -+128, -+144, -+129, -+0, -+131, -+102, -+0, -+158, -+67, -+0, -+2, -+248, -+0, -+35, -+0, -+0, -+64, -+56, -+0, -+0, -+4, -+248, -+0, -+36, -+0, -+0, -+64, -+56, -+8, -+0, -+0, -+240, -+64, -+0, -+132, -+3, -+128, -+240, -+0, -+0, -+132, -+3, -+128, -+144, -+108, -+0, -+131, -+98, -+0, -+255, -+64, -+0, -+0, -+20, -+200, -+243, -+0, -+0, -+128, -+144, -+100, -+0, -+131, -+102, -+0, -+248, -+64, -+0, -+112, -+0, -+192, -+243, -+211, -+31, -+128, -+248, -+0, -+0, -+112, -+0, -+192, -+243, -+211, -+31, -+128, -+144, -+161, -+0, -+188, -+64, -+67, -+232, -+0, -+2, -+0, -+0, -+0, -+255, -+64, -+0, -+0, -+20, -+200, -+243, -+0, -+0, -+128, -+144, -+150, -+0, -+195, -+232, -+0, -+2, -+0, -+0, -+12, -+128, -+7, -+192, -+130, -+248, -+0, -+0, -+112, -+192, -+224, -+16, -+195, -+31, -+132, -+248, -+1, -+0, -+112, -+0, -+224, -+16, -+203, -+31, -+3, -+99, -+131, -+71, -+68, -+232, -+32, -+0, -+0, -+0, -+0, -+99, -+2, -+99, -+23, -+102, -+7, -+106, -+127, -+156, -+182, -+255, -+0, -+248, -+64, -+0, -+112, -+0, -+192, -+243, -+211, -+31, -+128, -+248, -+0, -+0, -+112, -+0, -+192, -+243, -+211, -+31, -+128, -+144, -+112, -+0, -+188, -+64, -+67, -+232, -+0, -+2, -+0, -+0, -+0, -+255, -+64, -+0, -+0, -+20, -+200, -+243, -+0, -+0, -+128, -+144, -+101, -+0, -+195, -+232, -+0, -+2, -+0, -+0, -+12, -+128, -+7, -+192, -+130, -+248, -+0, -+0, -+112, -+192, -+224, -+16, -+195, -+31, -+132, -+248, -+1, -+0, -+112, -+0, -+224, -+16, -+203, -+31, -+25, -+102, -+9, -+106, -+2, -+30, -+41, -+3, -+26, -+87, -+162, -+64, -+64, -+198, -+1, -+23, -+127, -+158, -+103, -+255, -+239, -+3, -+0, -+254, -+0, -+143, -+92, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+64, -+143, -+93, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+128, -+143, -+94, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+192, -+143, -+95, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+192, -+142, -+208, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+128, -+142, -+209, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+64, -+142, -+210, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+0, -+142, -+211, -+0, -+0, -+240, -+12, -+0, -+128, -+144, -+107, -+0, -+8, -+255, -+99, -+23, -+0, -+212, -+192, -+51, -+0, -+0, -+8, -+255, -+163, -+23, -+0, -+228, 
-+192, -+51, -+0, -+0, -+8, -+255, -+227, -+23, -+0, -+244, -+192, -+51, -+0, -+0, -+8, -+255, -+35, -+52, -+0, -+180, -+192, -+51, -+0, -+0, -+8, -+255, -+99, -+52, -+0, -+164, -+192, -+51, -+0, -+0, -+8, -+255, -+163, -+52, -+0, -+148, -+192, -+51, -+0, -+0, -+111, -+3, -+239, -+3, -+0, -+254, -+0, -+143, -+12, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+64, -+143, -+13, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+128, -+143, -+14, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+192, -+143, -+15, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+192, -+142, -+16, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+128, -+142, -+17, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+64, -+142, -+18, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+0, -+142, -+19, -+0, -+0, -+240, -+12, -+0, -+128, -+144, -+33, -+0, -+8, -+255, -+99, -+3, -+0, -+212, -+192, -+51, -+0, -+0, -+8, -+255, -+163, -+3, -+0, -+228, -+192, -+51, -+0, -+0, -+8, -+255, -+227, -+3, -+0, -+244, -+192, -+51, -+0, -+0, -+8, -+255, -+35, -+4, -+0, -+180, -+192, -+51, -+0, -+0, -+8, -+255, -+99, -+4, -+0, -+164, -+192, -+51, -+0, -+0, -+8, -+255, -+163, -+4, -+0, -+148, -+192, -+51, -+0, -+0, -+111, -+3, -+32, -+246, -+192, -+11, -+1, -+16, -+32, -+246, -+2, -+137, -+47, -+240, -+40, -+246, -+2, -+140, -+47, -+240, -+128, -+245, -+99, -+140, -+5, -+4, -+0, -+247, -+99, -+140, -+1, -+20, -+88, -+246, -+99, -+140, -+1, -+20, -+0, -+247, -+35, -+136, -+62, -+226, -+32, -+247, -+35, -+136, -+32, -+210, -+0, -+247, -+34, -+136, -+63, -+2, -+208, -+246, -+34, -+136, -+0, -+4, -+0, -+247, -+99, -+136, -+58, -+162, -+32, -+247, -+99, -+136, -+33, -+146, -+0, -+247, -+98, -+136, -+59, -+18, -+208, -+246, -+98, -+136, -+0, -+20, -+0, -+247, -+162, -+136, -+33, -+2, -+88, -+246, -+98, -+137, -+2, -+68, -+88, -+246, -+162, -+137, -+3, -+68, -+208, -+254, -+227, -+136, -+60, -+242, -+192, -+243, -+188, -+11, -+208, -+254, -+227, -+136, -+56, -+178, -+192, -+243, -+188, -+10, -+32, -+255, -+226, -+136, -+38, -+58, -+192, -+243, -+60, -+0, -+208, -+254, -+227, -+136, -+59, -+242, -+192, -+243, -+60, -+128, -+32, -+255, -+226, -+136, -+49, -+58, -+192, -+243, -+60, -+128, -+0, -+255, -+226, -+136, -+34, -+34, -+192, -+243, -+60, -+128, -+32, -+255, -+226, -+136, -+37, -+58, -+192, -+243, -+60, -+128, -+0, -+254, -+192, -+136, -+1, -+4, -+0, -+240, -+0, -+160, -+0, -+255, -+194, -+8, -+0, -+52, -+195, -+243, -+0, -+128, -+0, -+255, -+202, -+40, -+0, -+52, -+195, -+243, -+0, -+128, -+0, -+254, -+0, -+240, -+35, -+10, -+0, -+240, -+60, -+0, -+0, -+254, -+192, -+136, -+1, -+4, -+0, -+240, -+0, -+160, -+0, -+255, -+226, -+140, -+34, -+34, -+195, -+243, -+60, -+0, -+32, -+255, -+227, -+140, -+36, -+58, -+192, -+243, -+60, -+0, -+0, -+254, -+192, -+136, -+0, -+4, -+0, -+240, -+0, -+160, -+16, -+246, -+226, -+136, -+35, -+50, -+16, -+246, -+226, -+136, -+35, -+50, -+32, -+246, -+226, -+136, -+35, -+50, -+32, -+254, -+226, -+136, -+35, -+58, -+192, -+243, -+60, -+0, -+11, -+96, -+0, -+254, -+0, -+240, -+1, -+4, -+0, -+240, -+64, -+115, -+5, -+106, -+0, -+144, -+173, -+1, -+27, -+96, -+0, -+254, -+0, -+240, -+1, -+4, -+0, -+240, -+64, -+147, -+5, -+106, -+0, -+144, -+227, -+0, -+64, -+246, -+163, -+140, -+1, -+4, -+0, -+246, -+192, -+175, -+63, -+2, -+0, -+246, -+192, -+174, -+59, -+2, -+0, -+246, -+128, -+175, -+62, -+2, -+0, -+246, -+128, -+174, -+58, -+2, -+0, -+246, -+64, -+175, -+61, -+2, -+0, -+246, -+64, -+174, -+57, -+2, -+0, -+255, -+43, -+240, -+4, -+212, -+192, -+243, -+128, -+11, -+64, -+254, -+43, -+240, -+1, -+228, -+192, -+243, -+128, -+10, -+64, -+254, -+43, -+240, -+1, 
-+244, -+192, -+243, -+128, -+10, -+64, -+254, -+43, -+240, -+1, -+180, -+192, -+243, -+128, -+10, -+64, -+254, -+43, -+141, -+0, -+164, -+192, -+243, -+128, -+10, -+88, -+246, -+35, -+141, -+3, -+68, -+32, -+247, -+35, -+141, -+191, -+66, -+240, -+246, -+35, -+141, -+50, -+66, -+0, -+255, -+235, -+143, -+52, -+242, -+192, -+243, -+60, -+128, -+0, -+255, -+43, -+240, -+2, -+212, -+192, -+243, -+128, -+11, -+0, -+255, -+43, -+240, -+191, -+226, -+192, -+243, -+188, -+10, -+64, -+254, -+43, -+141, -+0, -+180, -+192, -+243, -+128, -+10, -+88, -+246, -+35, -+141, -+2, -+68, -+32, -+247, -+35, -+141, -+190, -+66, -+240, -+246, -+35, -+141, -+50, -+66, -+0, -+255, -+171, -+143, -+52, -+226, -+192, -+243, -+60, -+128, -+0, -+255, -+43, -+240, -+4, -+180, -+192, -+243, -+128, -+11, -+0, -+255, -+43, -+240, -+191, -+226, -+192, -+243, -+188, -+10, -+128, -+253, -+43, -+240, -+3, -+212, -+192, -+243, -+128, -+10, -+64, -+254, -+35, -+141, -+1, -+196, -+192, -+243, -+128, -+10, -+88, -+246, -+35, -+141, -+3, -+68, -+32, -+247, -+35, -+141, -+189, -+66, -+240, -+246, -+35, -+141, -+50, -+66, -+0, -+255, -+107, -+143, -+52, -+210, -+192, -+243, -+60, -+128, -+0, -+255, -+43, -+240, -+4, -+148, -+192, -+243, -+128, -+11, -+64, -+254, -+43, -+240, -+1, -+164, -+192, -+243, -+128, -+10, -+64, -+254, -+43, -+240, -+1, -+180, -+192, -+243, -+128, -+10, -+64, -+254, -+43, -+240, -+1, -+244, -+192, -+243, -+128, -+10, -+64, -+254, -+43, -+141, -+0, -+228, -+192, -+243, -+128, -+10, -+88, -+246, -+35, -+141, -+3, -+68, -+32, -+247, -+35, -+141, -+187, -+66, -+240, -+246, -+35, -+141, -+50, -+66, -+0, -+255, -+235, -+142, -+52, -+178, -+192, -+243, -+60, -+128, -+0, -+255, -+43, -+240, -+2, -+148, -+192, -+243, -+128, -+11, -+0, -+255, -+43, -+240, -+187, -+162, -+192, -+243, -+188, -+10, -+64, -+254, -+43, -+141, -+0, -+244, -+192, -+243, -+128, -+10, -+88, -+246, -+35, -+141, -+2, -+68, -+32, -+247, -+35, -+141, -+186, -+66, -+240, -+246, -+35, -+141, -+50, -+66, -+0, -+255, -+171, -+142, -+52, -+162, -+192, -+243, -+60, -+128, -+0, -+255, -+43, -+240, -+4, -+244, -+192, -+243, -+128, -+11, -+0, -+255, -+43, -+240, -+187, -+162, -+192, -+243, -+188, -+10, -+128, -+253, -+43, -+240, -+3, -+148, -+192, -+243, -+128, -+10, -+64, -+254, -+35, -+141, -+1, -+132, -+192, -+243, -+128, -+10, -+88, -+246, -+35, -+141, -+3, -+68, -+32, -+247, -+35, -+141, -+185, -+66, -+240, -+246, -+35, -+141, -+50, -+66, -+0, -+255, -+107, -+142, -+52, -+146, -+192, -+243, -+60, -+128, -+64, -+255, -+98, -+141, -+0, -+52, -+192, -+243, -+0, -+0, -+0, -+254, -+0, -+240, -+53, -+10, -+0, -+240, -+60, -+0, -+0, -+254, -+0, -+240, -+1, -+4, -+0, -+240, -+64, -+147, -+5, -+106, -+0, -+144, -+177, -+0, -+88, -+246, -+163, -+140, -+1, -+4, -+128, -+245, -+99, -+141, -+10, -+4, -+88, -+246, -+162, -+138, -+1, -+68, -+0, -+247, -+162, -+138, -+36, -+162, -+88, -+254, -+162, -+138, -+3, -+164, -+192, -+243, -+128, -+11, -+0, -+255, -+226, -+137, -+32, -+2, -+195, -+243, -+60, -+0, -+32, -+247, -+226, -+137, -+42, -+114, -+0, -+255, -+34, -+138, -+33, -+18, -+195, -+243, -+60, -+0, -+32, -+247, -+34, -+138, -+42, -+130, -+16, -+246, -+98, -+138, -+40, -+114, -+16, -+246, -+98, -+138, -+41, -+146, -+32, -+246, -+98, -+138, -+41, -+146, -+32, -+246, -+226, -+137, -+41, -+146, -+40, -+246, -+34, -+138, -+41, -+146, -+32, -+247, -+163, -+141, -+63, -+178, -+32, -+247, -+227, -+141, -+62, -+162, -+0, -+254, -+0, -+240, -+8, -+4, -+0, -+240, -+128, -+11, -+128, -+253, -+35, -+240, -+9, -+100, -+192, -+243, -+128, -+10, -+128, -+253, -+163, -+141, 
-+128, -+115, -+192, -+243, -+152, -+10, -+88, -+246, -+163, -+141, -+4, -+100, -+208, -+246, -+35, -+139, -+0, -+100, -+32, -+255, -+34, -+139, -+53, -+202, -+192, -+243, -+60, -+128, -+0, -+254, -+0, -+139, -+0, -+4, -+0, -+240, -+0, -+160, -+240, -+246, -+163, -+141, -+48, -+98, -+0, -+247, -+99, -+139, -+63, -+210, -+0, -+247, -+98, -+139, -+1, -+212, -+88, -+254, -+98, -+139, -+1, -+212, -+192, -+243, -+128, -+11, -+32, -+255, -+99, -+139, -+62, -+98, -+192, -+243, -+188, -+10, -+88, -+246, -+98, -+139, -+1, -+212, -+240, -+246, -+98, -+139, -+50, -+210, -+0, -+247, -+163, -+128, -+59, -+146, -+0, -+247, -+160, -+128, -+1, -+36, -+88, -+254, -+160, -+128, -+1, -+36, -+192, -+243, -+128, -+11, -+0, -+247, -+163, -+128, -+58, -+98, -+64, -+255, -+35, -+240, -+0, -+100, -+192, -+243, -+128, -+10, -+64, -+255, -+163, -+128, -+0, -+164, -+192, -+243, -+128, -+10, -+88, -+246, -+160, -+128, -+1, -+36, -+240, -+246, -+160, -+128, -+50, -+34, -+8, -+255, -+227, -+143, -+54, -+242, -+192, -+243, -+60, -+128, -+40, -+255, -+227, -+142, -+54, -+178, -+192, -+243, -+60, -+128, -+0, -+254, -+0, -+240, -+39, -+10, -+0, -+240, -+60, -+128, -+8, -+255, -+163, -+143, -+45, -+226, -+192, -+243, -+60, -+128, -+0, -+254, -+0, -+240, -+44, -+10, -+0, -+240, -+60, -+0, -+0, -+254, -+0, -+240, -+40, -+10, -+0, -+240, -+60, -+128, -+8, -+255, -+163, -+142, -+2, -+162, -+192, -+243, -+60, -+128, -+90, -+0, -+169, -+3, -+14, -+96, -+4, -+31, -+169, -+3, -+30, -+96, -+1, -+31, -+73, -+64, -+52, -+64, -+45, -+64, -+2, -+64, -+10, -+64, -+64, -+198, -+1, -+7, -+8, -+232, -+63, -+0, -+0, -+0, -+6, -+232, -+253, -+255, -+255, -+255, -+0, -+246, -+0, -+0, -+0, -+4, -+215, -+64, -+3, -+96, -+2, -+248, -+0, -+35, -+0, -+0, -+64, -+56, -+0, -+0, -+4, -+248, -+0, -+36, -+0, -+0, -+64, -+56, -+8, -+0, -+0, -+240, -+64, -+0, -+132, -+3, -+30, -+106, -+132, -+24, -+128, -+240, -+0, -+0, -+132, -+3, -+128, -+144, -+143, -+0, -+131, -+98, -+0, -+255, -+64, -+0, -+0, -+20, -+200, -+243, -+0, -+0, -+128, -+144, -+135, -+0, -+131, -+102, -+0, -+158, -+71, -+0, -+2, -+248, -+0, -+35, -+0, -+0, -+64, -+56, -+0, -+0, -+4, -+248, -+0, -+36, -+0, -+0, -+64, -+56, -+8, -+0, -+0, -+240, -+64, -+0, -+132, -+3, -+30, -+106, -+132, -+24, -+128, -+240, -+0, -+0, -+132, -+3, -+128, -+144, -+112, -+0, -+131, -+98, -+0, -+255, -+64, -+0, -+0, -+20, -+200, -+243, -+0, -+0, -+128, -+144, -+104, -+0, -+131, -+102, -+0, -+248, -+64, -+0, -+112, -+0, -+192, -+243, -+211, -+31, -+30, -+106, -+134, -+24, -+128, -+248, -+0, -+0, -+112, -+0, -+192, -+243, -+211, -+31, -+128, -+144, -+123, -+0, -+188, -+64, -+67, -+232, -+0, -+2, -+0, -+0, -+0, -+255, -+64, -+0, -+0, -+20, -+200, -+243, -+0, -+0, -+128, -+144, -+112, -+0, -+195, -+232, -+0, -+2, -+0, -+0, -+12, -+128, -+7, -+192, -+130, -+248, -+0, -+0, -+112, -+192, -+224, -+16, -+195, -+31, -+132, -+248, -+1, -+0, -+112, -+0, -+224, -+16, -+203, -+31, -+3, -+99, -+131, -+71, -+68, -+232, -+32, -+0, -+0, -+0, -+0, -+99, -+2, -+99, -+23, -+102, -+7, -+106, -+127, -+156, -+178, -+255, -+0, -+248, -+64, -+0, -+112, -+0, -+192, -+243, -+211, -+31, -+30, -+106, -+134, -+24, -+128, -+248, -+0, -+0, -+112, -+0, -+192, -+243, -+211, -+31, -+128, -+144, -+72, -+0, -+188, -+64, -+67, -+232, -+0, -+2, -+0, -+0, -+0, -+255, -+64, -+0, -+0, -+20, -+200, -+243, -+0, -+0, -+128, -+144, -+61, -+0, -+195, -+232, -+0, -+2, -+0, -+0, -+12, -+128, -+7, -+192, -+130, -+248, -+0, -+0, -+112, -+192, -+224, -+16, -+195, -+31, -+132, -+248, -+1, -+0, -+112, -+0, -+224, -+16, -+203, -+31, -+25, -+102, -+9, -+106, -+2, -+30, 
-+41, -+3, -+26, -+87, -+162, -+64, -+64, -+198, -+1, -+23, -+127, -+158, -+95, -+255, -+239, -+3, -+0, -+254, -+128, -+143, -+94, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+192, -+143, -+95, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+192, -+142, -+208, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+128, -+142, -+209, -+0, -+0, -+240, -+12, -+0, -+128, -+144, -+47, -+0, -+8, -+255, -+227, -+23, -+0, -+244, -+192, -+51, -+0, -+0, -+8, -+255, -+35, -+52, -+0, -+180, -+192, -+51, -+0, -+0, -+111, -+3, -+239, -+3, -+0, -+254, -+128, -+143, -+14, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+192, -+143, -+15, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+192, -+142, -+16, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+128, -+142, -+17, -+0, -+0, -+240, -+12, -+0, -+128, -+144, -+13, -+0, -+8, -+255, -+227, -+3, -+0, -+244, -+192, -+51, -+0, -+0, -+8, -+255, -+35, -+4, -+0, -+180, -+192, -+51, -+0, -+0, -+111, -+3, -+32, -+246, -+192, -+11, -+1, -+16, -+32, -+246, -+2, -+140, -+47, -+240, -+32, -+247, -+35, -+141, -+63, -+178, -+64, -+254, -+35, -+141, -+2, -+68, -+192, -+243, -+128, -+11, -+32, -+255, -+35, -+240, -+58, -+226, -+192, -+243, -+188, -+10, -+0, -+254, -+0, -+141, -+4, -+4, -+0, -+240, -+128, -+10, -+88, -+246, -+35, -+141, -+3, -+68, -+240, -+246, -+35, -+141, -+48, -+66, -+0, -+247, -+227, -+143, -+52, -+242, -+32, -+247, -+227, -+142, -+52, -+178, -+90, -+0, -+161, -+3, -+6, -+64, -+23, -+64, -+96, -+8, -+70, -+98, -+97, -+8, -+70, -+98, -+98, -+8, -+70, -+98, -+99, -+8, -+70, -+98, -+100, -+8, -+70, -+98, -+101, -+8, -+70, -+98, -+255, -+159, -+8, -+250, -+23, -+102, -+7, -+106, -+112, -+30, -+33, -+3, -+}; -diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h -new file mode 100644 -index 0000000000..2901b6568d ---- /dev/null -+++ b/libavcodec/rpi_hevc_transform8.h -@@ -0,0 +1,3070 @@ -+static const unsigned char rpi_hevc_transform8 [] = { -+21, -+106, -+0, -+144, -+47, -+1, -+37, -+106, -+0, -+144, -+66, -+1, -+53, -+106, -+0, -+144, -+192, -+4, -+69, -+106, -+0, -+144, -+192, -+4, -+85, -+106, -+0, -+144, -+220, -+5, -+169, -+3, -+62, -+64, -+79, -+64, -+3, -+232, -+32, -+0, -+0, -+0, -+12, -+248, -+0, -+136, -+0, -+0, -+192, -+248, -+0, -+0, -+64, -+232, -+0, -+2, -+0, -+0, -+12, -+248, -+0, -+168, -+0, -+0, -+192, -+248, -+0, -+0, -+0, -+96, -+3, -+232, -+32, -+0, -+0, -+0, -+7, -+232, -+0, -+2, -+0, -+0, -+8, -+232, -+0, -+4, -+0, -+0, -+12, -+248, -+0, -+128, -+0, -+0, -+192, -+8, -+4, -+0, -+4, -+232, -+64, -+0, -+0, -+0, -+5, -+232, -+0, -+8, -+0, -+0, -+128, -+69, -+113, -+66, -+12, -+248, -+0, -+128, -+0, -+0, -+192, -+8, -+4, -+0, -+128, -+69, -+113, -+70, -+128, -+144, -+40, -+0, -+4, -+255, -+48, -+192, -+128, -+3, -+32, -+8, -+16, -+0, -+76, -+254, -+48, -+192, -+9, -+4, -+32, -+8, -+0, -+0, -+4, -+254, -+0, -+144, -+128, -+2, -+0, -+8, -+2, -+0, -+128, -+144, -+23, -+0, -+4, -+255, -+48, -+192, -+128, -+3, -+32, -+8, -+20, -+0, -+76, -+254, -+48, -+192, -+4, -+4, -+32, -+8, -+0, -+0, -+140, -+248, -+44, -+0, -+0, -+0, -+32, -+48, -+4, -+0, -+128, -+69, -+113, -+66, -+242, -+140, -+211, -+192, -+34, -+31, -+41, -+3, -+70, -+192, -+80, -+7, -+164, -+255, -+36, -+204, -+96, -+2, -+0, -+248, -+62, -+0, -+3, -+255, -+55, -+208, -+120, -+3, -+224, -+3, -+190, -+11, -+16, -+139, -+246, -+91, -+0, -+103, -+90, -+0, -+70, -+192, -+80, -+7, -+164, -+255, -+36, -+204, -+224, -+2, -+0, -+248, -+62, -+0, -+3, -+255, -+55, -+208, -+120, -+3, -+224, -+3, -+190, -+11, -+16, -+139, -+246, -+91, -+0, -+103, -+90, -+0, -+225, -+64, -+242, -+64, -+3, -+232, -+128, -+0, -+0, 
-+0, -+7, -+232, -+0, -+2, -+0, -+0, -+57, -+239, -+224, -+247, -+255, -+255, -+72, -+192, -+95, -+207, -+88, -+122, -+88, -+124, -+137, -+64, -+26, -+64, -+4, -+232, -+64, -+0, -+0, -+0, -+149, -+96, -+161, -+64, -+152, -+64, -+128, -+144, -+35, -+0, -+72, -+232, -+0, -+4, -+0, -+0, -+65, -+232, -+32, -+0, -+0, -+0, -+128, -+144, -+27, -+0, -+4, -+232, -+0, -+8, -+0, -+0, -+69, -+96, -+145, -+64, -+168, -+64, -+128, -+144, -+19, -+0, -+72, -+232, -+0, -+4, -+0, -+0, -+65, -+232, -+32, -+0, -+0, -+0, -+128, -+144, -+11, -+0, -+74, -+232, -+0, -+8, -+0, -+0, -+242, -+140, -+221, -+192, -+57, -+239, -+32, -+8, -+0, -+0, -+41, -+3, -+239, -+3, -+12, -+248, -+0, -+128, -+0, -+0, -+192, -+248, -+4, -+0, -+12, -+248, -+0, -+132, -+64, -+0, -+192, -+248, -+4, -+0, -+0, -+96, -+255, -+159, -+154, -+255, -+0, -+232, -+0, -+4, -+0, -+0, -+255, -+159, -+165, -+255, -+4, -+255, -+48, -+204, -+16, -+3, -+224, -+251, -+62, -+0, -+4, -+255, -+51, -+204, -+128, -+3, -+224, -+251, -+16, -+0, -+76, -+254, -+51, -+204, -+128, -+3, -+224, -+251, -+20, -+0, -+128, -+64, -+6, -+232, -+64, -+0, -+0, -+0, -+140, -+248, -+47, -+0, -+0, -+0, -+224, -+99, -+0, -+0, -+32, -+247, -+240, -+207, -+16, -+3, -+32, -+247, -+176, -+207, -+17, -+19, -+32, -+247, -+112, -+207, -+18, -+35, -+32, -+247, -+48, -+207, -+19, -+51, -+32, -+247, -+240, -+206, -+20, -+67, -+32, -+247, -+176, -+206, -+21, -+83, -+32, -+247, -+112, -+206, -+22, -+99, -+32, -+247, -+48, -+206, -+23, -+115, -+32, -+247, -+240, -+205, -+24, -+131, -+32, -+247, -+176, -+205, -+25, -+147, -+32, -+247, -+112, -+205, -+26, -+163, -+32, -+247, -+48, -+205, -+27, -+179, -+32, -+247, -+240, -+204, -+28, -+195, -+32, -+247, -+176, -+204, -+29, -+211, -+32, -+247, -+112, -+204, -+30, -+227, -+32, -+247, -+48, -+204, -+31, -+243, -+4, -+255, -+51, -+204, -+128, -+3, -+224, -+251, -+16, -+0, -+76, -+254, -+51, -+204, -+128, -+3, -+224, -+251, -+20, -+0, -+0, -+237, -+32, -+0, -+0, -+0, -+140, -+248, -+47, -+0, -+0, -+0, -+224, -+99, -+0, -+0, -+111, -+3, -+4, -+254, -+0, -+128, -+0, -+4, -+0, -+248, -+0, -+0, -+2, -+232, -+32, -+0, -+0, -+0, -+140, -+248, -+32, -+0, -+0, -+0, -+224, -+35, -+0, -+0, -+64, -+232, -+0, -+2, -+0, -+0, -+193, -+232, -+0, -+1, -+0, -+0, -+1, -+106, -+116, -+30, -+90, -+0, -+169, -+3, -+73, -+64, -+52, -+64, -+45, -+64, -+2, -+64, -+10, -+64, -+64, -+198, -+1, -+7, -+8, -+232, -+63, -+0, -+0, -+0, -+6, -+232, -+253, -+255, -+255, -+255, -+0, -+246, -+0, -+0, -+0, -+4, -+215, -+64, -+3, -+96, -+2, -+248, -+0, -+35, -+0, -+0, -+64, -+56, -+0, -+0, -+4, -+248, -+0, -+36, -+0, -+0, -+64, -+56, -+8, -+0, -+0, -+240, -+64, -+0, -+132, -+3, -+128, -+240, -+0, -+0, -+132, -+3, -+128, -+144, -+137, -+0, -+131, -+98, -+0, -+255, -+64, -+0, -+0, -+20, -+200, -+243, -+0, -+0, -+128, -+144, -+129, -+0, -+131, -+102, -+0, -+158, -+67, -+0, -+2, -+248, -+0, -+35, -+0, -+0, -+64, -+56, -+0, -+0, -+4, -+248, -+0, -+36, -+0, -+0, -+64, -+56, -+8, -+0, -+0, -+240, -+64, -+0, -+132, -+3, -+128, -+240, -+0, -+0, -+132, -+3, -+128, -+144, -+108, -+0, -+131, -+98, -+0, -+255, -+64, -+0, -+0, -+20, -+200, -+243, -+0, -+0, -+128, -+144, -+100, -+0, -+131, -+102, -+0, -+248, -+64, -+0, -+112, -+0, -+192, -+243, -+211, -+31, -+128, -+248, -+0, -+0, -+112, -+0, -+192, -+243, -+211, -+31, -+128, -+144, -+161, -+0, -+188, -+64, -+67, -+232, -+0, -+2, -+0, -+0, -+0, -+255, -+64, -+0, -+0, -+20, -+200, -+243, -+0, -+0, -+128, -+144, -+150, -+0, -+195, -+232, -+0, -+2, -+0, -+0, -+12, -+128, -+7, -+192, -+130, -+248, -+0, -+0, -+112, -+192, -+224, -+16, -+195, -+31, 
-+132, -+248, -+1, -+0, -+112, -+0, -+224, -+16, -+203, -+31, -+3, -+99, -+131, -+71, -+68, -+232, -+32, -+0, -+0, -+0, -+0, -+99, -+2, -+99, -+23, -+102, -+7, -+106, -+127, -+156, -+182, -+255, -+0, -+248, -+64, -+0, -+112, -+0, -+192, -+243, -+211, -+31, -+128, -+248, -+0, -+0, -+112, -+0, -+192, -+243, -+211, -+31, -+128, -+144, -+112, -+0, -+188, -+64, -+67, -+232, -+0, -+2, -+0, -+0, -+0, -+255, -+64, -+0, -+0, -+20, -+200, -+243, -+0, -+0, -+128, -+144, -+101, -+0, -+195, -+232, -+0, -+2, -+0, -+0, -+12, -+128, -+7, -+192, -+130, -+248, -+0, -+0, -+112, -+192, -+224, -+16, -+195, -+31, -+132, -+248, -+1, -+0, -+112, -+0, -+224, -+16, -+203, -+31, -+25, -+102, -+9, -+106, -+2, -+30, -+41, -+3, -+26, -+87, -+162, -+64, -+64, -+198, -+1, -+23, -+127, -+158, -+103, -+255, -+239, -+3, -+0, -+254, -+0, -+143, -+92, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+64, -+143, -+93, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+128, -+143, -+94, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+192, -+143, -+95, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+192, -+142, -+208, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+128, -+142, -+209, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+64, -+142, -+210, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+0, -+142, -+211, -+0, -+0, -+240, -+12, -+0, -+128, -+144, -+107, -+0, -+8, -+255, -+99, -+23, -+0, -+212, -+192, -+51, -+0, -+0, -+8, -+255, -+163, -+23, -+0, -+228, -+192, -+51, -+0, -+0, -+8, -+255, -+227, -+23, -+0, -+244, -+192, -+51, -+0, -+0, -+8, -+255, -+35, -+52, -+0, -+180, -+192, -+51, -+0, -+0, -+8, -+255, -+99, -+52, -+0, -+164, -+192, -+51, -+0, -+0, -+8, -+255, -+163, -+52, -+0, -+148, -+192, -+51, -+0, -+0, -+111, -+3, -+239, -+3, -+0, -+254, -+0, -+143, -+12, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+64, -+143, -+13, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+128, -+143, -+14, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+192, -+143, -+15, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+192, -+142, -+16, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+128, -+142, -+17, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+64, -+142, -+18, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+0, -+142, -+19, -+0, -+0, -+240, -+12, -+0, -+128, -+144, -+33, -+0, -+8, -+255, -+99, -+3, -+0, -+212, -+192, -+51, -+0, -+0, -+8, -+255, -+163, -+3, -+0, -+228, -+192, -+51, -+0, -+0, -+8, -+255, -+227, -+3, -+0, -+244, -+192, -+51, -+0, -+0, -+8, -+255, -+35, -+4, -+0, -+180, -+192, -+51, -+0, -+0, -+8, -+255, -+99, -+4, -+0, -+164, -+192, -+51, -+0, -+0, -+8, -+255, -+163, -+4, -+0, -+148, -+192, -+51, -+0, -+0, -+111, -+3, -+32, -+246, -+192, -+11, -+1, -+16, -+32, -+246, -+2, -+137, -+47, -+240, -+40, -+246, -+2, -+140, -+47, -+240, -+128, -+245, -+99, -+140, -+5, -+4, -+0, -+247, -+99, -+140, -+1, -+20, -+88, -+246, -+99, -+140, -+1, -+20, -+0, -+247, -+35, -+136, -+62, -+226, -+32, -+247, -+35, -+136, -+32, -+210, -+0, -+247, -+34, -+136, -+63, -+2, -+208, -+246, -+34, -+136, -+0, -+4, -+0, -+247, -+99, -+136, -+58, -+162, -+32, -+247, -+99, -+136, -+33, -+146, -+0, -+247, -+98, -+136, -+59, -+18, -+208, -+246, -+98, -+136, -+0, -+20, -+0, -+247, -+162, -+136, -+33, -+2, -+88, -+246, -+98, -+137, -+2, -+68, -+88, -+246, -+162, -+137, -+3, -+68, -+208, -+254, -+227, -+136, -+60, -+242, -+192, -+243, -+188, -+11, -+208, -+254, -+227, -+136, -+56, -+178, -+192, -+243, -+188, -+10, -+32, -+255, -+226, -+136, -+38, -+58, -+192, -+243, -+60, -+0, -+208, -+254, -+227, -+136, -+59, -+242, -+192, -+243, -+60, -+128, -+32, -+255, -+226, -+136, -+49, -+58, -+192, -+243, -+60, -+128, -+0, -+255, -+226, -+136, -+34, -+34, 
-+192, -+243, -+60, -+128, -+32, -+255, -+226, -+136, -+37, -+58, -+192, -+243, -+60, -+128, -+0, -+254, -+192, -+136, -+1, -+4, -+0, -+240, -+0, -+160, -+0, -+255, -+194, -+8, -+0, -+52, -+195, -+243, -+0, -+128, -+0, -+255, -+202, -+40, -+0, -+52, -+195, -+243, -+0, -+128, -+0, -+254, -+0, -+240, -+35, -+10, -+0, -+240, -+60, -+0, -+0, -+254, -+192, -+136, -+1, -+4, -+0, -+240, -+0, -+160, -+0, -+255, -+226, -+140, -+34, -+34, -+195, -+243, -+60, -+0, -+32, -+255, -+227, -+140, -+36, -+58, -+192, -+243, -+60, -+0, -+0, -+254, -+192, -+136, -+0, -+4, -+0, -+240, -+0, -+160, -+16, -+246, -+226, -+136, -+35, -+50, -+16, -+246, -+226, -+136, -+35, -+50, -+32, -+246, -+226, -+136, -+35, -+50, -+32, -+254, -+226, -+136, -+35, -+58, -+192, -+243, -+60, -+0, -+11, -+96, -+0, -+254, -+0, -+240, -+1, -+4, -+0, -+240, -+64, -+115, -+5, -+106, -+0, -+144, -+173, -+1, -+27, -+96, -+0, -+254, -+0, -+240, -+1, -+4, -+0, -+240, -+64, -+147, -+5, -+106, -+0, -+144, -+227, -+0, -+64, -+246, -+163, -+140, -+1, -+4, -+0, -+246, -+192, -+175, -+63, -+2, -+0, -+246, -+192, -+174, -+59, -+2, -+0, -+246, -+128, -+175, -+62, -+2, -+0, -+246, -+128, -+174, -+58, -+2, -+0, -+246, -+64, -+175, -+61, -+2, -+0, -+246, -+64, -+174, -+57, -+2, -+0, -+255, -+43, -+240, -+4, -+212, -+192, -+243, -+128, -+11, -+64, -+254, -+43, -+240, -+1, -+228, -+192, -+243, -+128, -+10, -+64, -+254, -+43, -+240, -+1, -+244, -+192, -+243, -+128, -+10, -+64, -+254, -+43, -+240, -+1, -+180, -+192, -+243, -+128, -+10, -+64, -+254, -+43, -+141, -+0, -+164, -+192, -+243, -+128, -+10, -+88, -+246, -+35, -+141, -+3, -+68, -+32, -+247, -+35, -+141, -+191, -+66, -+240, -+246, -+35, -+141, -+50, -+66, -+0, -+255, -+235, -+143, -+52, -+242, -+192, -+243, -+60, -+128, -+0, -+255, -+43, -+240, -+2, -+212, -+192, -+243, -+128, -+11, -+0, -+255, -+43, -+240, -+191, -+226, -+192, -+243, -+188, -+10, -+64, -+254, -+43, -+141, -+0, -+180, -+192, -+243, -+128, -+10, -+88, -+246, -+35, -+141, -+2, -+68, -+32, -+247, -+35, -+141, -+190, -+66, -+240, -+246, -+35, -+141, -+50, -+66, -+0, -+255, -+171, -+143, -+52, -+226, -+192, -+243, -+60, -+128, -+0, -+255, -+43, -+240, -+4, -+180, -+192, -+243, -+128, -+11, -+0, -+255, -+43, -+240, -+191, -+226, -+192, -+243, -+188, -+10, -+128, -+253, -+43, -+240, -+3, -+212, -+192, -+243, -+128, -+10, -+64, -+254, -+35, -+141, -+1, -+196, -+192, -+243, -+128, -+10, -+88, -+246, -+35, -+141, -+3, -+68, -+32, -+247, -+35, -+141, -+189, -+66, -+240, -+246, -+35, -+141, -+50, -+66, -+0, -+255, -+107, -+143, -+52, -+210, -+192, -+243, -+60, -+128, -+0, -+255, -+43, -+240, -+4, -+148, -+192, -+243, -+128, -+11, -+64, -+254, -+43, -+240, -+1, -+164, -+192, -+243, -+128, -+10, -+64, -+254, -+43, -+240, -+1, -+180, -+192, -+243, -+128, -+10, -+64, -+254, -+43, -+240, -+1, -+244, -+192, -+243, -+128, -+10, -+64, -+254, -+43, -+141, -+0, -+228, -+192, -+243, -+128, -+10, -+88, -+246, -+35, -+141, -+3, -+68, -+32, -+247, -+35, -+141, -+187, -+66, -+240, -+246, -+35, -+141, -+50, -+66, -+0, -+255, -+235, -+142, -+52, -+178, -+192, -+243, -+60, -+128, -+0, -+255, -+43, -+240, -+2, -+148, -+192, -+243, -+128, -+11, -+0, -+255, -+43, -+240, -+187, -+162, -+192, -+243, -+188, -+10, -+64, -+254, -+43, -+141, -+0, -+244, -+192, -+243, -+128, -+10, -+88, -+246, -+35, -+141, -+2, -+68, -+32, -+247, -+35, -+141, -+186, -+66, -+240, -+246, -+35, -+141, -+50, -+66, -+0, -+255, -+171, -+142, -+52, -+162, -+192, -+243, -+60, -+128, -+0, -+255, -+43, -+240, -+4, -+244, -+192, -+243, -+128, -+11, -+0, -+255, -+43, -+240, -+187, -+162, -+192, -+243, 
-+188, -+10, -+128, -+253, -+43, -+240, -+3, -+148, -+192, -+243, -+128, -+10, -+64, -+254, -+35, -+141, -+1, -+132, -+192, -+243, -+128, -+10, -+88, -+246, -+35, -+141, -+3, -+68, -+32, -+247, -+35, -+141, -+185, -+66, -+240, -+246, -+35, -+141, -+50, -+66, -+0, -+255, -+107, -+142, -+52, -+146, -+192, -+243, -+60, -+128, -+64, -+255, -+98, -+141, -+0, -+52, -+192, -+243, -+0, -+0, -+0, -+254, -+0, -+240, -+53, -+10, -+0, -+240, -+60, -+0, -+0, -+254, -+0, -+240, -+1, -+4, -+0, -+240, -+64, -+147, -+5, -+106, -+0, -+144, -+177, -+0, -+88, -+246, -+163, -+140, -+1, -+4, -+128, -+245, -+99, -+141, -+10, -+4, -+88, -+246, -+162, -+138, -+1, -+68, -+0, -+247, -+162, -+138, -+36, -+162, -+88, -+254, -+162, -+138, -+3, -+164, -+192, -+243, -+128, -+11, -+0, -+255, -+226, -+137, -+32, -+2, -+195, -+243, -+60, -+0, -+32, -+247, -+226, -+137, -+42, -+114, -+0, -+255, -+34, -+138, -+33, -+18, -+195, -+243, -+60, -+0, -+32, -+247, -+34, -+138, -+42, -+130, -+16, -+246, -+98, -+138, -+40, -+114, -+16, -+246, -+98, -+138, -+41, -+146, -+32, -+246, -+98, -+138, -+41, -+146, -+32, -+246, -+226, -+137, -+41, -+146, -+40, -+246, -+34, -+138, -+41, -+146, -+32, -+247, -+163, -+141, -+63, -+178, -+32, -+247, -+227, -+141, -+62, -+162, -+0, -+254, -+0, -+240, -+8, -+4, -+0, -+240, -+128, -+11, -+128, -+253, -+35, -+240, -+9, -+100, -+192, -+243, -+128, -+10, -+128, -+253, -+163, -+141, -+128, -+115, -+192, -+243, -+152, -+10, -+88, -+246, -+163, -+141, -+4, -+100, -+208, -+246, -+35, -+139, -+0, -+100, -+32, -+255, -+34, -+139, -+53, -+202, -+192, -+243, -+60, -+128, -+0, -+254, -+0, -+139, -+0, -+4, -+0, -+240, -+0, -+160, -+240, -+246, -+163, -+141, -+48, -+98, -+0, -+247, -+99, -+139, -+63, -+210, -+0, -+247, -+98, -+139, -+1, -+212, -+88, -+254, -+98, -+139, -+1, -+212, -+192, -+243, -+128, -+11, -+32, -+255, -+99, -+139, -+62, -+98, -+192, -+243, -+188, -+10, -+88, -+246, -+98, -+139, -+1, -+212, -+240, -+246, -+98, -+139, -+50, -+210, -+0, -+247, -+163, -+128, -+59, -+146, -+0, -+247, -+160, -+128, -+1, -+36, -+88, -+254, -+160, -+128, -+1, -+36, -+192, -+243, -+128, -+11, -+0, -+247, -+163, -+128, -+58, -+98, -+64, -+255, -+35, -+240, -+0, -+100, -+192, -+243, -+128, -+10, -+64, -+255, -+163, -+128, -+0, -+164, -+192, -+243, -+128, -+10, -+88, -+246, -+160, -+128, -+1, -+36, -+240, -+246, -+160, -+128, -+50, -+34, -+8, -+255, -+227, -+143, -+54, -+242, -+192, -+243, -+60, -+128, -+40, -+255, -+227, -+142, -+54, -+178, -+192, -+243, -+60, -+128, -+0, -+254, -+0, -+240, -+39, -+10, -+0, -+240, -+60, -+128, -+8, -+255, -+163, -+143, -+45, -+226, -+192, -+243, -+60, -+128, -+0, -+254, -+0, -+240, -+44, -+10, -+0, -+240, -+60, -+0, -+0, -+254, -+0, -+240, -+40, -+10, -+0, -+240, -+60, -+128, -+8, -+255, -+163, -+142, -+2, -+162, -+192, -+243, -+60, -+128, -+90, -+0, -+169, -+3, -+14, -+96, -+4, -+31, -+169, -+3, -+30, -+96, -+1, -+31, -+73, -+64, -+52, -+64, -+45, -+64, -+2, -+64, -+10, -+64, -+64, -+198, -+1, -+7, -+8, -+232, -+63, -+0, -+0, -+0, -+6, -+232, -+253, -+255, -+255, -+255, -+0, -+246, -+0, -+0, -+0, -+4, -+215, -+64, -+3, -+96, -+2, -+248, -+0, -+35, -+0, -+0, -+64, -+56, -+0, -+0, -+4, -+248, -+0, -+36, -+0, -+0, -+64, -+56, -+8, -+0, -+0, -+240, -+64, -+0, -+132, -+3, -+30, -+106, -+132, -+24, -+128, -+240, -+0, -+0, -+132, -+3, -+128, -+144, -+143, -+0, -+131, -+98, -+0, -+255, -+64, -+0, -+0, -+20, -+200, -+243, -+0, -+0, -+128, -+144, -+135, -+0, -+131, -+102, -+0, -+158, -+71, -+0, -+2, -+248, -+0, -+35, -+0, -+0, -+64, -+56, -+0, -+0, -+4, -+248, -+0, -+36, -+0, -+0, -+64, -+56, -+8, 
-+0, -+0, -+240, -+64, -+0, -+132, -+3, -+30, -+106, -+132, -+24, -+128, -+240, -+0, -+0, -+132, -+3, -+128, -+144, -+112, -+0, -+131, -+98, -+0, -+255, -+64, -+0, -+0, -+20, -+200, -+243, -+0, -+0, -+128, -+144, -+104, -+0, -+131, -+102, -+0, -+248, -+64, -+0, -+112, -+0, -+192, -+243, -+211, -+31, -+30, -+106, -+134, -+24, -+128, -+248, -+0, -+0, -+112, -+0, -+192, -+243, -+211, -+31, -+128, -+144, -+123, -+0, -+188, -+64, -+67, -+232, -+0, -+2, -+0, -+0, -+0, -+255, -+64, -+0, -+0, -+20, -+200, -+243, -+0, -+0, -+128, -+144, -+112, -+0, -+195, -+232, -+0, -+2, -+0, -+0, -+12, -+128, -+7, -+192, -+130, -+248, -+0, -+0, -+112, -+192, -+224, -+16, -+195, -+31, -+132, -+248, -+1, -+0, -+112, -+0, -+224, -+16, -+203, -+31, -+3, -+99, -+131, -+71, -+68, -+232, -+32, -+0, -+0, -+0, -+0, -+99, -+2, -+99, -+23, -+102, -+7, -+106, -+127, -+156, -+178, -+255, -+0, -+248, -+64, -+0, -+112, -+0, -+192, -+243, -+211, -+31, -+30, -+106, -+134, -+24, -+128, -+248, -+0, -+0, -+112, -+0, -+192, -+243, -+211, -+31, -+128, -+144, -+72, -+0, -+188, -+64, -+67, -+232, -+0, -+2, -+0, -+0, -+0, -+255, -+64, -+0, -+0, -+20, -+200, -+243, -+0, -+0, -+128, -+144, -+61, -+0, -+195, -+232, -+0, -+2, -+0, -+0, -+12, -+128, -+7, -+192, -+130, -+248, -+0, -+0, -+112, -+192, -+224, -+16, -+195, -+31, -+132, -+248, -+1, -+0, -+112, -+0, -+224, -+16, -+203, -+31, -+25, -+102, -+9, -+106, -+2, -+30, -+41, -+3, -+26, -+87, -+162, -+64, -+64, -+198, -+1, -+23, -+127, -+158, -+95, -+255, -+239, -+3, -+0, -+254, -+128, -+143, -+94, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+192, -+143, -+95, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+192, -+142, -+208, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+128, -+142, -+209, -+0, -+0, -+240, -+12, -+0, -+128, -+144, -+47, -+0, -+8, -+255, -+227, -+23, -+0, -+244, -+192, -+51, -+0, -+0, -+8, -+255, -+35, -+52, -+0, -+180, -+192, -+51, -+0, -+0, -+111, -+3, -+239, -+3, -+0, -+254, -+128, -+143, -+14, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+192, -+143, -+15, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+192, -+142, -+16, -+0, -+0, -+240, -+12, -+0, -+0, -+254, -+128, -+142, -+17, -+0, -+0, -+240, -+12, -+0, -+128, -+144, -+13, -+0, -+8, -+255, -+227, -+3, -+0, -+244, -+192, -+51, -+0, -+0, -+8, -+255, -+35, -+4, -+0, -+180, -+192, -+51, -+0, -+0, -+111, -+3, -+32, -+246, -+192, -+11, -+1, -+16, -+32, -+246, -+2, -+140, -+47, -+240, -+32, -+247, -+35, -+141, -+63, -+178, -+64, -+254, -+35, -+141, -+2, -+68, -+192, -+243, -+128, -+11, -+32, -+255, -+35, -+240, -+58, -+226, -+192, -+243, -+188, -+10, -+0, -+254, -+0, -+141, -+4, -+4, -+0, -+240, -+128, -+10, -+88, -+246, -+35, -+141, -+3, -+68, -+240, -+246, -+35, -+141, -+48, -+66, -+0, -+247, -+227, -+143, -+52, -+242, -+32, -+247, -+227, -+142, -+52, -+178, -+90, -+0, -+161, -+3, -+6, -+64, -+23, -+64, -+96, -+8, -+70, -+98, -+97, -+8, -+70, -+98, -+98, -+8, -+70, -+98, -+99, -+8, -+70, -+98, -+100, -+8, -+70, -+98, -+101, -+8, -+70, -+98, -+255, -+159, -+8, -+250, -+23, -+102, -+7, -+106, -+112, -+30, -+33, -+3, -+}; -diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c -new file mode 100644 -index 0000000000..0255f5dd44 ---- /dev/null -+++ b/libavcodec/rpi_mailbox.c -@@ -0,0 +1,149 @@ -+/* -+Copyright (c) 2012, Broadcom Europe Ltd. -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. 
-+ * Redistributions in binary form must reproduce the above copyright
-+ notice, this list of conditions and the following disclaimer in the
-+ documentation and/or other materials provided with the distribution.
-+ * Neither the name of the copyright holder nor the
-+ names of its contributors may be used to endorse or promote products
-+ derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+*/
-+
-+#ifdef RPI
-+
-+#include <stdio.h>
-+#include <string.h>
-+#include <stdlib.h>
-+#include <fcntl.h>
-+#include <unistd.h>
-+#include <assert.h>
-+#include <stdint.h>
-+#include <sys/mman.h>
-+
-+#include <sys/ioctl.h>
-+
-+#define MAJOR_NUM 100
-+#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
-+#define DEVICE_FILE_NAME "/dev/vcio"
-+
-+#include "rpi_mailbox.h"
-+//#include
-+
-+/*
-+ * use ioctl to send mbox property message
-+ */
-+
-+static int mbox_property(int file_desc, void *buf)
-+{
-+   int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
-+
-+   if (ret_val < 0) {
-+      printf("ioctl_set_msg failed:%d\n", ret_val);
-+   }
-+
-+#ifdef DEBUG
-+   unsigned *p = buf; int i; unsigned size = *(unsigned *)buf;
-+   for (i=0; i<size/4; i++)
-+      printf("%04x: 0x%08x\n", i*sizeof *p, p[i]);
-+#endif
-+
-+   return ret_val;
-+}
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-new file mode 100644
---- /dev/null
-+++ b/libavcodec/rpi_qpu.c
-+#include <stdio.h>
-+#include <stdlib.h>
-+#include <string.h>
-+#include <stddef.h>
-+#include <stdint.h>
-+#include "libavutil/avassert.h"
-+
-+#include "config.h"
-+
-+#include <pthread.h>
-+#include <time.h>
-+
-+#include <interface/vcsm/user-vcsm.h>
-+
-+#include "rpi_mailbox.h"
-+#include "rpi_qpu.h"
-+#include "rpi_shader.h"
-+#include "rpi_hevc_transform8.h"
-+#include "rpi_hevc_transform10.h"
-+#include "libavutil/rpi_sand_fns.h"
-+
-+#pragma GCC diagnostic push
-+// Many many redundant decls in the header files
-+#pragma GCC diagnostic ignored "-Wredundant-decls"
-+#include "interface/vmcs_host/vc_vchi_gpuserv.h"
-+#pragma GCC diagnostic pop
-+
-+// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No)
-+#define RPI_TRACE_TIME_VPU_QPU_WAIT 0
-+
-+// Add profile flags to all QPU requests - generates output in "vcdbg log msg"
-+// Beware this is expensive and will probably throw off all other timing by >10%
-+#define RPI_TRACE_QPU_PROFILE_ALL 0
-+
-+// QPU "noflush" flags
-+// a mixture of flushing & profiling
-+
-+#define QPU_FLAGS_NO_FLUSH_VPU 1 // If unset VPU cache will be flushed
-+#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2 // Clear & Enable detailed QPU profiling registers
-+#define QPU_FLAGS_PROF_OUTPUT_COUNTS 4 // Print the results
-+#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independent of the profiling
-+#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed)
-+
-+#define vcos_verify_ge0(x) ((x)>=0)
-+
-+// Size in 32bit words
-+#define QPU_CODE_SIZE 4098
-+#define VPU_CODE_SIZE 2048
-+
-+static const short rpi_transMatrix2even[32][16] = { // Even rows first
-+{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
-+{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90},
-+{89, 75, 50, 18, -18,
-50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89}, -+{87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87}, -+{83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83}, -+{80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80}, -+{75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75}, -+{70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70}, -+{64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64}, -+{57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57}, -+{50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50}, -+{43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43}, -+{36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36}, -+{25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25}, -+{18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18}, -+{ 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9}, -+// Odd rows -+{90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4}, -+{90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13}, -+{88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22}, -+{85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31}, -+{82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38}, -+{78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46}, -+{73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54}, -+{67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61}, -+{61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67}, -+{54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73}, -+{46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78}, -+{38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82}, -+{31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85}, -+{22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88}, -+{13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90}, -+{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90} -+}; -+ -+// Code/constants on GPU -+struct GPU -+{ -+ unsigned int qpu_code[QPU_CODE_SIZE]; -+ unsigned int vpu_code8[VPU_CODE_SIZE]; -+ unsigned int vpu_code10[VPU_CODE_SIZE]; -+ short transMatrix2even[16*16*2]; -+}; -+ -+#define CFE_ENTS_PER_A 8 -+// If we have a sliced frame 2048 wide @ 64 per slice then there are 32 slices -+// in a line & we want to flush luma + chroma + a couple of bits so ents ~ 70 -+// allow 128 -+#define CFE_ENT_COUNT 128 -+#define CFE_A_COUNT (CFE_ENT_COUNT / CFE_ENTS_PER_A) -+ -+struct rpi_cache_flush_env_s { -+// unsigned int n; -+// struct vcsm_user_clean_invalid_s a[CFE_A_COUNT]; -+ struct vcsm_user_clean_invalid2_s v; -+}; -+ -+#define WAIT_COUNT_MAX 16 -+ -+typedef struct trace_time_one_s -+{ -+ int count; -+ int64_t start[WAIT_COUNT_MAX]; -+ int64_t total[WAIT_COUNT_MAX]; -+} trace_time_one_t; -+ -+typedef struct trace_time_wait_s -+{ -+ unsigned int jcount; -+ int64_t start0; -+ int64_t last_update; -+ trace_time_one_t active; -+ trace_time_one_t wait; -+} trace_time_wait_t; -+ -+typedef struct vq_wait_s -+{ -+ sem_t sem; -+ struct vq_wait_s * next; -+} vq_wait_t; -+ -+#define VQ_WAIT_POOL_SIZE 16 -+typedef struct vq_wait_pool_s -+{ -+ vq_wait_t * head; -+ vq_wait_t pool[VQ_WAIT_POOL_SIZE]; -+} vq_wait_pool_t; -+ -+static void vq_wait_pool_init(vq_wait_pool_t * 
const pool); -+static void vq_wait_pool_deinit(vq_wait_pool_t * const pool); -+ -+typedef struct gpu_env_s -+{ -+ int open_count; -+ int init_count; -+ int mb; -+ int vpu_i_cache_flushed; -+ GPU_MEM_PTR_T code_gm_ptr; -+ vq_wait_pool_t wait_pool; -+#if RPI_TRACE_TIME_VPU_QPU_WAIT -+ trace_time_wait_t ttw; -+#endif -+} gpu_env_t; -+ -+// Stop more than one thread trying to allocate memory or use the processing resources at once -+static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER; -+static gpu_env_t * gpu = NULL; -+ -+#if RPI_TRACE_TIME_VPU_QPU_WAIT -+ -+static int64_t ns_time(void) -+{ -+ struct timespec ts; -+ clock_gettime(CLOCK_MONOTONIC, &ts); -+ return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec; -+} -+ -+ -+#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000 -+ -+#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U) -+#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000) -+#define T_ARG(t) T_SEC(t), T_MS(t) -+#define T_FMT "%u.%03u" -+ -+static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix) -+{ -+ // Update totals for levels that are still pending -+ for (int i = 0; i < tto->count; ++i) { -+ tto->total[i] += now - tto->start[i]; -+ tto->start[i] = now; -+ } -+ -+ printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n", -+ prefix, -+ T_ARG(now - start0 - tto->total[0]), -+ T_ARG(tto->total[0]), -+ T_ARG(tto->total[1]), -+ T_ARG(tto->total[2]), -+ T_ARG(tto->total[3])); -+} -+ -+ -+static void tto_start(trace_time_one_t * const tto, const int64_t now) -+{ -+ av_assert0(tto->count < WAIT_COUNT_MAX); -+ tto->start[tto->count++] = now; -+} -+ -+static void tto_end(trace_time_one_t * const tto, const int64_t now) -+{ -+ const int n = --tto->count; -+ av_assert0(n >= 0); -+ tto->total[n] += now - tto->start[n]; -+} -+ -+static void ttw_print(trace_time_wait_t * const ttw, const int64_t now) -+{ -+ printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0)); -+ tto_print(&ttw->active, now, ttw->start0, "Active"); -+ tto_print(&ttw->wait, now, ttw->start0, " Wait"); -+} -+ -+#endif -+ -+// GPU memory alloc fns (internal) -+ -+// GPU_MEM_PTR_T alloc fns -+static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) { -+ p->numbytes = (numbytes + 255) & ~255; // Round up -+ p->vcsm_handle = vcsm_malloc_cache(p->numbytes, VCSM_CACHE_TYPE_HOST | 0x80, (char *)"Video Frame" ); -+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); -+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); -+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); -+ av_assert0(p->vcsm_handle); -+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle); -+ av_assert0(p->vc_handle); -+ p->arm = vcsm_lock(p->vcsm_handle); -+ av_assert0(p->arm); -+ p->vc = mbox_mem_lock(mb, p->vc_handle); -+ av_assert0(p->vc); -+// printf("***** %s, %d\n", __func__, numbytes); -+ -+ return 0; -+} -+ -+static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) { -+ p->numbytes = numbytes; -+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE | 0x80, (char *)"Video Frame" ); -+ av_assert0(p->vcsm_handle); -+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle); -+ av_assert0(p->vc_handle); -+ p->arm = vcsm_lock(p->vcsm_handle); -+ av_assert0(p->arm); -+ p->vc = mbox_mem_lock(mb, p->vc_handle); -+ 
av_assert0(p->vc); -+// printf("***** %s, %d\n", __func__, numbytes); -+ return 0; -+} -+ -+static void gpu_free_internal(const int mb, GPU_MEM_PTR_T * const p) { -+ mbox_mem_unlock(mb, p->vc_handle); -+ vcsm_unlock_ptr(p->arm); -+ vcsm_free(p->vcsm_handle); -+ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again -+// printf("***** %s\n", __func__); -+} -+ -+ -+// GPU init, free, lock, unlock -+ -+static void gpu_term(void) -+{ -+ gpu_env_t * const ge = gpu; -+ -+ // We have to hope that eveything has terminated... -+ gpu = NULL; -+ -+ vc_gpuserv_deinit(); -+ -+ gpu_free_internal(ge->mb, &ge->code_gm_ptr); -+ -+ vcsm_exit(); -+ -+ mbox_close(ge->mb); -+ -+ vq_wait_pool_deinit(&ge->wait_pool); -+ -+ free(ge); -+} -+ -+ -+// Connect to QPU, returns 0 on success. -+static int gpu_init(gpu_env_t ** const gpu) { -+ volatile struct GPU* ptr; -+ gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t)); -+ *gpu = NULL; -+ -+ if (ge == NULL) -+ return -1; -+ -+ if ((ge->mb = mbox_open()) < 0) -+ return -1; -+ -+ vq_wait_pool_init(&ge->wait_pool); -+ -+ vcsm_init(); -+ -+ gpu_malloc_uncached_internal(ge->mb, sizeof(struct GPU), &ge->code_gm_ptr); -+ ptr = (volatile struct GPU*)ge->code_gm_ptr.arm; -+ -+ // Zero everything so we have zeros between the code bits -+ memset((void *)ptr, 0, sizeof(*ptr)); -+ -+ // Now copy over the QPU code into GPU memory -+ { -+ int num_bytes = (char *)mc_end - (char *)rpi_shader; -+ av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int)); -+ memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes); -+ } -+ // And the VPU code -+ { -+ int num_bytes = sizeof(rpi_hevc_transform8); -+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); -+ memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes); -+ } -+ { -+ int num_bytes = sizeof(rpi_hevc_transform10); -+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); -+ memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes); -+ } -+ // And the transform coefficients -+ memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even)); -+ -+ *gpu = ge; -+ return 0; -+} -+ -+ -+ -+static void gpu_unlock(void) { -+ pthread_mutex_unlock(&gpu_mutex); -+} -+ -+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary. -+static gpu_env_t * gpu_lock(void) { -+ pthread_mutex_lock(&gpu_mutex); -+ -+ av_assert0(gpu != NULL); -+ return gpu; -+} -+ -+static gpu_env_t * gpu_lock_ref(void) -+{ -+ pthread_mutex_lock(&gpu_mutex); -+ -+ if (gpu == NULL) { -+ int rv = gpu_init(&gpu); -+ if (rv != 0) { -+ gpu_unlock(); -+ return NULL; -+ } -+ } -+ -+ ++gpu->open_count; -+ return gpu; -+} -+ -+static void gpu_unlock_unref(gpu_env_t * const ge) -+{ -+ if (--ge->open_count == 0) -+ gpu_term(); -+ -+ gpu_unlock(); -+} -+ -+static inline gpu_env_t * gpu_ptr(void) -+{ -+ av_assert0(gpu != NULL); -+ return gpu; -+} -+ -+// Public gpu fns -+ -+// Allocate memory on GPU -+// Fills in structure
containing ARM pointer, videocore handle, videocore memory address, numbytes -+// Returns 0 on success. -+// This allocates memory that will not be cached in ARM's data cache. -+// Therefore safe to use without data cache flushing. -+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p) -+{ -+ int r; -+ gpu_env_t * const ge = gpu_lock_ref(); -+ if (ge == NULL) -+ return -1; -+ r = gpu_malloc_uncached_internal(ge->mb, numbytes, p); -+ gpu_unlock(); -+ return r; -+} -+ -+// This allocates data that will be -+// Cached in ARM L2 -+// Uncached in VPU L2 -+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) -+{ -+ int r; -+ gpu_env_t * const ge = gpu_lock_ref(); -+ if (ge == NULL) -+ return -1; -+ r = gpu_malloc_cached_internal(ge->mb, numbytes, p); -+ gpu_unlock(); -+ return r; -+} -+ -+void gpu_free(GPU_MEM_PTR_T * const p) { -+ gpu_env_t * const ge = gpu_lock(); -+ gpu_free_internal(ge->mb, p); -+ gpu_unlock_unref(ge); -+} -+ -+unsigned int vpu_get_fn(const unsigned int bit_depth) { -+ // Make sure that the gpu is initialized -+ av_assert0(gpu != NULL); -+ switch (bit_depth){ -+ case 8: -+ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8); -+ case 10: -+ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10); -+ default: -+ av_assert0(0); -+ } -+ return 0; -+} -+ -+unsigned int vpu_get_constants(void) { -+ av_assert0(gpu != NULL); -+ return gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even); -+} -+ -+int gpu_get_mailbox(void) -+{ -+ av_assert0(gpu); -+ return gpu->mb; -+} -+ -+void gpu_ref(void) -+{ -+ gpu_lock_ref(); -+ gpu_unlock(); -+} -+ -+void gpu_unref(void) -+{ -+ gpu_env_t * const ge = gpu_lock(); -+ gpu_unlock_unref(ge); -+} -+ -+// ---------------------------------------------------------------------------- -+// -+// Cache flush functions -+ -+#define CACHE_EL_MAX 16 -+ -+rpi_cache_flush_env_t * rpi_cache_flush_init() -+{ -+ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t) + -+ sizeof(struct vcsm_user_clean_invalid2_block_s) * CACHE_EL_MAX); -+ if (rfe == NULL) -+ return NULL; -+ -+ rfe->v.op_count = 0; -+ return rfe; -+} -+ -+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe) -+{ -+ if (rfe != NULL) -+ free(rfe); -+} -+ -+int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe) -+{ -+ int rc = 0; -+ if (rfe->v.op_count != 0) { -+ if (vcsm_clean_invalid2(&rfe->v) != 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid2 failed: errno=%d\n", errno); -+ rc = -1; -+ } -+ rfe->v.op_count = 0; -+ } -+ return rc; -+} -+ -+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe) -+{ -+ int rc = rpi_cache_flush_execute(rfe);; -+ -+ free(rfe); -+ return rc; -+} -+ -+inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, -+ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride) -+{ -+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; -+ -+ av_assert0(rfe->v.op_count <= CACHE_EL_MAX); -+ -+ b->invalidate_mode = mode; -+ b->block_count = blocks; -+ b->start_address = gm->arm + offset0; -+ b->block_size = block_size; -+ b->inter_block_stride = block_stride; -+} -+ -+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, -+ const unsigned int offset, const unsigned int size) -+{ -+ // Deal with empty pointer trivially -+ if (gm == NULL || size == 0) -+ return; -+ -+ 
av_assert0(offset <= gm->numbytes); -+ av_assert0(size <= gm->numbytes); -+ av_assert0(offset + size <= gm->numbytes); -+ -+ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0); -+} -+ -+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) -+{ -+ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0); -+} -+ -+ -+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode) -+{ -+#if !RPI_ONE_BUF -+#error Fixme! (NIF) -+#endif -+ if (gpu_is_buf1(frame)) { -+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode); -+ } -+ else -+ { -+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode); -+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode); -+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode); -+ } -+} -+ -+// Flush an area of a frame -+// Width, height, x0, y0 in luma pels -+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, -+ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, -+ const unsigned int uv_shift, const int do_luma, const int do_chroma) -+{ -+ const unsigned int y_offset = frame->linesize[0] * y0; -+ const unsigned int y_size = frame->linesize[0] * height; -+ // Round UV up/down to get everything -+ const unsigned int uv_rnd = (1U << uv_shift) >> 1; -+ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift); -+ const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset; -+ -+#if 0 -+ // *** frame->height is cropped height so not good -+ // As all unsigned they will also reject -ve -+ // Test individually as well as added to reject overflow -+ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped -+ av_assert0(n <= (unsigned int)frame->height); -+ av_assert0(start_line + n <= (unsigned int)frame->height); -+#endif -+ -+ if (!gpu_is_buf1(frame)) -+ { -+ if (do_luma) { -+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size); -+ } -+ if (do_chroma) { -+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size); -+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size); -+ } -+ } -+ else if (!av_rpi_is_sand_frame(frame)) -+ { -+ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); -+ if (do_luma) { -+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size); -+ } -+ if (do_chroma) { -+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size); -+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size); -+ } -+ } -+ else -+ { -+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); -+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); -+ const unsigned int xshl = av_rpi_sand_frame_xshl(frame); -+ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1); -+ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C -+ av_assert0(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX); -+ -+ if (do_chroma) -+ { -+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; -+ b->invalidate_mode = mode; -+ b->block_count = block_count; -+ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1); 
-+ b->block_size = uv_size; -+ b->inter_block_stride = stride1 * stride2; -+ } -+ if (do_luma) -+ { -+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; -+ b->invalidate_mode = mode; -+ b->block_count = block_count; -+ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0); -+ b->block_size = y_size; -+ b->inter_block_stride = stride1 * stride2; -+ } -+ } -+} -+ -+// Call this to clean and invalidate a region of memory -+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode) -+{ -+ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(); -+ rpi_cache_flush_add_gm_ptr(rfe, p, mode); -+ rpi_cache_flush_finish(rfe); -+} -+ -+ -+// ---------------------------------------------------------------------------- -+ -+ -+// Wait abstractions - mostly so we can easily add profile code -+static void vq_wait_pool_init(vq_wait_pool_t * const wp) -+{ -+ unsigned int i; -+ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) { -+ sem_init(&wp->pool[i].sem, 0, 0); -+ wp->pool[i].next = wp->pool + i + 1; -+ } -+ wp->head = wp->pool + 0; -+ wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL; -+} -+ -+static void vq_wait_pool_deinit(vq_wait_pool_t * const wp) -+{ -+ unsigned int i; -+ wp->head = NULL; -+ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) { -+ sem_destroy(&wp->pool[i].sem); -+ wp->pool[i].next = NULL; -+ } -+} -+ -+ -+// If sem_init actually takes time then maybe we want a pool... -+static vq_wait_t * vq_wait_new(void) -+{ -+ gpu_env_t * const ge = gpu_lock_ref(); -+ vq_wait_t * const wait = ge->wait_pool.head; -+ ge->wait_pool.head = wait->next; -+ wait->next = NULL; -+ -+#if RPI_TRACE_TIME_VPU_QPU_WAIT -+ tto_start(&ge->ttw.active, ns_time()); -+#endif -+ -+ gpu_unlock(); -+ return wait; -+} -+ -+static void vq_wait_delete(vq_wait_t * const wait) -+{ -+ gpu_env_t * const ge = gpu_lock(); -+ wait->next = ge->wait_pool.head; -+ ge->wait_pool.head = wait; -+ -+#if RPI_TRACE_TIME_VPU_QPU_WAIT -+ { -+ trace_time_wait_t * const ttw = &ge->ttw; -+ const int64_t now = ns_time(); -+ ++ttw->jcount; -+ tto_end(&ttw->wait, now); -+ -+ if (ttw->start0 == 0) -+ { -+ ttw->start0 = ttw->active.start[0]; -+ ttw->last_update = ttw->start0; -+ } -+ if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD) -+ { -+ ttw->last_update += WAIT_TIME_PRINT_PERIOD; -+ ttw_print(ttw, now); -+ } -+ } -+#endif -+ gpu_unlock_unref(ge); -+} -+ -+static void vq_wait_wait(vq_wait_t * const wait) -+{ -+#if RPI_TRACE_TIME_VPU_QPU_WAIT -+ { -+ const int64_t now = ns_time(); -+ gpu_env_t * const ge = gpu_lock(); -+ tto_start(&ge->ttw.wait, now); -+ gpu_unlock(); -+ } -+#endif -+ -+ while (sem_wait(&wait->sem) == -1 && errno == EINTR) -+ /* loop */; -+} -+ -+static void vq_wait_post(vq_wait_t * const wait) -+{ -+#if RPI_TRACE_TIME_VPU_QPU_WAIT -+ { -+ gpu_env_t *const ge = gpu_lock(); -+ tto_end(&ge->ttw.active, ns_time()); -+ gpu_unlock(); -+ } -+#endif -+ -+ sem_post(&wait->sem); -+} -+ -+ -+ -+// Header comments were wrong for these two -+#define VPU_QPU_MASK_QPU 1 -+#define VPU_QPU_MASK_VPU 2 -+ -+#define VPU_QPU_JOB_MAX 4 -+struct vpu_qpu_job_env_s -+{ -+ unsigned int n; -+ unsigned int mask; -+ struct gpu_job_s j[VPU_QPU_JOB_MAX]; -+}; -+ -+typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t; -+ -+vpu_qpu_job_env_t * vpu_qpu_job_new(void) -+{ -+ vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t)); -+ return vqj; -+} -+ -+void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj) -+{ -+ memset(vqj, 0, sizeof(*vqj)); -+ free(vqj); -+} -+ -+static inline struct gpu_job_s * 
new_job(vpu_qpu_job_env_t * const vqj) -+{ -+ struct gpu_job_s * const j = vqj->j + vqj->n++; -+ av_assert0(vqj->n <= VPU_QPU_JOB_MAX); -+ return j; -+} -+ -+void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code, -+ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5) -+{ -+ if (vpu_code != 0) { -+ struct gpu_job_s *const j = new_job(vqj); -+ vqj->mask |= VPU_QPU_MASK_VPU; -+ -+ j->command = EXECUTE_VPU; -+ // The bottom two bits of the execute address contain no-flush flags -+ // b0 will flush the VPU I-cache if unset so we nearly always want that set -+ // as we never reload code -+ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed; -+ j->u.v.q[1] = r0; -+ j->u.v.q[2] = r1; -+ j->u.v.q[3] = r2; -+ j->u.v.q[4] = r3; -+ j->u.v.q[5] = r4; -+ j->u.v.q[6] = r5; -+ gpu->vpu_i_cache_flushed = 1; -+ } -+} -+ -+// flags are QPU_FLAGS_xxx -+void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail) -+{ -+ if (n != 0) { -+ struct gpu_job_s *const j = new_job(vqj); -+ vqj->mask |= VPU_QPU_MASK_QPU; -+ -+ j->command = EXECUTE_QPU; -+ j->u.q.jobs = n; -+#if RPI_TRACE_QPU_PROFILE_ALL -+ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS; -+#else -+ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU; -+#endif -+ j->u.q.timeout = 5000; -+ memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t)); -+ } -+} -+ -+// Convert callback to sem post -+static void vpu_qpu_job_callback_wait(void * v) -+{ -+ vq_wait_post(v); -+} -+ -+void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h) -+{ -+ vq_wait_t * wait; -+ -+ if (vqj->mask == 0) { -+ *wait_h = NULL; -+ return; -+ } -+ -+ // We are going to want a sync object -+ wait = vq_wait_new(); -+ -+ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync -+ // If we only posted one thing or only QPU jobs -+ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU) -+ { -+ struct gpu_job_s * const j = vqj->j + (vqj->n - 1); -+ av_assert0(j->callback.func == 0); -+ -+ j->callback.func = vpu_qpu_job_callback_wait; -+ j->callback.cookie = wait; -+ } -+ else -+ { -+ struct gpu_job_s *const j = new_job(vqj); -+ -+ j->command = EXECUTE_SYNC; -+ j->u.s.mask = vqj->mask; -+ j->callback.func = vpu_qpu_job_callback_wait; -+ j->callback.cookie = wait; -+ } -+ -+ vqj->mask = 0; -+ *wait_h = wait; -+} -+ -+int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj) -+{ -+ return vqj->n == 0 ? 
0 : vc_gpuserv_execute_code(vqj->n, vqj->j); -+} -+ -+// Simple wrapper of start + delete -+int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj) -+{ -+ int rv; -+ rv = vpu_qpu_job_start(vqj); -+ vpu_qpu_job_delete(vqj); -+ return rv; -+} -+ -+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h) -+{ -+ if (wait_h != NULL) -+ { -+ vq_wait_t * const wait = *wait_h; -+ if (wait != NULL) { -+ *wait_h = NULL; -+ vq_wait_wait(wait); -+ vq_wait_delete(wait); -+ } -+ } -+} -+ -+int vpu_qpu_init() -+{ -+ gpu_env_t * const ge = gpu_lock_ref(); -+ if (ge == NULL) -+ return -1; -+ -+ if (ge->init_count++ == 0) -+ { -+ vc_gpuserv_init(); -+ } -+ -+ gpu_unlock(); -+ return 0; -+} -+ -+void vpu_qpu_term() -+{ -+ gpu_env_t * const ge = gpu_lock(); -+ -+ if (--ge->init_count == 0) { -+ vc_gpuserv_deinit(); -+ -+#if RPI_TRACE_TIME_VPU_QPU_WAIT -+ ttw_print(&ge->ttw, ns_time()); -+#endif -+ } -+ -+ gpu_unlock_unref(ge); -+} -+ -+uint32_t qpu_fn(const int * const mc_fn) -+{ -+ return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)rpi_shader) + offsetof(struct GPU, qpu_code); -+} -+ -+ -+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth) -+{ -+ // Dummy values we can catch with emulation -+ qf->y_pxx = ~1U; -+ qf->y_bxx = ~2U; -+ qf->y_p00 = ~3U; -+ qf->y_b00 = ~4U; -+ qf->c_pxx = ~5U; -+ qf->c_bxx = ~6U; -+ -+ switch (bit_depth) { -+ case 8: -+ qf->y_pxx = qpu_fn(mc_filter_y_pxx); -+ qf->y_pxx = qpu_fn(mc_filter_y_pxx); -+ qf->y_bxx = qpu_fn(mc_filter_y_bxx); -+ qf->y_p00 = qpu_fn(mc_filter_y_p00); -+ qf->y_b00 = qpu_fn(mc_filter_y_b00); -+ qf->c_pxx = qpu_fn(mc_filter_c_p); -+ qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1); -+ qf->c_bxx = qpu_fn(mc_filter_c_b); -+ break; -+ case 10: -+ qf->c_pxx = qpu_fn(mc_filter_c10_p); -+ qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1); -+ qf->c_bxx = qpu_fn(mc_filter_c10_b); -+ qf->y_pxx = qpu_fn(mc_filter_y10_pxx); -+ qf->y_bxx = qpu_fn(mc_filter_y10_bxx); -+ qf->y_p00 = qpu_fn(mc_filter_y10_p00); -+ qf->y_b00 = qpu_fn(mc_filter_y10_b00); -+ break; -+ default: -+ return -1; -+ } -+ return 0; -+} -+ -+#endif // RPI -diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h -new file mode 100644 -index 0000000000..9389047f8e ---- /dev/null -+++ b/libavcodec/rpi_qpu.h -@@ -0,0 +1,208 @@ -+#ifndef RPI_QPU_H -+#define RPI_QPU_H -+ -+#define RPI_ONE_BUF 1 -+ -+typedef struct gpu_mem_ptr_s { -+ unsigned char *arm; // Pointer to memory mapped on ARM side -+ int vc_handle; // Videocore handle of relocatable memory -+ int vcsm_handle; // Handle for use by VCSM -+ int vc; // Address for use in GPU code -+ int numbytes; // Size of memory block -+} GPU_MEM_PTR_T; -+ -+// General GPU functions -+extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p); -+extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p); -+extern void gpu_free(GPU_MEM_PTR_T * const p); -+ -+#include "libavutil/frame.h" -+#if !RPI_ONE_BUF -+static inline uint32_t get_vc_address_y(const AVFrame * const frame) { -+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[0]); -+ return p->vc; -+} -+ -+static inline uint32_t get_vc_address_u(const AVFrame * const frame) { -+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]); -+ return p->vc; -+} -+ -+static inline uint32_t get_vc_address_v(const AVFrame * const frame) { -+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[2]); -+ return p->vc; -+} -+ -+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) { -+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[0]); -+} -+ -+static inline GPU_MEM_PTR_T 
get_gpu_mem_ptr_u(const AVFrame * const frame) { -+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[1]); -+} -+ -+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) { -+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[2]); -+} -+ -+#else -+ -+static inline int gpu_is_buf1(const AVFrame * const frame) -+{ -+ return frame->buf[1] == NULL; -+} -+ -+static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame) -+{ -+ return av_buffer_get_opaque(frame->buf[0]); -+} -+ -+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n) -+{ -+ return av_buffer_pool_opaque(frame->buf[n]); -+} -+ -+static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n) -+{ -+ const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n); -+ return gm->vc + (frame->data[n] - gm->arm); -+} -+ -+ -+static inline uint32_t get_vc_address_y(const AVFrame * const frame) { -+ return get_vc_address3(frame, 0); -+} -+ -+static inline uint32_t get_vc_address_u(const AVFrame * const frame) { -+ return get_vc_address3(frame, 1); -+} -+ -+static inline uint32_t get_vc_address_v(const AVFrame * const frame) { -+ return get_vc_address3(frame, 2); -+} -+ -+#if 0 -+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) { -+ if (gpu_is_buf1(frame)) -+ { -+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame); -+ g.numbytes = frame->data[1] - frame->data[0]; -+ return g; -+ } -+ else -+ return *gpu_buf3_gmem(frame, 0); -+} -+ -+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) { -+ if (gpu_is_buf1(frame)) -+ { -+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame); -+ g.arm += frame->data[1] - frame->data[0]; -+ g.vc += frame->data[1] - frame->data[0]; -+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size -+ return g; -+ } -+ else -+ return *gpu_buf3_gmem(frame, 1); -+} -+ -+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) { -+ if (gpu_is_buf1(frame)) -+ { -+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame); -+ g.arm += frame->data[2] - frame->data[0]; -+ g.vc += frame->data[2] - frame->data[0]; -+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size -+ return g; -+ } -+ else -+ return *gpu_buf3_gmem(frame, 2); -+} -+#endif -+#endif -+ -+// Cache flush stuff -+ -+struct rpi_cache_flush_env_s; -+typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t; -+ -+rpi_cache_flush_env_t * rpi_cache_flush_init(void); -+// Free env without flushing -+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe); -+// Do the accumulated flush & clear but do not free the env -+int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe); -+// Do the accumulated flush & free the env -+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe); -+ -+typedef enum -+{ -+ RPI_CACHE_FLUSH_MODE_INVALIDATE = 1, -+ RPI_CACHE_FLUSH_MODE_WRITEBACK = 2, -+ RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3 -+} rpi_cache_flush_mode_t; -+ -+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode); -+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode, -+ const unsigned int offset, const unsigned int size); -+void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, -+ const unsigned int offset0, const unsigned int block_size, const 
unsigned int blocks, const unsigned int block_stride); -+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode); -+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode, -+ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, -+ const unsigned int uv_shift, const int do_luma, const int do_chroma); -+ -+// init, add, finish for one gm ptr -+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode); -+ -+ -+// QPU specific functions -+ -+typedef struct HEVCRpiQpu { -+ uint32_t c_pxx; -+ uint32_t c_pxx_l1; -+ uint32_t c_bxx; -+ uint32_t y_pxx; -+ uint32_t y_bxx; -+ uint32_t y_p00; -+ uint32_t y_b00; -+} HEVCRpiQpu; -+ -+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth); -+ -+uint32_t qpu_fn(const int * const mc_fn); -+ -+#define QPU_N_GRP 4 -+#define QPU_N_MAX 12 -+ -+#define QPU_MAIL_EL_VALS 2 -+ -+struct vpu_qpu_wait_s; -+typedef struct vq_wait_s * vpu_qpu_wait_h; -+ -+// VPU specific functions -+ -+struct vpu_qpu_job_env_s; -+typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h; -+ -+vpu_qpu_job_h vpu_qpu_job_new(void); -+void vpu_qpu_job_delete(const vpu_qpu_job_h vqj); -+void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code, -+ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5); -+void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail); -+void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h); -+int vpu_qpu_job_start(const vpu_qpu_job_h vqj); -+int vpu_qpu_job_finish(const vpu_qpu_job_h vqj); -+ -+extern unsigned int vpu_get_fn(const unsigned int bit_depth); -+extern unsigned int vpu_get_constants(void); -+ -+// Waits for previous post_codee to complete and Will null out *wait_h after use -+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h); -+int vpu_qpu_init(void); -+void vpu_qpu_term(void); -+ -+extern int gpu_get_mailbox(void); -+void gpu_ref(void); -+void gpu_unref(void); -+ -+#endif -diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c -new file mode 100644 -index 0000000000..2c6541a8fb ---- /dev/null -+++ b/libavcodec/rpi_shader.c -@@ -0,0 +1,1570 @@ -+#include "rpi_shader.h" -+ -+#ifdef _MSC_VER -+ #include -+ /* cast through uintptr_t to avoid warnings */ -+ #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X)) -+#else -+ #define POINTER_TO_UINT(X) ((unsigned int)(X)) -+#endif -+ -+#ifdef __cplusplus -+extern "C" { /* the types are probably wrong... 
*/ -+#endif -+#ifdef __cplusplus -+} -+#endif -+ -+#ifdef _MSC_VER -+__declspec(align(8)) -+#elif defined(__GNUC__) -+__attribute__((aligned(8))) -+#endif -+unsigned int rpi_shader[] = { ++unsigned int ff_hevc_rpi_shader[] = { +// ::mc_setup_c_q0 +// ::mc_start +/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) @@ -28869,2552 +17987,21710 @@ index 0000000000..2c6541a8fb +/* [0x00002db0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +// ::mc_end +}; -+#ifdef __HIGHC__ -+#pragma Align_to(8, rpi_shader) ++#ifdef __HIGHC__ ++#pragma Align_to(8, rpi_shader) ++#endif +diff --git a/libavcodec/rpi_hevc_shader.h b/libavcodec/rpi_hevc_shader.h +new file mode 100644 +index 0000000000..ddb351782d +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader.h +@@ -0,0 +1,63 @@ ++#ifndef rpi_hevc_shader_H ++#define rpi_hevc_shader_H ++ ++extern unsigned int ff_hevc_rpi_shader[]; ++ ++#define mc_setup_c_q0 (ff_hevc_rpi_shader + 0) ++#define mc_start (ff_hevc_rpi_shader + 0) ++#define mc_setup_c_qn (ff_hevc_rpi_shader + 2) ++#define mc_filter_c_p (ff_hevc_rpi_shader + 142) ++#define mc_filter_c_p_l1 (ff_hevc_rpi_shader + 272) ++#define mc_filter_c_b (ff_hevc_rpi_shader + 402) ++#define mc_sync_q0 (ff_hevc_rpi_shader + 590) ++#define mc_sync_q1 (ff_hevc_rpi_shader + 608) ++#define mc_sync_q2 (ff_hevc_rpi_shader + 620) ++#define mc_sync_q3 (ff_hevc_rpi_shader + 632) ++#define mc_sync_q4 (ff_hevc_rpi_shader + 644) ++#define mc_sync_q5 (ff_hevc_rpi_shader + 662) ++#define mc_sync_q6 (ff_hevc_rpi_shader + 674) ++#define mc_sync_q7 (ff_hevc_rpi_shader + 686) ++#define mc_sync_q8 (ff_hevc_rpi_shader + 698) ++#define mc_sync_q9 (ff_hevc_rpi_shader + 716) ++#define mc_sync_q10 (ff_hevc_rpi_shader + 728) ++#define mc_sync_q11 (ff_hevc_rpi_shader + 740) ++#define mc_exit_c_qn (ff_hevc_rpi_shader + 752) ++#define mc_exit_y_qn (ff_hevc_rpi_shader + 752) ++#define mc_exit_c_q0 (ff_hevc_rpi_shader + 770) ++#define mc_exit_y_q0 (ff_hevc_rpi_shader + 770) ++#define mc_setup_y_q0 (ff_hevc_rpi_shader + 790) ++#define mc_setup_y_qn (ff_hevc_rpi_shader + 792) ++#define mc_filter_y_pxx (ff_hevc_rpi_shader + 1032) ++#define mc_filter_y_bxx (ff_hevc_rpi_shader + 1162) ++#define mc_filter_y_p00 (ff_hevc_rpi_shader + 1292) ++#define mc_filter_y_b00 (ff_hevc_rpi_shader + 1382) ++#define mc_setup_c10_q0 (ff_hevc_rpi_shader + 1462) ++#define mc_setup_c10_qn (ff_hevc_rpi_shader + 1464) ++#define mc_filter_c10_p (ff_hevc_rpi_shader + 1600) ++#define mc_filter_c10_p_l1 (ff_hevc_rpi_shader + 1728) ++#define mc_filter_c10_b (ff_hevc_rpi_shader + 1856) ++#define mc_sync10_q0 (ff_hevc_rpi_shader + 2042) ++#define mc_sync10_q1 (ff_hevc_rpi_shader + 2060) ++#define mc_sync10_q2 (ff_hevc_rpi_shader + 2072) ++#define mc_sync10_q3 (ff_hevc_rpi_shader + 2084) ++#define mc_sync10_q4 (ff_hevc_rpi_shader + 2096) ++#define mc_sync10_q5 (ff_hevc_rpi_shader + 2114) ++#define mc_sync10_q6 (ff_hevc_rpi_shader + 2126) ++#define mc_sync10_q7 (ff_hevc_rpi_shader + 2138) ++#define mc_sync10_q8 (ff_hevc_rpi_shader + 2150) ++#define mc_sync10_q9 (ff_hevc_rpi_shader + 2168) ++#define mc_sync10_q10 (ff_hevc_rpi_shader + 2180) ++#define mc_sync10_q11 (ff_hevc_rpi_shader + 2192) ++#define mc_exit_c10_q0 (ff_hevc_rpi_shader + 2204) ++#define mc_exit_y10_q0 (ff_hevc_rpi_shader + 2204) ++#define mc_exit_c10_qn (ff_hevc_rpi_shader + 2224) ++#define mc_exit_y10_qn (ff_hevc_rpi_shader + 2224) ++#define mc_setup_y10_q0 (ff_hevc_rpi_shader + 2242) ++#define mc_setup_y10_qn (ff_hevc_rpi_shader + 2244) ++#define mc_filter_y10_pxx (ff_hevc_rpi_shader + 2494) 
++#define mc_filter_y10_p00 (ff_hevc_rpi_shader + 2624)
++#define mc_filter_y10_bxx (ff_hevc_rpi_shader + 2716)
++#define mc_filter_y10_b00 (ff_hevc_rpi_shader + 2846)
++#define mc_end (ff_hevc_rpi_shader + 2926)
++
++#endif
+diff --git a/libavcodec/rpi_hevc_shader.qasm b/libavcodec/rpi_hevc_shader.qasm
+new file mode 100644
+index 0000000000..f8572cdebe
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader.qasm
+@@ -0,0 +1,1741 @@
++
++# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
++# the warning that we are using rotation & ra/rb registers. r0..3 can be
++# rotated through all 16 elems; ra regs can only be rotated through their
++# local 4. As it happens this is what is wanted here as we do not want the
++# constants from the other half of the calc.
++#
++# PREREAD is the number of requests that we have sitting in the TMU request
++# queue.
++#
++# There are 8 slots available in the TMU request Q for tm0s requests, but
++# only 4 output FIFO entries and overflow is bad (corruption or crash)
++# (If threaded then only 2 out FIFO entries, but we aren't.)
++# In s/w we are effectively limited to the min vertical read which is >= 4
++# so output FIFO is the limit.
++#
++# However in the current world there seems to be no benefit (and a small
++# overhead) in setting this bigger than 2.
++
++.set PREREAD, 4
++
++# Block heights - 8 & 16 are the only numbers we currently support
++
++.set C_BLK_HEIGHT_8, 16
++.set C_BLK_HEIGHT_16, 8
++.set Y_BLK_HEIGHT_8, 16
++.set Y_BLK_HEIGHT_16, 8
++
++# QPU counts - depend on block size
++# If we have a 2-byte format & block_size > 8 then can only afford
++# 8 QPUs
++# These numbers must match the numbers in ff_hevc_rpi_shader_cmd.h
++
++.set N_QPU_8, 12
++.set N_QPU_16, 12
++
++# register allocation
++#
++
++# ra0-3
++# Used as temp and may be loop filter coeffs (split into .8s)
++# or temp in loop. Check usage on an individual basis.
++
++# ra4-7
++# C: L0 H filter out FIFO
++# otherwise -- free --
++
++# ra8-11
++# temp in some places - check usage
++# Y: (with rb8-11) horiz out FIFO
++
++# ra12-15
++# -- free --
++
++# uniform: width:height
++.set ra_width_height, ra16
++.set ra_width, ra16.16b
++.set ra_height, ra16.16a
++
++# y:y2 same layout as y_y2_next so we can update both together
++.set ra_y_y2, ra17
++.set ra_y2, ra17.16a
++.set ra_y, ra17.16b
++
++# uniform: L1 weight (U on left, V on right)
++# Only used in Y B
++.set ra_wt_off_mul_l1, ra18
++.set ra_wt_off_l1, ra18.16b
++.set ra_wt_mul_l1, ra18.16a
++
++# y_next:y2_next same layout as y_y2 so we can update both together
++.set ra_y_y2_next, ra19
++.set ra_y_next, ra19.16b
++.set ra_y2_next, ra19.16a
++
++# Setup: consts - subdivide a single register
++.set ra_kff100100, ra20
++.set ra_k256, ra20.16a
++.set ra_k0, ra20.8a
++.set ra_k1, ra20.8b
++.set ra_k16, ra20.8c
++.set ra_k255, ra20.8d
++
++# Loop: xshifts
++.set ra_xshift, ra21.16a
++.set ra_xshift_next, ra21.16b
++
++# Loop var: L0 weight (U on left, V on right)
++# _off_ is not used in loop as we want to modify it before use
++.set ra_wt_off_mul_l0, ra22
++.set ra_wt_mul_l0, ra22.16a
++.set ra_wt_off_l0, ra22.16b
++
++# Max pel value (for 8 bit we can get away with sat ops but not 9+)
++# * Could merge with rb_pmask. For 10 bit, logically pmask needs 0xff in the
++# 2nd byte, but as the source should never be > 3 there, 0x3ff should do
++.set ra_blk_height_pmax, ra23
++.set ra_pmax, ra23.16a
++.set ra_blk_height, ra23.8c
++# -- free -- ra23.8d
++
++# Loop: src frame base (L0)
++.set ra_base, ra24
++
++# Loop: src frame base (L1)
++.set ra_base2, ra25
++
++# Loop: next src frame base (L0)
++.set ra_base_next, ra26
++
++# -- free -- ra27
++# -- free -- ra28
++# -- free -- ra29
++
++# Use an even numbered register as a link register to avoid corrupting flags
++.set ra_link, ra30
++
++# -- free -- ra31
++
++.set rb_xshift2, rb0
++.set rb_xshift2_next, rb1
++
++# C: (elem & 1) == 0 ? elem * 2 : (elem + 4) * 2
++.set rb_elem_x, rb2
++
++# El Flags
++# After adding to self we have el even/odd on nc/c and lo/hi on nn/n
++.set rb_ef, rb3
++
++# rb4-7
++# C-B: L1 H filter out FIFO
++# Y: (with ra2.8x) Y vertical filter coeffs
++
++# rb8-11
++# C: Vertical filter coeffs
++# Y: (with ra8-11) horiz out FIFO
++
++# Loop var: offset to add before shift (round + weighting offsets)
++# Exact value varies by loop
++.set rb_wt_off, rb12
++
++# Setup: denom + 6 + 9
++.set rb_wt_den_p15, rb13
++
++# -- free -- rb14
++# -- free -- rb15
++
++# Line pitch (128 for sand128)
++.set rb_pitch, rb16
++
++# Loop count - 2 (set up TMU for next xfer)
++.set rb_i_tmu, rb17
++
++# Loop count for min(height, 16)
++# Y will reset & loop again if height > 16
++.set rb_lcount, rb18
++
++# frame_base2_next
++.set rb_base2_next, rb19
++
++# Setup: Height of Y+C in sand, (x&mask)*xpitch will give
++# offset to the slice
++.set rb_xpitch, rb20
++
++# -- free -- rb21
++
++# Setup: 0xff (8-bit) / 0xffff (9+ bit)
++.set rb_pmask, rb22
++
++# Loop: destination address
++.set rb_dest, rb23
++
++# vdw_setup_1(dst_pitch)
++.set rb_dma1_base, rb24
++
++# Setup: pic width - 1
++# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width - 1)*4 etc.
++.set rb_max_x, rb25
++
++# Loop: height<<23 + width<<16 + vdw_setup_0
++.set rb_dma0, rb26
++
++# vdw_setup_0 (depends on QPU number)
++.set rb_dma0_base, rb27
++
++# Setup: vw_setup value to reset VPM write pointer
++.set rb_vpm_init, rb28
++
++# Loop: vdw_setup_1(dst_pitch-width) = stride
++.set rb_dma1, rb29
++
++# Setup: pic_height - 1
++.set rb_max_y, rb30
++
++# -- free -- rb31
++
++
++
++
++# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
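++# For example, i_shift16 below is written as -16 because (-16) & 31 == 16;
++# likewise (-11) & 31 == 21, (-9) & 31 == 23 and (-2) & 31 == 30, which is
++# where i_shift21, i_shift23 and i_shift30 get their values.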
++.set i_shift16, -16
++.set i_shift21, -11
++.set i_shift23, -9
++.set i_shift30, -2
++
++# Much of the setup code is common between Y & C
++# Macros that express this - obviously these can't be overlapped
++# so are probably unsuitable for loop code
++
++.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma
++ mov r2, qpu_num
++.if v_bit_depth <= 8
++ # 8 bit version
++ asr r1, r2, 2
++ shl r1, r1, 6
++ and r0, r2, 3
++ or r0, r0, r1
++
++ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
++ add r_vpm, r0, r1 # VPM 8bit storage
++
++ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
++ shl r0, r0, 5
++
++.else
++ # 16 bit version
++ # Limited to 8 QPUs if blk height > 8
++ asr r1, r2, 1
++.if v_blk_height <= 8
++ shl r1, r1, 4
++.else
++ shl r1, r1, 5
++.endif
++ and r0, r2, 1
++ or r0, r0, r1
++
++ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR
++ add r_vpm, r0, r1
++
++ # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into
++ # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg)
++ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later
++ shl r0, r0, 6
++.endif
++ add r_dma, r0, r1 # DMA out
++.endm
++
++
++.macro m_setup_q0
++ srel -, 12
++.endm
++
++# Code start label
++::mc_start
++
++################################################################################
++# mc_setup_uv(next_kernel, x, y, ref_c_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
++
++.macro m_setup_c, v_bit_depth
++
++# Cannot use mul24 on x as x might be -ve, so must use shift
++.if v_bit_depth <= 8
++.set v_x_shift, 1
++.set v_pmask, 0xff
++.set v_blk_height, C_BLK_HEIGHT_8
++.else
++.set v_x_shift, 2
++.set v_pmask, 0xffff
++.set v_blk_height, C_BLK_HEIGHT_16
++.endif
++
++ mov tmurs, 1 # No swap TMUs
++
++# Load first request location
++ mov ra0, unif # next_x_y
++
++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++ shl rb_ef, r0, i_shift30
++
++ mov ra_base, unif # Store frame c base
++
++# Read image dimensions
++ sub r0, unif, 1 # pic c width
++ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes
++ sub rb_max_y, unif, 1 # pic c height
++
++# load constants
++ mov ra_kff100100, 0xff100100
++ mov rb_pmask, v_pmask
++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++
++# get source pitch
++ mov rb_xpitch, unif # stride2
++ mov rb_pitch, unif # stride1
++ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly
++ add rb_dma1_base, r1, rb_pitch # vdw_setup_1
++
++ and r0, 1, elem_num
++ nop ; mul24 r0, r0, 5
++.if v_bit_depth <= 8
++ add rb_elem_x, r0, elem_num
++.else
++ add r0, r0, elem_num
++ add rb_elem_x, r0, r0
++.endif
++
++# Compute base address for first and second access
++# ra_base ends up with t0s base
++# ra_base2 ends up with t1s base
++
++ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay]
++ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice
++ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y
++ min r0, r0, rb_max_x
++
++# Get shift
++# Shift will always calculate as 0 for 9+ bit
++# Ideally we can optimize the shift out of the code in these cases but for now
++# it is tidier to leave it in
++.if v_bit_depth <= 8
++ shl ra_xshift_next, r0, 3
++.else
++ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0
++.endif
++
++# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to
++
++.if v_bit_depth <= 8
++ and r0, r0, -4
++.endif
++ sub r1, ra_k0, rb_pitch
++ and r1, r0, r1
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1
++ add ra_base, ra_base, r0
++
++ add rb_wt_den_p15, 23 - v_bit_depth, unif # denominator
++
++# Compute part of VPM to use for DMA output
++# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop?
++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
++
++# And again for L1, but only worrying about frame2 stuff
++
++# Load first request location
++ mov ra0, unif # next_x_y
++
++ mov ra_base2, unif # [ra0 delay] Store frame c base
++
++# Compute base address for first and second access
++# ra_base ends up with t0s base
++# ra_base2 ends up with t1s base
++
++ shl r0, ra0.16b, v_x_shift
++ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset
++ max r0, r0, 0
++ min r0, r0, rb_max_x
++
++# Get shift (already zero if 9+ bit so ignore)
++.if v_bit_depth <= 8
++ shl rb_xshift2_next, r0, 3
++.endif
++
++# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs
++
++.if v_bit_depth <= 8
++ and r0, r0, -4
++.endif
++ sub r1, ra_k0, rb_pitch
++ and r1, r0, r1
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov r2, ra_y2
++ add ra_base2, ra_base2, r0
++
++# Do preloads
++# r0 = ra_y, r2 = ra_y2
++ mov r3, PREREAD ; mov r0, ra_y
++
++:1
++ sub.setf r3, r3, 1
++ max r1, r0, 0
++ min r1, r1, rb_max_y
++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++ add t0s, ra_base, r1 ; mov ra_y, r0
++
++ max r1, r2, 0
++ brr.anynz -, r:1b
++ min r1, r1, rb_max_y
++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++ add t1s, ra_base2, r1 ; mov ra_y2, r2
++# >>> .anynz 1b
++
++ mov ra_link, unif # link
++# touch registers to keep simulator happy
++ # ra/b4..7: B0 -> B stash registers
++ mov ra4, 0 ; mov rb4, 0
++ bra -, ra_link
++ mov ra5, 0 ; mov rb5, 0
++ mov ra6, 0 ; mov rb6, 0
++ mov ra7, 0 ; mov rb7, 0
++# >>> ra_link
++.endm
++
++::mc_setup_c_q0
++ m_setup_q0
++::mc_setup_c_qn
++ m_setup_c 8
++
++################################################################################
++
++# mc_filter_uv(next_kernel, x, y, frame_c_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst)
++
++# At this point we have already issued two pairs of texture requests for the current block
++# ra_x, ra_x16_base point to the current coordinates for this block
++
++.macro m_filter_c_p, v_tmu, v_bit_depth
++
++.if v_bit_depth <= 8
++.set v_x_shift, 1
++.set v_x_mul, 2
++.set v_v_shift, 8
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 7
++.set v_dma_wh_shift, i_shift16
++.else
++.set v_x_shift, 2
++.set v_x_mul, 4
++.set v_v_shift, i_shift16
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 8
++.set v_dma_wh_shift, 15
++.endif
++
++.if v_tmu == 0
++.set vrx_xshift, rb_xshift2 # b side more convenient
++.set vrx_xshift_next, ra_xshift_next
++.set vra_y_next, ra_y_next
++.set vrx_base_next, ra_base_next
++.set vra_y, ra_y
++.set vra_base, ra_base
++.set vr_txs, t0s
++.else
++.set vrx_xshift, ra_xshift # a side more convenient
++.set vrx_xshift_next, rb_xshift2_next
++.set vra_y_next, ra_y2_next
++.set vrx_base_next, rb_base2_next
++.set vra_y, ra_y2
++.set vra_base, ra_base2
++.set vr_txs, t1s
++.endif
++
++# per-channel shifts were calculated on the *previous* invocation
++# get base addresses and per-channel shifts for *next* invocation
++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
++
++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base
++
++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0
++ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
++ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs
++ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
++
++.if v_bit_depth <= 8
++ shl vrx_xshift_next, r0, 3
++ and r0, r0, -4
++.endif
++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced!
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs
++ add vrx_base_next, r3, r0 ; mov r1, ra_height
++
++# set up VPM write
++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight
++ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
++ add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight
++
++# ; unpack filter coefficients
++
++ shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
++ add r0, r0, r2 ; mov rb9, ra3.8b # Combine width and height of destination area (r0=h<<8, r2=w*2)
++ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register
++ add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight
++
++ mov rb_dest, unif ; mov ra9, rb_max_y # dst_addr ; alias rb_max_y
++
++ shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
++
++ asr rb_wt_off, r1, 2 ; mov ra_link, unif # ; Link
++ sub ra3, rb_wt_den_p15, ra_k1
++
++# r5 = 0 (loop counter)
++# ra9 = alias for rb_max_y
++# ra_wt_mul_l0 = weight L0
++# ra3 = weight denom + 22 - bit_depth [= rb_wt_den_p15 - 1, max 19]
++# rb_wt_off = (offset * 2 + 1) << (ra3 - 1)
++
++# We want (r0r1)
++# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2V5 : ...
++# We fetch (after shift)
++# C0 : C3 : C1 : C4 : C2 : C5 : ...
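++#
++# (Overview of the loop below, for orientation: each pass collects one row
++# of texels from the TMU and queues a fetch for a later row, runs the 4-tap
++# horizontal filter, pushes the result into the 4-deep FIFO ra4-7 on which
++# the vertical filter (coeffs rb8-11) operates, then weights, offsets and
++# clamps the output to [0, pmax] before writing it to the VPM for DMA out.)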
++
++:1
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++
++.if v_tmu == 0
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment
++ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
++.else
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 # loop counter increment
++ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
++.endif
++
++ add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++ min r3, r3, ra9 ; mov.ifnc r0, r2
++
++ mov ra4, ra5 ; mul24 r2, r3, rb_pitch
++ add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
++
++# apply horizontal filter
++# The filter coeffs for the two halves of this are the same (unlike in the
++# Y case) so it doesn't matter which ra0 we get them from
++# Also as the two halves are locked together we don't need to separate the 1st
++# r0 mul or the last r1 mul as they are valid for all QPUs
++
++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++ sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
++
++# V filter = -ra4 * rb8 + ra5 * rb9 + ra6 * rb10 - ra7 * rb11 (post FIFO shift)
++# Have to dup block as we need to move the brr - code is more common than it
++# looks at first glance
++.if v_bit_depth <= 8
++ brr.anyn -, r:1b
++ add r2, r2, r3 ; mov ra5, ra6
++ mov ra6, ra7 ; mul24 r1, ra7, rb10
++ sub ra7, r2, r0 ; mul24 r0, ra4, rb8
++.else
++ add r2, r2, r3 ; mov ra5, ra6
++ brr.anyn -, r:1b
++ mov ra6, ra7 ; mul24 r1, ra7, rb10
++ sub r2, r2, r0 ; mul24 r0, ra4, rb8
++ asr ra7, r2, v_bit_depth - 8
++.endif
++# >>> .anyn 1b
++
++ sub r1, r1, r0 ; mul24 r0, ra5, rb9 # [ra7 delay]
++ add r1, r1, r0 ; mul24 r0, ra7, rb11
++ sub r1, r1, r0
++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
++ asr r1, r1, 14
++ nop ; mul24 r1, r1, ra_wt_mul_l0
++ shl r1, r1, 8 ; mov r3, ra_blk_height
++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
++ brr.anyn -, r:1b
++ asr r1, r1, ra3
++ min r1, r1, ra_pmax ; mov -, vw_wait
++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++# >>> .anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height (currently always 16)
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc rb_dma0, rb_lcount based on new segment height
++
++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
++ shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++ add rb_lcount, rb_lcount, r0
++ brr -, r:1b
++ add rb_dma0, rb_dma0, r1
++ add rb_dest, rb_dest, r2
++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
++# >>> 1b
++.endm
++
++# At 10 bits
++# Worst case +ve after 1st filter = 74 * 0x3ff >> 2 = 18925 = 0x49ed (15 bits)
++# Worst case -ve after 1st filter = -10 * 0x3ff >> 2 = -10230
++# after 2nd (really we can't get this) = 74 * 18925 + 10 * 10230 >> 6 = 23480 = 0x5bb8 (15 bits)
++# (P)
++# * weight (255) = 5987400 = 0x5b5c48 (23 bits)
++# + 0x3ff << (13 - bit_depth + 7) = 0x6b5848 (23 bits)
++# ... should be OK
++#
++# (B)
++# *2 (L0+L1) = 5963920 = 0x5b0090 (23 bits)
++# + (offset * 2 + 1) << (15 - bit_depth + 7) = 5963920 + (0x3ff << 12) = 5963920 + 4190208 = 10154128 = 0x9af090 (24 bits)
++# So signed overflow if we sign extend here :-(
++#
++# In practice this doesn't happen (we need a maximal offset and a very unlucky
++# filter).
++#
++# This could be fixed by offsetting the filters s.t. they are unsigned until
++# weight mul and then removing the offset with the weighting offset (I think
++# this should work) or splitting the rounding & offsetting
++
++::mc_filter_c_p
++ m_filter_c_p 0, 8
++
++::mc_filter_c_p_l1
++ m_filter_c_p 1, 8
++
++################################################################################
++
++# mc_filter_c_b
++
++# At this point we have already issued two pairs of texture requests for the current block
++# ra_x, ra_x16_base point to the current coordinates for this block
++
++.macro m_filter_c_b, v_bit_depth
++
++.if v_bit_depth <= 8
++.set v_x_shift, 1
++.set v_v_shift, 8
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 7
++.set v_dma_wh_shift, i_shift16
++.else
++.set v_x_shift, 2
++.set v_v_shift, i_shift16
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 8
++.set v_dma_wh_shift, 15
++.endif
++.set v_x_mul, (1 << v_x_shift)
++
++# per-channel shifts were calculated on the *previous* invocation
++
++# get base addresses and per-channel shifts for *next* invocation
++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
++
++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base
++
++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0
++ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
++ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++ min r0, r0, rb_max_x ; mov ra0, unif # L0 H filter coeffs
++
++.if v_bit_depth <= 8
++ shl ra_xshift_next, r0, 3
++.endif
++
++ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs
++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs)
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height
++ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B
++
++# set up VPM write
++
++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight
++ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
++ add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif # ; V weight
++
++ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2
++ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base
++ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register
++ add rb_dma0, r0, rb_dma0_base ; mov ra1, unif # ; H filter coeffs
++
++# L1 - uniform layout could possibly be optimized
++
++ shl r0, ra3.16b, v_x_shift # r0=x*2
++ add r0, r0, rb_elem_x ; mov ra3, unif # ; V filter coeffs
++ sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif # [ra3 delay] r1=pitch2 mask ; U offset/weight
++ max r0, r0, r5 ; mov rb8, ra3.8a # ; start unpacking filter coeffs
++ min r0, r0, rb_max_x ; mov rb9, ra3.8b
++
++.if v_bit_depth <= 8
++
++::mc_filter_c_p
++  m_filter_c_p 0, 8
++
++::mc_filter_c_p_l1
++  m_filter_c_p 1, 8
++
++################################################################################
++
++# mc_filter_c_b
++
++# At this point we have already issued two pairs of texture requests for the current block
++# ra_x, ra_x16_base point to the current coordinates for this block
++
++.macro m_filter_c_b, v_bit_depth
++
++.if v_bit_depth <= 8
++.set v_x_shift, 1
++.set v_v_shift, 8
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 7
++.set v_dma_wh_shift, i_shift16
++.else
++.set v_x_shift, 2
++.set v_v_shift, i_shift16
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 8
++.set v_dma_wh_shift, 15
++.endif
++.set v_x_mul, (1 << v_x_shift)
++
++# per-channel shifts were calculated on the *previous* invocation
++
++# get base addresses and per-channel shifts for *next* invocation
++  mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
++
++  add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base
++
++  shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0
++  add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
++  sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
++  max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++  min r0, r0, rb_max_x ; mov ra0, unif # L0 H filter coeffs
++
++.if v_bit_depth <= 8
++  shl ra_xshift_next, r0, 3
++.endif
++
++  and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs
++  and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs)
++  xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height
++  add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B
++
++# set up VPM write
++
++  sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight
++  add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
++  add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif # ; V weight
++
++  shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2
++  add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base
++  shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register
++  add rb_dma0, r0, rb_dma0_base ; mov ra1, unif # ; H filter coeffs
++
++# L1 - uniform layout could possibly be optimized
++
++  shl r0, ra3.16b, v_x_shift # r0=x*2
++  add r0, r0, rb_elem_x ; mov ra3, unif # ; V filter coeffs
++  sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif # [ra3 delay] r1=pitch2 mask ; U offset/weight
++  max r0, r0, r5 ; mov rb8, ra3.8a # ; start unpacking filter coeffs
++  min r0, r0, rb_max_x ; mov rb9, ra3.8b
++
++.if v_bit_depth <= 8
++  shl rb_xshift2_next, r0, 3
++.endif
++
++  and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif # ; V offset/weight
++  and r1, r0, r1 ; mov rb10, ra3.8c
++  xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1 ; mov rb_dest, unif # Add stripe offsets ; dst_addr
++  add rb_base2_next, r3, r0
++
++  mov ra9, rb_max_y ; mov rb11, ra3.8d
++  shl r1, ra_wt_off_l1, rb_wt_den_p15
++  asr rb_wt_off, r1, 9 ; mov ra_link, unif # link
++
++# r5 loop counter
++# ra0 H coeffs L0
++# ra1 H coeffs L1
++# ra2 V coeffs L0
++# ra3 temp
++# ra4-7 L0 H FIFO
++# rb4-7 L1 H FIFO
++# rb8-rb11 V coeffs L1
++# ra9 rb_max_y alias
++
++:1
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++  sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment
++  shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next
++  shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
++  add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
++  add ra_y, 1, ra_y ; mov r3, ra_y
++
++  max r3, r3, ra_k0 ; mov r0, r1 << 15
++  min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++
++  mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++  add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
++
++# L0 H-filter
++# H FIFO scrolls are spread all over this loop
++  mov rb4, rb5 ; mov ra4, ra5 # ? Just moves
++
++  and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++  nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++  nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++  sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++  nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++  add r2, r2, r3 ; mul24 r3, ra0.8d, r1
++.if v_bit_depth <= 8
++  sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1
++.else
++  sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1
++  asr ra3, r2, (v_bit_depth - 8)
++.endif
++
++  shr r2, r4, rb_xshift2 ; mov ra5, ra6
++  shr r1, r2, v_v_shift ; mov r3, ra_y2
++  add ra_y2, r3, ra_k1 ; mov rb6, rb7
++
++  max r3, r3, ra_k0 ; mov r0, r1 << 15
++  min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
++
++  mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
++  add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
++
++# L1 H-filter
++
++  and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0
++  nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0
++  nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
++  sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
++  nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++  sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1
++# V filters - start in branch delay slots of H
++# Final asr not needed for 8-bit but we can't (currently) save a whole instruction
++  add r2, r2, r3 ; mul24 r1, rb5, ra2.8b
++  brr.anyn -, r:1b
++  mov ra6, ra7 ; mul24 r3, ra7, rb10
++  sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a
++  asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3
++# >>> .anyn 1b
++
++  sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c # [rb7 delay]
++  add r1, r1, r0 ; mul24 r0, rb7, ra2.8d
++  sub r2, r1, r0 ; mul24 r0, ra4, rb8
++  sub r1, r3, r0 ; mul24 r0, ra5, rb9
++  add r1, r1, r0 ; mul24 r0, ra7, rb11
++  sub r1, r1, r0 ; mul24 r2, r2, ra_k256
++
++  asr r2, r2, 14 ; mul24 r1, r1, ra_k256
++  asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0
++
++  add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 # rb_wt_off = (offsetL0 + offsetL1 + 1) << (rb_wt_den_p15 - 9)
++  add r1, r1, r2 ; mov r3, ra_blk_height
++
++  sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend
++
++  brr.anyn -, r:1b
++  asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3
++  min r1, r1, ra_pmax ; mov -, vw_wait
++  max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++# >>> .anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height (currently always 16)
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc rb_dma0, rb_lcount based on new segment height
++
++  mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++
++# DMA out
++  bra.anyz -, ra_link
++  min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++  sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
++  shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++  add rb_lcount, rb_lcount, r0
++  brr -, r:1b
++  add rb_dma0, rb_dma0, r1
++  add rb_dest, rb_dest, r2
++  mov vw_setup, rb_vpm_init # Reset our VDM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_c_b
++  m_filter_c_b 8
++
++################################################################################
++# Exit code used by both Luma & Chroma so place between them to avoid I-cache
++# conflicts
++
++.macro m_exit_drain
++.if PREREAD == 2
++# Special case 2 as loop is wasteful
++  nop ; nop ; ldtmu0
++  nop ; nop ; ldtmu1
++  nop ; nop ; ldtmu0
++  mov -, vw_wait ; nop ; ldtmu1
++.else
++  mov.setf r3, PREREAD - 1
++:1
++  brr.anynz -, r:1b
++  nop ; nop ; ldtmu0
++  nop ; nop ; ldtmu1
++  sub.setf r3, r3, 1
++  # >>>
++  mov -, vw_wait
++.endif
++.endm
++
++# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair)
++# All qpus start at the beginning and after that (group - 1) must have finished
++# before (group) can start
++#
++# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain
++# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important -
++# lockup otherwise)
++#
++# There is some, currently ill-defined, potential lockup if we have the VDM active
++# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDM pipe too ??
++#
++# The code stalled when I had many waiters on a single sem so we have a
++# "ripple" of srels to restart. Unsure why; it may have been a bug, but this works
++# and we currently have both the memory & sems to support it.
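++#
++# As a hypothetical host-side illustration (not part of the build), the
++# per-QPU semaphore wiring produced by the .set arithmetic in m_sync_q
++# below can be tabulated in C:
++#
++#   #include <stdio.h>
++#   int main(void) {
++#       const int n_quads = 3;                 /* 12 QPUs, one group per TMU pair */
++#       for (int q = 0; q < n_quads * 4; q++) {
++#           int sem_sync = q - (q % 4);        /* the quad leader's sem */
++#           int sem_out = q + 1;
++#           if (q % 4 == 0) {                  /* quad leader */
++#               int quad_in = 12 + q / 4;
++#               int quad_out = 12 + ((q / 4 + 1) % n_quads);
++#               printf("QPU %2d: sacq sem %d x3, sacq sem %d, srel sem %d, srel sem %d\n",
++#                      q, sem_sync, quad_in, sem_out, quad_out);
++#           } else if (sem_out % 4 != 0) {     /* mid-quad member */
++#               printf("QPU %2d: srel sem %d, sacq sem %d, srel sem %d\n",
++#                      q, sem_sync, q, sem_out);
++#           } else {                           /* last member of the quad */
++#               printf("QPU %2d: srel sem %d, sacq sem %d\n", q, sem_sync, q);
++#           }
++#       }
++#       return 0;
++#   }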
++
++.macro m_sync_q, n_qpu, n_quads
++# Do not generate code for qpu >= quads * 4 - fns should never be called
++.if n_qpu < n_quads * 4
++  mov ra_link, unif # Can only branch to an a reg (not r0)
++  mov -, vw_wait # [ra_link delay]
++
++.set n_sem_sync, n_qpu - (n_qpu % 4)
++.set n_sem_in, n_qpu
++.set n_sem_out, n_qpu + 1
++
++.if n_qpu % 4 == 0
++
++.set n_sem_quad_in, 12 + n_qpu / 4
++.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads)
++
++  sacq -, n_sem_sync
++  sacq -, n_sem_sync
++  sacq -, n_sem_sync
++  bra -, ra_link
++  sacq -, n_sem_quad_in
++  srel -, n_sem_out
++  srel -, n_sem_quad_out
++
++.else
++  bra -, ra_link
++  srel -, n_sem_sync
++  sacq -, n_sem_in
++.if n_sem_out % 4 != 0
++  srel -, n_sem_out
++.else
++  nop
++.endif
++.endif
++.endif
++.endm
++
++.set v_quads8, N_QPU_8 / 4
++
++::mc_sync_q0
++  m_sync_q 0, v_quads8
++::mc_sync_q1
++  m_sync_q 1, v_quads8
++::mc_sync_q2
++  m_sync_q 2, v_quads8
++::mc_sync_q3
++  m_sync_q 3, v_quads8
++::mc_sync_q4
++  m_sync_q 4, v_quads8
++::mc_sync_q5
++  m_sync_q 5, v_quads8
++::mc_sync_q6
++  m_sync_q 6, v_quads8
++::mc_sync_q7
++  m_sync_q 7, v_quads8
++::mc_sync_q8
++  m_sync_q 8, v_quads8
++::mc_sync_q9
++  m_sync_q 9, v_quads8
++::mc_sync_q10
++  m_sync_q 10, v_quads8
++::mc_sync_q11
++  m_sync_q 11, v_quads8
++
++# mc_exit()
++# Chroma & Luma the same now
++
++.macro m_exit_qn
++  m_exit_drain
++  nop ; nop ; thrend
++  nop
++  nop
++# >>> thrend <<<
++.endm
++
++::mc_exit_c_qn
++::mc_exit_y_qn
++  m_exit_qn
++
++
++
++# mc_interrupt_exit12()
++
++.macro m_exit_q0
++  m_exit_drain
++  sacq -, 12
++  nop ; nop ; thrend
++  mov interrupt, 1
++  nop
++# >>> thrend <<<
++.endm
++
++::mc_exit_c_q0
++::mc_exit_y_q0
++  m_exit_q0
++
++# LUMA CODE
++
++# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
++# For P frames we make the second x,y coordinates offset by +8
++
++
++################################################################################
++# mc_setup
++#
++# typedef struct qpu_mc_pred_y_s_s {
++#   qpu_mc_src_t next_src1;
++#   qpu_mc_src_t next_src2;
++#   uint16_t pic_h;
++#   uint16_t pic_w;
++#   uint32_t stride2;
++#   uint32_t stride1;
++#   uint32_t wdenom;
++#   uint32_t next_fn;
++# } qpu_mc_pred_y_s_t;
++
++.macro m_setup_y, v_bit_depth
++
++# Cannot use mul24 on x as x might be -ve, so must use shift
++.if v_bit_depth <= 8
++.set v_x_shift, 0
++.set v_pmask, 0xff
++.set v_blk_height, Y_BLK_HEIGHT_8
++.else
++.set v_x_shift, 1
++.set v_pmask, 0xffff
++.set v_blk_height, Y_BLK_HEIGHT_16
++.endif
++
++
++  # Need to save these because we need to know the frame dimensions before computing texture coordinates
++  mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y
++  mov ra9, unif # ref_y_base
++  mov ra1, unif # x2_y2
++  mov ra11, unif # ref_y2_base
++
++# load constants
++  mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++  shl rb_ef, r0, i_shift30
++
++
++  mov ra_kff100100, 0xff100100
++  mov rb_pmask, v_pmask
++  mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++
++# Compute part of VPM to use
++
++# Read image dimensions
++  mov ra3, unif # width_height
++  mov rb_xpitch, unif # stride2
++.if v_x_shift == 0
++  sub rb_max_x, ra3.16b, 1
++.else
++  sub r0, ra3.16b, 1
++  shl rb_max_x, r0, v_x_shift
++.endif
++  sub rb_max_y, ra3.16a, 1
++  mov rb_pitch, unif # stride1
++
++# get destination pitch
++  mov r1, vdw_setup_1(0)
++  or rb_dma1_base, r1, rb_pitch
++
++# Compute base address for first and second access
++  mov r3, elem_num
++  add r0, ra0.16b, r3 # Load x + elem_num
++.if v_x_shift != 0
++  shl r0, r0, v_x_shift
++.endif
++  max r0, r0, 0
++  min r0, r0, rb_max_x
++  shl ra_xshift_next, r0, 3 # Compute shifts
++
++# X is byte offset - we can only load words - mask
++
++  and r0, r0, -4 ; v8subs r2, r2, r2
++  sub r2, r2, rb_pitch
++  and r1, r0, r2
++  xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1 # Add stripe offsets
++  add ra_base, ra9, r0
++
++  # r3 still contains elem_num
++  add r0, ra1.16b, r3 # Load x
++.if v_x_shift != 0
++  shl r0, r0, v_x_shift
++.endif
++  max r0, r0, 0
++  min r0, r0, rb_max_x
++  shl rb_xshift2_next, r0, 3 # Compute shifts
++
++  # r2 still contains mask
++  and r0, r0, -4
++  and r1, r0, r2
++  xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1 # Add stripe offsets
++  add ra_base2, ra11, r0
++
++# Do preloads
++  nop ; mov r0, ra0.16a # ; r0 = y
++  mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2
++
++:1
++  sub.setf r3, r3, 1
++  max r1, r0, 0
++  min r1, r1, rb_max_y
++  add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
++  add t0s, ra_base, r1 ; mov ra_y, r0
++
++  max r1, r2, 0
++  brr.anynz -, r:1b
++  min r1, r1, rb_max_y
++  add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
++  add t1s, ra_base2, r1 ; mov ra_y2, r2
++# >>> .anynz 1b
++
++  add rb_wt_den_p15, unif, 23 - v_bit_depth # weight denom
++
++  m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
++
++  mov ra_link, unif # Next fn
++
++# touch vertical context to keep simulator happy
++  mov ra8, 0 ; mov rb8, 0
++  bra -, ra_link
++  mov ra9, 0 ; mov rb9, 0
++  mov ra10, 0 ; mov rb10, 0
++  mov ra11, 0 ; mov rb11, 0
++# >>> ra_link
++.endm
++
++::mc_setup_y_q0
++  m_setup_q0
++::mc_setup_y_qn
++  m_setup_y 8
++
++################################################################################
++#
++# Start of per-block setup code
++# P and B blocks share the same setup code to save on Icache space
++
++# luma_setup_delay3 done in delay slots of branch that got us here
++
++# get base addresses and per-channel shifts for *next* invocation
++# per-channel shifts were calculated on the *previous* invocation
++
++# 1st 3 instructions of per_block_setup in branch delay
++#
++# typedef struct qpu_mc_pred_y_p_s {
++#   qpu_mc_src_t next_src1;
++#   qpu_mc_src_t next_src2;
++#   uint16_t h;
++#   uint16_t w;
++#   uint32_t mymx21;
++#   uint32_t wo1;
++#   uint32_t wo2;
++#   uint32_t dst_addr;
++#   uint32_t next_fn;
++# } qpu_mc_pred_y_p_t;
++#
++
++.macro m_luma_setup, v_bit_depth
++# Hack - QASM may well have label pasting but I have no idea how...
++.if v_bit_depth == 8
++  brr ra_link, r:per_block_setup_8
++.elif v_bit_depth == 10
++  brr ra_link, r:per_block_setup_10
++.endif
++  mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack??
++  add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0
++  add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
++.endm
++
++.macro m_per_block_setup, v_bit_depth
++
++.if v_bit_depth <= 8
++.set v_x_shift, 0
++.set v_x_mul, 1
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 7
++.set v_dma_wh_shift, i_shift16
++.else
++.set v_x_shift, 1
++.set v_x_mul, 2
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 8
++.set v_dma_wh_shift, 15
++.endif
++
++.if v_x_shift != 0
++  shl r0, r0, v_x_shift
++.endif
++  max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
++  min r0, r0, rb_max_x
++
++  shl ra_xshift_next, r0, 3 # Compute shifts
++  and r0, r0, -4
++  sub r2, r5, rb_pitch ; mov ra_base_next, unif # src1.base
++  and r1, r0, r2 ; mov ra_y_next, ra0.16a
++  xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y
++  add ra_base_next, ra_base_next, r0 # [ra1 delay]
++
++  add r0, ra1.16b, r3 # Load x2
++.if v_x_shift != 0
++  shl r0, r0, v_x_shift
++.endif
++  max r0, r0, r5 ; mov ra_y2_next, ra1.16a
++  min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base
++  shl rb_xshift2_next, r0, 3 # Compute shifts
++  and r0, r0, -4 ; mov ra_width_height, unif # ; width_height
++  and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write
++  xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes
++  add rb_base2_next, rb_base2_next, r0
++
++# get width,height of block (unif load above), r1 = width * pel_size
++  sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width)
++  add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height
++  add rb_lcount, r0, 7
++  shl r0, r0, v_dma_h_shift
++  add r0, r0, r1 # Combine width and height of destination area
++  shl r0, r0, v_dma_wh_shift # Shift into bits 16 upwards of the vdw_setup0 register
++  add rb_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets
++
++# get filter coefficients and discard unused B frame values
++  shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight
++  shl ra8, r0, 3 ; mov r3, ra_k255
++
++# Pack the 1st 4 filter coefs for H & V tightly
++# Coeffs are all abs values here as that means mul24 works (no sign extend from .8)
++
++  mov r1,0x00010100 # -ve [ra8 delay]
++  ror ra2.8a, r1, ra8.8d
++  ror ra0.8a, r1, ra8.8c
++
++  mov r1, 0x01040400
++  ror ra2.8b, r1, ra8.8d
++  ror ra0.8b, r1, ra8.8c
++
++  mov r1,0x050b0a00 # -ve
++  ror ra2.8c, r1, ra8.8d
++  ror ra0.8c, r1, ra8.8c
++
++  mov r1,0x11283a40
++  ror ra2.8d, r1, ra8.8d
++  ror ra0.8d, r1, ra8.8c
++
++# In the 2nd vertical half we use b registers due to using a-side fifo regs
++
++  mov r1,0x3a281100
++  ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
++  ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3
++
++  mov r1,0x0a0b0500 # -ve
++  ror r0, r1, ra8.8d
++  ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3
++
++  mov r1,0x04040100
++  ror r0, r1, ra8.8d
++  ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3
++
++  mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif # ; Destination address
++
++  mov r1,0x01010000 # -ve
++  ror r0, r1, ra8.8d
++
++  bra -, ra_link
++  ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3
++
++  shl r0, ra_wt_off_l0, rb_wt_den_p15 # Offset calc
++  # For B l1 & L0 offsets should be identical so it doesn't matter which we use
++  asr rb_wt_off, r0, 9 ; mov ra_link, unif # ; link - load after we've used its previous val
++# >>> branch ra_link
++
++# r5 = 0
++# ra_wt_mul_l1 = weight L1
++# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred)
++# rb_wt_off = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb_wt_den_p15 - 1)
++# rb_wt_den_p15 = weight denom + 6 + 9
++# rb_wt_mul_l0 = weight L0
++.endm
++
++:per_block_setup_8
++  m_per_block_setup 8
++
++
++
++################################################################################
++# mc_filter(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
++# In a P block, y2_x2 should be y_x+8
++# At this point we have already issued two pairs of texture requests for the current block
++
++.macro m_filter_y_pxx, v_bit_depth
++  m_luma_setup v_bit_depth
++
++  shl ra_wt_mul_l0, ra_wt_mul_l0, 1
++
++# r5 = 0 (loop count)
++
++:1
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++
++# N.B. Whilst y == y2 as far as this loop is concerned, we will start
++# the grab for the next block before we finish with this block, and that
++# might be B where y != y2, so we must do full processing on both y and y2
++
++  sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
++  shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++  shr r0, r4, ra_xshift ; mov r3, rb_pitch
++
++  max r2, ra_y, 0 # y
++  min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++  add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++  add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
++
++  max r2, ra_y2, 0
++  min r2, r2, rb_max_y ; mov ra7, ra8
++  add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++  add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
++
++  add.setf -, rb_ef, rb_ef ; mov ra8, ra9
++
++# apply horizontal filter
++  and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++  nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++  nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++  nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++  sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++  nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++  sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++  nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++  add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++  nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++  add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++  nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++  sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++  nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++  add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++  nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++
++  sub.setf -, r5, 8 ; mov ra9, ra10
++  sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
++  brr.anyn -, r:1b
++  mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
++  mov ra10, ra11 ; mov rb10, rb11
++  asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
++  # >>> .anyn 1b
++
++  # apply vertical filter and write to VPM
++  sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
++  sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
++  add r1, r1, r0 ; mul24 r0, ra8, rb4
++  add r1, r1, r0 ; mul24 r0, ra9, rb5
++  sub r1, r1, r0 ; mul24 r0, ra10, rb6
++  add r1, r1, r0 ; mul24 r0, ra11, rb7
++  sub r1, r1, r0
++# At this point r1 is a 22-bit signed quantity: 8 (original sample),
++# +6, +6 (each pass), +1 (the passes can overflow slightly), +1 (sign)
++# The top 8 bits have rubbish in them as mul24 is unsigned
++# The low 6 bits need discard before weighting
++  sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish
++  asr r1, r1, 14
++  nop ; mul24 r1, r1, ra_wt_mul_l0
++  add r1, r1, rb_wt_off ; mov r3, ra_blk_height # ; r3 = block height for outside loop
++
++  shl r1, r1, 8 ; v8subs r0, ra_height, r3
++  brr.anyn -, r:1b
++  asr r1, r1, rb_wt_den_p15
++  min r1, r1, ra_pmax ; mov -, vw_wait
++  max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
++
++# >>> branch.anyn yloop
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height (currently always 16)
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc rb_dma0, rb_lcount based on new segment height
++
++  mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0
++
++# DMA out
++  bra.anyz -, ra_link
++  min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
++  sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW
++  shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++  add rb_lcount, rb_lcount, r0
++  brr -, r:1b
++  add rb_dma0, rb_dma0, r1
++  add rb_dest, rb_dest, r2
++  mov vw_setup, rb_vpm_init # Reset our VDM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_y_pxx
++  m_filter_y_pxx 8
++
++
++################################################################################
++
++# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
++# In a P block, only the first half of the coefficients contains used information.
++# At this point we have already issued two pairs of texture requests for the current block
++# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time?
++# Or possibly by taking advantage of symmetry?
++
++.macro m_filter_y_bxx, v_bit_depth
++  m_luma_setup v_bit_depth
++
++:1
++  sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
++  shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++  shr r0, r4, ra_xshift ; mov r3, rb_pitch
++
++  max r2, ra_y, 0 # y
++  min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++  add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++  add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
++
++  max r2, ra_y2, 0
++  min r2, r2, rb_max_y ; mov ra7, ra8
++  add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++  add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
++
++  add.setf -, rb_ef, rb_ef ; mov ra8, ra9
++
++# apply horizontal filter
++  and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++  nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++  nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++  nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++  sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++  nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++  sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++  nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++  add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++  nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++  add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++  nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++  sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++  nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++  add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++  nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++
++  sub.setf -, r5, 8 ; mov ra9, ra10
++  sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
++  brr.anyn -, r:1b
++  mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
++  mov ra10, ra11 ; mov rb10, rb11
++  asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
++  # >>> .anyn 1b
++
++  # apply vertical filter and write to VPM
++  sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
++  sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
++  add r1, r1, r0 ; mul24 r0, ra8, rb4
++  add r1, r1, r0 ; mul24 r0, ra9, rb5
++  sub r1, r1, r0 ; mul24 r0, ra10, rb6
++  add r1, r1, r0 ; mul24 r0, ra11, rb7
++  sub r1, r1, r0 ; mov r2, rb_wt_off
++# As with P-pred r1 is a 22-bit signed quantity in 32-bits
++# Top 8 bits are bad - low 6 bits should be discarded
++  sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
++
++  asr r1, r1, 14
++  nop ; mul24 r0, r1, ra_wt_mul_l0
@ "mul_used", 0 ++ ++ add r1, r1, r0 ; mov r3, ra_blk_height ++ shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_y_bxx ++ m_filter_y_bxx 8 ++ ++################################################################################ ++# ++# typedef struct qpu_mc_pred_y_p00_s { ++# qpu_mc_src_t next_src1; ++# uint16_t h; ++# uint16_t w; ++# uint32_t wo1; ++# uint32_t dst_addr; ++# uint32_t next_fn; ++# } qpu_mc_pred_y_p00_t; ++ ++.macro m_filter_y_p00, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_x_mul, 1 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++ mov ra0, unif ; mov r3, elem_num # y_x ++ mov ra_xshift, ra_xshift_next # [ra0 delay] ++ add r0, ra0.16b, r3 ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ ++ max r0, r0, 0 ++ min r0, r0, rb_max_x ++ ++ shl ra_xshift_next, r0, 3 # Compute shifts ++ and r0, r0, -4 ; v8subs r2, r2, r2 ++ sub r2, r2, rb_pitch ; mov ra_base_next, unif # src1.base ++ and r1, r0, r2 ; mov ra_y_next, ra0.16a ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov ra_width_height, unif # Add stripe offsets ; width_height ++ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write ++ ++# get width,height of block (unif load above) ++# Compute vdw_setup1(dst_pitch-width) ++ shl r1, ra_width, v_x_shift ++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++ add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif # Combine width and height of destination area ; weight_offset ++ shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif # Shift into bits 16 upwards of the vdw_setup0 register ; dest addr ++ add rb_dma0, r0, rb_dma0_base ++ ++ shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 # Offset calc ; r5 = 0 ++ # For B l1 & L0 offsets should be identical so it doesn't matter which we use ++ asr rb_wt_off, r0, 1 ; mov ra_link, unif # ; link ++ ++:1 ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch ++ ++ max r2, ra_y, 0 # y ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, 
ra_base, r2 ; v8min r0, r0, rb_pmask ++ ++ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++ shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_y_p00 ++ m_filter_y_p00 8 ++ ++################################################################################ ++ ++.macro m_filter_y_b00, v_bit_depth ++# luma setup does a fair bit more than we need calculating filter coeffs ++# that we will never use but it saves I-cache to use it (also simple!) ++ m_luma_setup v_bit_depth ++ ++# Fix up vals that were expecting a filter (somewhat icky) ++ mov r0, 7 ++ sub rb_i_tmu, rb_i_tmu, r0 ++ sub rb_lcount, rb_lcount, r0 ++ mov r0, 8 ; mov r1, ra_wt_off_mul_l0 ++ shl rb_wt_off, rb_wt_off, r0 ++ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++ ++:1 ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch ++ ++ max r2, ra_y, 0 # y ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++ ++ max r2, ra_y2, 0 ++ min r2, r2, rb_max_y ++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte ++ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++ ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++ add r1, r0, r1 ++ shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ brr -, r:1b ++ 
add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_y_b00 ++ m_filter_y_b00 8 ++ ++################################################################################ ++################################################################################ ++# 10 BIT ++ ++::mc_setup_c10_q0 ++ m_setup_q0 ++::mc_setup_c10_qn ++ m_setup_c 10 ++ ++::mc_filter_c10_p ++ m_filter_c_p 0, 10 ++ ++::mc_filter_c10_p_l1 ++ m_filter_c_p 1, 10 ++ ++ ++::mc_filter_c10_b ++ m_filter_c_b 10 ++ ++# Even if these fns are the same as for other bit depths we want our own copy ++# to keep the code we are using in a single lump to avoid (direct map) cache ++# thrashing ++.set v_quads10, N_QPU_16 / 4 ++ ++::mc_sync10_q0 ++ m_sync_q 0, v_quads10 ++::mc_sync10_q1 ++ m_sync_q 1, v_quads10 ++::mc_sync10_q2 ++ m_sync_q 2, v_quads10 ++::mc_sync10_q3 ++ m_sync_q 3, v_quads10 ++::mc_sync10_q4 ++ m_sync_q 4, v_quads10 ++::mc_sync10_q5 ++ m_sync_q 5, v_quads10 ++::mc_sync10_q6 ++ m_sync_q 6, v_quads10 ++::mc_sync10_q7 ++ m_sync_q 7, v_quads10 ++::mc_sync10_q8 ++ m_sync_q 8, v_quads10 ++::mc_sync10_q9 ++ m_sync_q 9, v_quads10 ++::mc_sync10_q10 ++ m_sync_q 10, v_quads10 ++::mc_sync10_q11 ++ m_sync_q 11, v_quads10 ++ ++::mc_exit_y10_q0 ++::mc_exit_c10_q0 ++ m_exit_q0 ++ ++::mc_exit_y10_qn ++::mc_exit_c10_qn ++ m_exit_qn ++ ++::mc_setup_y10_q0 ++ m_setup_q0 ++::mc_setup_y10_qn ++ m_setup_y 10 ++ ++:per_block_setup_10 ++ m_per_block_setup 10 ++ ++::mc_filter_y10_pxx ++ m_filter_y_pxx 10 ++ ++::mc_filter_y10_p00 ++ m_filter_y_p00 10 ++ ++::mc_filter_y10_bxx ++ m_filter_y_bxx 10 ++ ++::mc_filter_y10_b00 ++ m_filter_y_b00 10 ++ ++ ++ ++::mc_end ++# Do not add code here because mc_end must appear after all other code. +diff --git a/libavcodec/rpi_hevc_shader_cmd.h b/libavcodec/rpi_hevc_shader_cmd.h +new file mode 100644 +index 0000000000..9f8983da52 +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader_cmd.h +@@ -0,0 +1,128 @@ ++#ifndef RPI_SHADER_CMD_H ++#define RPI_SHADER_CMD_H ++ ++#pragma pack(push, 4) ++ ++#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y ++// If mixed then we are just confused and get a lot of warnings.... 
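++// (When emulating, addresses are real host pointers; the hardware QPU build
++// passes 32-bit VideoCore bus addresses instead.)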
++typedef const uint8_t * qpu_mc_src_addr_t;
++typedef uint8_t * qpu_mc_dst_addr_t;
++#else
++typedef uint32_t qpu_mc_src_addr_t;
++typedef uint32_t qpu_mc_dst_addr_t;
++#endif
++
++typedef struct qpu_mc_src_s
++{
++    int16_t y;
++    int16_t x;
++    qpu_mc_src_addr_t base;
++} qpu_mc_src_t;
++
++
++typedef struct qpu_mc_pred_c_p_s {
++    qpu_mc_src_t next_src;
++    uint16_t h;
++    uint16_t w;
++    uint32_t coeffs_x;
++    uint32_t coeffs_y;
++    uint32_t wo_u;
++    uint32_t wo_v;
++    qpu_mc_dst_addr_t dst_addr_c;
++    uint32_t next_fn;
++} qpu_mc_pred_c_p_t;
++
++typedef struct qpu_mc_pred_c_b_s {
++    qpu_mc_src_t next_src1;
++    uint16_t h;
++    uint16_t w;
++    uint32_t coeffs_x1;
++    uint32_t coeffs_y1;
++    uint32_t weight_u1;
++    uint32_t weight_v1;
++    qpu_mc_src_t next_src2;
++    uint32_t coeffs_x2;
++    uint32_t coeffs_y2;
++    uint32_t wo_u2;
++    uint32_t wo_v2;
++    qpu_mc_dst_addr_t dst_addr_c;
++    uint32_t next_fn;
++} qpu_mc_pred_c_b_t;
++
++typedef struct qpu_mc_pred_c_s_s {
++    qpu_mc_src_t next_src1;
++    uint32_t pic_cw;            // C Width (== Y width / 2)
++    uint32_t pic_ch;            // C Height (== Y Height / 2)
++    uint32_t stride2;
++    uint32_t stride1;
++    uint32_t wdenom;
++    qpu_mc_src_t next_src2;
++    uint32_t next_fn;
++} qpu_mc_pred_c_s_t;
++
++typedef struct qpu_mc_pred_c_s {
++    union {
++        qpu_mc_pred_c_p_t p;
++        qpu_mc_pred_c_b_t b;
++        qpu_mc_pred_c_s_t s;
++    };
++} qpu_mc_pred_c_t;
++
++
++typedef struct qpu_mc_pred_y_p_s {
++    qpu_mc_src_t next_src1;
++    qpu_mc_src_t next_src2;
++    uint16_t h;
++    uint16_t w;
++    uint32_t mymx21;
++    uint32_t wo1;
++    uint32_t wo2;
++    qpu_mc_dst_addr_t dst_addr;
++    uint32_t next_fn;
++} qpu_mc_pred_y_p_t;
++
++typedef struct qpu_mc_pred_y_p00_s {
++    qpu_mc_src_t next_src1;
++    uint16_t h;
++    uint16_t w;
++    uint32_t wo1;
++    qpu_mc_dst_addr_t dst_addr;
++    uint32_t next_fn;
++} qpu_mc_pred_y_p00_t;
++
++typedef struct qpu_mc_pred_y_s_s {
++    qpu_mc_src_t next_src1;
++    qpu_mc_src_t next_src2;
++    uint16_t pic_h;
++    uint16_t pic_w;
++    uint32_t stride2;
++    uint32_t stride1;
++    uint32_t wdenom;
++    uint32_t next_fn;
++} qpu_mc_pred_y_s_t;
++
++// Only a useful structure in that it allows us to return something other than a void *
++typedef struct qpu_mc_pred_y_s {
++    union {
++        qpu_mc_pred_y_p_t p;
++        qpu_mc_pred_y_p00_t p00;
++        qpu_mc_pred_y_s_t s;
++    };
++} qpu_mc_pred_y_t;
++
++typedef union qpu_mc_pred_cmd_u {
++    qpu_mc_pred_y_t y;
++    qpu_mc_pred_c_t c;
++    uint32_t data[1];
++} qpu_mc_pred_cmd_t;
++
++#define QPU_MC_PRED_N_Y8 12
++#define QPU_MC_PRED_N_C8 12
++
++#define QPU_MC_PRED_N_Y10 12
++#define QPU_MC_PRED_N_C10 12
++
++#pragma pack(pop)
++
++#endif
++
+diff --git a/libavcodec/rpi_hevc_shader_template.c b/libavcodec/rpi_hevc_shader_template.c
+new file mode 100644
+index 0000000000..0c80cf4de0
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_template.c
+@@ -0,0 +1,62 @@
++#include "hevc.h"
++#include "rpi_hevcdec.h"
++#include "libavutil/rpi_sand_fns.h"
++#include "rpi_hevc_shader_cmd.h"
++#include "rpi_hevc_shader_template.h"
++
++typedef struct shader_track_s
++{
++    const union qpu_mc_pred_cmd_u *qpu_mc_curr;
++    const struct qpu_mc_src_s *last_l0;
++    const struct qpu_mc_src_s *last_l1;
++    uint32_t width;  // pic_width * PW
++    uint32_t height;
++    uint32_t stride2;
++    uint32_t stride1;
++    uint32_t wdenom;
++} shader_track_t;
++
++static int wtoidx(const unsigned int w)
++{
++    static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
++    return pel_weight[w];
++}
++
++static int fctom(uint32_t x)
++{
++    int rv;
++    // As it happens we can take the 2nd filter term & divide it by 8
++    // (dropping fractions) to get the fractional move
++    rv = 8 - ((x >> 11) & 0xf);
++    av_assert2(rv >= 0 && rv <= 7);
++    return rv;
++}
++
++static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr)
++{
++    return (x << shl) >> shr;
++}
++
++static inline int woff_p(HEVCRpiContext *const s, int32_t x)
++{
++    return ext(x, 0, 17 + s->ps.sps->bit_depth - 8);
++}
++
++static inline int woff_b(HEVCRpiContext *const s, int32_t x)
++{
++    return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8);
++}
++
++static inline int wweight(int32_t x)
++{
++    return ext(x, 16, 16);
++}
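++
++// Note (assuming the "wo[offset] = offset*2+1" packing remarked on by the
++// callers below, i.e. wo == ((offset * 2 + 1) << 16) | (weight & 0xffff)):
++// wweight() recovers the signed 16-bit weight and, at bit_depth 8, woff_p()
++// recovers offset, since (((offset * 2 + 1) << 16) + weight) >> 17 == offset
++// for 0 <= weight < 0x10000.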
++
++
++#define PW 1
++#include "rpi_hevc_shader_template_fn.h"
++
++#undef PW
++#define PW 2
++#include "rpi_hevc_shader_template_fn.h"
++
+diff --git a/libavcodec/rpi_hevc_shader_template.h b/libavcodec/rpi_hevc_shader_template.h
+new file mode 100644
+index 0000000000..304d73ea4a
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_template.h
+@@ -0,0 +1,22 @@
++#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H
++#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H
++
++struct HEVCRpiContext;
++struct HEVCRpiInterPredEnv;
++
++void ff_hevc_rpi_shader_c8(struct HEVCRpiContext *const s,
++    const struct HEVCRpiInterPredEnv *const ipe_y,
++    const struct HEVCRpiInterPredEnv *const ipe_c);
++
++void ff_hevc_rpi_shader_c16(struct HEVCRpiContext *const s,
++    const struct HEVCRpiInterPredEnv *const ipe_y,
++    const struct HEVCRpiInterPredEnv *const ipe_c);
++
++void rpi_sand_dump8(const char * const name,
++    const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
++
++void rpi_sand_dump16(const char * const name,
++    const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
++
++#endif
++
+diff --git a/libavcodec/rpi_hevc_shader_template_fn.h b/libavcodec/rpi_hevc_shader_template_fn.h
+new file mode 100644
+index 0000000000..b9e7c07fe3
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_template_fn.h
+@@ -0,0 +1,477 @@
++#define STRCAT(x,y) x##y
++
++#if PW == 1
++#define pixel uint8_t
++#define FUNC(f) STRCAT(f, 8)
++#elif PW == 2
++#define pixel uint16_t
++#define FUNC(f) STRCAT(f, 16)
++#else
++#error Unexpected PW
++#endif
++
++#define PATCH_STRIDE (16 * PW)
++
++static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
++{
++    for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) {
++        const pixel s = *(const pixel *)src;
++        pixel * d = (pixel *)dst;
++        for (unsigned int j = 0; j < w; j += PW) {
++            *d++ = s;
++        }
++    }
++}
++
++static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
++{
++    for (unsigned int i = 0; i != h; ++i, dst += stride) {
++        memcpy(dst, src, w);
++    }
++}
++
++static void FUNC(get_patch_y)(const shader_track_t * const st,
++    uint8_t * dst, const unsigned int dst_stride,
++    const qpu_mc_src_t *src,
++    unsigned int _w, unsigned int _h)
++{
++    int x = src->x * PW;
++    int y = src->y;
++    int w = _w * PW;
++    int h = _h;
++    int dl = 0;
++    int dr = 0;
++    int dt = 0;
++    int db = 0;
++
++    if (x < 0) {
++        if (-x >= w)
++            x = PW - w;
++        dl = -x;
++        w += x;
++        x = 0;
++    }
++    if (x + w > st->width) {
++        if (x >= st->width)
++            x = st->width - PW;
++        dr = (x + w) - st->width;
++        w = st->width - x;
++    }
++
++    // Y
++    if (y < 0) {
++        if (-y >= h)
++            y = 1 - h;
++        dt = -y;
++        h += y;
++        y = 0;
++    }
++    if (y + h > st->height) {
++        if (y >= st->height)
++            y = st->height - 1;
++        db = (y + h) - st->height;
++        h = st->height - y;
++    }
++
++    dst += dl + dt * dst_stride;
++    FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
++
++    // Edge dup
++    if (dl != 0)
++        FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride);
++    if (dr != 0)
++        FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride);
++    w += dl + dr;
++    dst -= dl;
++
++    if (dt != 0)
++        FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride);
++    if (db != 0)
++        FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride);
++}
++
++
++
++static void FUNC(get_patch_c)(const shader_track_t * const st,
++    uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride,
++    const qpu_mc_src_t *src,
++    unsigned int _w, unsigned int _h)
++{
++    int x = src->x * PW;
++    int y = src->y;
++    int w = _w * PW;
++    int h = _h;
++    int dl = 0;
++    int dr = 0;
++    int dt = 0;
++    int db = 0;
++    const int width = st->width;
++    const int height = st->height;
++
++    if (x < 0) {
++        if (-x >= w)
++            x = PW - w;
++        dl = -x;
++        w += x;
++        x = 0;
++    }
++    if (x + w > width) {
++        if (x >= width)
++            x = width - PW;
++        dr = (x + w) - width;
++        w = width - x;
++    }
++
++    // Y
++    if (y < 0) {
++        if (-y >= h)
++            y = 1 - h;
++        dt = -y;
++        h += y;
++        y = 0;
++    }
++    if (y + h > height) {
++        if (y >= height)
++            y = height - 1;
++        db = (y + h) - height;
++        h = height - y;
++    }
++
++    dst_u += dl + dt * dst_stride;
++    dst_v += dl + dt * dst_stride;
++    FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
++
++    // Edge dup
++    if (dl != 0)
++    {
++        FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride);
++        FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride);
++    }
++    if (dr != 0)
++    {
++        FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride);
++        FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride);
++    }
++    w += dl + dr;
++    dst_u -= dl;
++    dst_v -= dl;
++
++    if (dt != 0)
++    {
++        FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride);
++        FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride);
++    }
++    if (db != 0)
++    {
++        FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride);
++        FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride);
++    }
++}
++
++// x, y, w, h in pixels
++// stride1, stride2 in bytes
++void FUNC(rpi_sand_dump)(const char * const name,
++    const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c)
++{
++    const int mask = stride2 == 0 ? ~0 : stride1 - 1;
++
++    printf("%s (%d,%d) %dx%d\n", name, x, y, w, h);
++
++    if (is_c) {
++        x *= 2;
++        w *= 2;
++    }
++
++    for (int i = y; i != y + h; ++i) {
++        for (int j = x; j != x + w; ++j) {
++            const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2;
++            char sep = is_c && (j & 1) == 0 ? ':' : ' ';
++#if PW == 1
++            if (j < 0 || i < 0)
++                printf("..%c", sep);
++            else
++                printf("%02x%c", *(const pixel*)p, sep);
++#else
++            if (j < 0 || i < 0)
++                printf("...%c", sep);
++            else
++                printf("%03x%c", *(const pixel*)p, sep);
++#endif
++        }
++        printf("\n");
++    }
++}
++
++
++void FUNC(ff_hevc_rpi_shader_c)(HEVCRpiContext *const s,
++    const HEVCRpiInterPredEnv *const ipe_y,
++    const HEVCRpiInterPredEnv *const ipe_c)
++{
++    for (int c_idx = 0; c_idx < 2; ++c_idx)
++    {
++        const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c;
++        shader_track_t tracka[QPU_N_MAX] = {{NULL}};
++        unsigned int exit_n = 0;
++
++        if (ipe == NULL || !ipe->used) {
++            continue;
++        }
++
++        do {
++            for (unsigned int i = 0; i != ipe->n; ++i) {
++                const HEVCRpiInterPredQ * const q = ipe->q + i;
++                shader_track_t * const st = tracka + i;
++                const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr;
++
++                for (;;) {
++                    const uint32_t link = (cmd == q->qpu_mc_base) ? q->code_setup : ((uint32_t *)cmd)[-1];
++
++                    if (link == q->code_setup) {
++                        if (c_idx == 0) {
++                            // Luma
++                            const qpu_mc_pred_y_s_t *const c = &cmd->y.s;
++
++                            st->height = c->pic_h;
++                            st->width = c->pic_w * PW;
++                            st->stride1 = c->stride1;
++                            st->stride2 = c->stride2;
++                            st->wdenom = c->wdenom;
++                            st->last_l0 = &c->next_src1;
++                            st->last_l1 = &c->next_src2;
++                            cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++                        }
++                        else {
++                            // Chroma
++                            const qpu_mc_pred_c_s_t *const c = &cmd->c.s;
++
++                            st->height = c->pic_ch;
++                            st->width = c->pic_cw * PW;
++                            st->stride1 = c->stride1;
++                            st->stride2 = c->stride2;
++                            st->wdenom = c->wdenom;
++                            st->last_l0 = &c->next_src1;
++                            st->last_l1 = &c->next_src2;
++                            cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++                        }
++                    }
++                    else if (link == s->qpu.y_pxx) {
++                        const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
++                        const int w1 = FFMIN(c->w, 8);
++                        const int w2 = c->w - w1;
++
++                        uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++                        uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++
++                        FUNC(get_patch_y)(st,
++                            patch_y1, PATCH_STRIDE,
++                            st->last_l0,
++                            16, c->h + 7);
++                        if (w2 > 0) {
++                            FUNC(get_patch_y)(st,
++                                patch_y2, PATCH_STRIDE,
++                                st->last_l1,
++                                16, c->h + 7);
++                        }
++
++                        // wo[offset] = offset*2+1
++                        s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
++                            (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
++                            c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1);
++                        if (w2 > 0) {
++                            s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
++                                (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
++                                c->h, st->wdenom, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2);
++                        }
++                        st->last_l0 = &c->next_src1;
++                        st->last_l1 = &c->next_src2;
++                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++                    }
++                    else if (link == s->qpu.y_bxx) {
++                        const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
++
++                        uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++                        uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++                        int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
++
++                        FUNC(get_patch_y)(st,
++                            patch_y1, PATCH_STRIDE,
++                            st->last_l0,
++                            16, c->h + 7);
++                        FUNC(get_patch_y)(st,
++                            patch_y2, PATCH_STRIDE,
++                            st->last_l1,
++                            16, c->h + 7);
++
++                        s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
++                            patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
++                            c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w);
++
++                        s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
++                            (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3,
++                            c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2),
++                            0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w);
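++                        // Record where this command's next-source descriptors
++                        // live; the following command's patch fetches use them.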
++                        st->last_l0 = &c->next_src1;
++                        st->last_l1 = &c->next_src2;
++                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++                    }
++                    else if (link == s->qpu.y_p00) {
++                        const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00;
++
++                        uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++
++                        FUNC(get_patch_y)(st,
++                            patch_y1, PATCH_STRIDE,
++                            st->last_l0,
++                            16, c->h + 7);
++
++                        // wo[offset] = offset*2+1
++                        s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0](
++                            (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE,
++                            c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w);
++
++                        st->last_l0 = &c->next_src1;
++                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++                    }
++                    else if (link == s->qpu.y_b00) {
++                        const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
++
++                        uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++                        uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++                        int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
++
++                        av_assert0(c->w <= 16 && c->h <= 64);
++
++                        FUNC(get_patch_y)(st,
++                            patch_y1, PATCH_STRIDE,
++                            st->last_l0,
++                            16, c->h);
++                        FUNC(get_patch_y)(st,
++                            patch_y2, PATCH_STRIDE,
++                            st->last_l1,
++                            16, c->h);
++
++                        s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0](
++                            patch_y3, patch_y1, PATCH_STRIDE,
++                            c->h, 0, 0, c->w);
++
++                        s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0](
++                            (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3,
++                            c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2),
++                            0, woff_b(s, c->wo2), 0, 0, c->w);
++                        st->last_l0 = &c->next_src1;
++                        st->last_l1 = &c->next_src2;
++                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++                    }
++                    else if (link == s->qpu.c_pxx) {
++                        const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
++                        const int mx = fctom(c->coeffs_x);
++                        const int my = fctom(c->coeffs_y);
++
++                        uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++                        uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++                        uint8_t patch_u3[8 * 16 * PW];
++                        uint8_t patch_v3[8 * 16 * PW];
++
++                        FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
++
++                        s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++                            patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++                            c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
++                        s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++                            patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++                            c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
++
++                        FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++
++                        st->last_l0 = &c->next_src;
++                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++                    }
++                    else if (link == s->qpu.c_pxx_l1) {
++                        const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
++                        const int mx = fctom(c->coeffs_x);
++                        const int my = fctom(c->coeffs_y);
++
++                        uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++                        uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++                        uint8_t patch_u3[8 * 16 * PW];
++                        uint8_t patch_v3[8 * 16 * PW];
++
++                        FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
++
++                        s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++                            patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++                            c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
++                        s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++                            patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++                            c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
++
++                        FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++
++                        st->last_l1 = &c->next_src;
++                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++                    }
++                    else if (link == s->qpu.c_bxx) {
++                        const qpu_mc_pred_c_b_t *const c = &cmd->c.b;
++                        const int mx1 = fctom(c->coeffs_x1);
++                        const int my1 = fctom(c->coeffs_y1);
++                        const int mx2 = fctom(c->coeffs_x2);
++                        const int my2 = fctom(c->coeffs_y2);
++
++                        uint8_t patch_u1[PATCH_STRIDE * 72];
++                        uint8_t patch_v1[PATCH_STRIDE * 72];
++                        uint8_t patch_u2[PATCH_STRIDE * 72];
++                        uint8_t patch_v2[PATCH_STRIDE * 72];
++                        uint8_t patch_u3[8 * 16 * PW];
++                        uint8_t patch_v3[8 * 16 * PW];
++                        uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE];
++                        uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE];
++
++                        FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
++                        FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
++
++                        s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
++                            patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++                            c->h, mx1, my1, c->w);
++                        s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
++                            patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++                            c->h, mx1, my1, c->w);
++
++                        s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
++                            patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4,
++                            c->h, st->wdenom, c->weight_u1, wweight(c->wo_u2),
++                            0, woff_b(s, c->wo_u2), mx2, my2, c->w);
++                        s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
++                            patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4,
++                            c->h, st->wdenom, c->weight_v1, wweight(c->wo_v2),
++                            0, woff_b(s, c->wo_v2), mx2, my2, c->w);
++
++                        FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++
++                        st->last_l0 = &c->next_src1;
++                        st->last_l1 = &c->next_src2;
++                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++                    }
++                    else if (link == q->code_sync) {
++                        cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1);
++                        break;
++                    }
++                    else if (link == q->code_exit) {
++                        // We expect exit to occur without other sync
++                        av_assert0(i == exit_n);
++                        ++exit_n;
++                        break;
++                    }
++                    else {
++                        av_assert0(0);
++                    }
++                }
++
++                st->qpu_mc_curr = cmd;
++            }
++        } while (exit_n == 0);
++    }
++}
++
++#undef FUNC
++#undef pixel
++
+diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
+new file mode 100644
+index 0000000000..a08a1d6bef
+--- /dev/null
++++ b/libavcodec/rpi_hevc_transform.s
+@@ -0,0 +1,927 @@
++# ******************************************************************************
++# Argon Design Ltd.
++# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
++#
++# Module : HEVC
++# Author : Peter de Rivaz
++# ******************************************************************************
++
++# HEVC VPU Transform
++#
++# Transform matrix can be thought of as
++#   output row vector = input row vector * transMatrix2
++#
++# The even rows of the matrix are symmetric
++# The odd rows of the matrix are antisymmetric
++#
++# So only need to compute the first half of the results, then can compute the remainder with a butterfly
++#
++# EXAMPLE
++#   (a b c d) (1 2  2  1)
++#             (3 4 -4 -3)
++#             (5 6  6  5)
++#             (7 8 -8 -7)
++#
++# x=(a c)(1 2) = 1a+5c 2a+6c
++#        (5 6)
++#
++# y=(b d)(3 4) = 3b+7d 4b+8d
++#        (7 8)
++#
++# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
++# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
++#
++# Final results are (u , v[::-1])
++#
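++# A hypothetical host-side C sketch of that even/odd split (illustration
++# only, not used by the build), for the 4-point example matrix; only the
++# left half of M is ever read, matching the storage of just the left
++# halves of the even and odd rows below:
++#
++#   static const int M[4][4] = { { 1, 2,  2,  1 },   /* even: symmetric     */
++#                                { 3, 4, -4, -3 },   /* odd: antisymmetric  */
++#                                { 5, 6,  6,  5 },
++#                                { 7, 8, -8, -7 } };
++#   /* direct evaluation would be out[j] = sum over i of in[i] * M[i][j] */
++#   void trans4(const int in[4], int out[4])
++#   {
++#       int x0 = in[0] * M[0][0] + in[2] * M[2][0];  /* 1a+5c */
++#       int x1 = in[0] * M[0][1] + in[2] * M[2][1];  /* 2a+6c */
++#       int y0 = in[1] * M[1][0] + in[3] * M[3][0];  /* 3b+7d */
++#       int y1 = in[1] * M[1][1] + in[3] * M[3][1];  /* 4b+8d */
++#       out[0] = x0 + y0;    /* u */
++#       out[1] = x1 + y1;
++#       out[2] = x1 - y1;    /* v, reversed */
++#       out[3] = x0 - y0;
++#   }
++#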
++ # Allows us to prefetch the next block of coefficients for efficiency. ++ mov r0,0 # This describes the location where we read our coefficients from ++ mov r3,16*2 # Stride of coefficients in bytes (TODO remove) ++ mov r7,16*16*2 # Total block size ++ mov r8,64*16 # Value used to swap from current to next VRF location ++ vldh HX(0++,0)+r0,(r1 += r3) REP 16 ++ mov r4,64 # Constant used for rounding first pass ++ mov r5,TRANS_RND2 # Constant used for rounding second pass ++ ++ # At start of block r0,r1 point to the current block (that has already been loaded) ++block_loop: ++ eor r0,r8 ++ add r1,r7 ++ # Prefetch the next block ++ vldh HX(0++,0)+r0,(r1 += r3) REP 16 ++ eor r0,r8 ++ sub r1,r7 ++ ++ # Transform the current block ++ bl col_trans_16 ++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate ++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word. ++ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble? ++ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position ++ ++ bl col_trans_16 ++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate ++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word. ++ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag) ++ ++ # Save results - note there has been a transposition during the processing so we save columns ++ vsth VX(0,32++)+r0, (r1 += r3) REP 16 ++ ++ # Move onto next block ++ eor r0,r8 ++ add r1,r7 ++ ++ addcmpbgt r2,-1,0,block_loop ++ ++ # Now go and do any 32x32 transforms ++ b hevc_trans_32x32 ++ ++ pop r6-r15, pc ++ ++# r1,r2,r3 r7,r8 should be preserved ++# HX(0++,0)+r0 is the block to be transformed ++# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients ++# Use HY(48,0) for intermediate results ++# r0 can be used, but should be returned to its original value at the end ++col_trans_16: ++ add r6,r0,16 # Final value for this loop ++col_trans_16_loop: ++ # First compute partial products for a single column ++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16 ++ # Then sum up the results and place back ++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC ++ addcmpblt r0,1,r6,col_trans_16_loop ++ sub r0,16 # put r0 back to its original value ++ b lr ++ ++col_trans_odd_16: ++ add r6,r0,16 # Final value for this loop ++col_trans_odd_16_loop: ++ # First compute partial products for a single column ++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16 ++ # Then sum up the results and place back ++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC ++ addcmpblt r0,1,r6,col_trans_odd_16_loop ++ sub r0,16 # put r0 back to its original value ++ b lr ++ ++# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num) ++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd ++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) ++# num: number of 16x16 transforms to be done ++# ++hevc_trans_32x32: ++ mov r1,r14 # coeffs ++ mov r2,r15 # num ++ ++ # Fetch odd transform matrix ++ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of 
coefficients) ++ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix ++ #add r0, 16*16*2 ++ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix ++ ++ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer ++ mov r7, 16*16*2 # Total block size ++ sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) ++ # set r8 to 32byte aligned stack pointer ++ add r8,sp,31 ++ lsr r8,5 ++ lsl r8,5 ++ mov r9,r8 # Backup of the temporary storage ++ mov r10,r1 # Backup of the coefficient buffer ++block_loop32: ++ ++ # COLUMN TRANSFORM ++ mov r4, 64 # Constant used for rounding first pass ++ mov r5, 9 # left shift used for rounding first pass ++ ++ # Transform the first 16 columns ++ mov r1,r10 # Input Coefficient buffer ++ mov r8,r9 # Output temporary storage ++ bl trans32 ++ # Transform the second 16 columns ++ add r8,32*16*2 ++ add r1,32 ++ bl trans32 ++ ++ # ROW TRANSFORM ++ mov r4, TRANS_RND2 # Constant used for rounding second pass ++ mov r5, TRANS_ASL2 # left shift used for rounding second pass ++ ++ mov r1,r9 # Input temporary storage ++ mov r8,r10 # Output Coefficient buffer ++ bl trans32 ++ # Transform the second 16 columns ++ add r8,32*16*2 ++ add r1,32 ++ bl trans32 ++ ++ add r10, 32*32*2 # move onto next block of coefficients ++ addcmpbgt r2,-1,0,block_loop32 ++ ++ add sp,sp,32*32*2+32 # Restore stack ++ ++ pop r6-r15, pc ++ ++trans32: ++ push lr ++ # We can no longer afford the VRF space to do prefetching when doing 32x32 ++ # Fetch the even rows ++ vldh HX(0++,0),(r1 += r3) REP 16 ++ # Fetch the odd rows ++ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1 ++ ++ # Transform the even rows using even matrix ++ mov r0, 0 # Even rows ++ bl col_trans_16 ++ ++ # Now transform the odd rows using odd matrix ++ mov r0, 64*16 # Odd rows ++ bl col_trans_odd_16 ++ ++ # Now apply butterfly to compute the first 16 results ++ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16 ++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, ++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate ++ # 16bit results now in HX(48,32) ++ mov r0,r8 ++ mov r6,32*2 ++ vsth VX(48,32++),(r0+=r6) REP 16 ++ ++ # Now apply butterfly to compute the second 16 results (in reverse order) ++ vsub HY(63,0),HY(0 ,0),HY(16,0) ++ vsub HY(62,0),HY(1 ,0),HY(17,0) ++ vsub HY(61,0),HY(2 ,0),HY(18,0) ++ vsub HY(60,0),HY(3 ,0),HY(19,0) ++ vsub HY(59,0),HY(4 ,0),HY(20,0) ++ vsub HY(58,0),HY(5 ,0),HY(21,0) ++ vsub HY(57,0),HY(6 ,0),HY(22,0) ++ vsub HY(56,0),HY(7 ,0),HY(23,0) ++ vsub HY(55,0),HY(8 ,0),HY(24,0) ++ vsub HY(54,0),HY(9 ,0),HY(25,0) ++ vsub HY(53,0),HY(10,0),HY(26,0) ++ vsub HY(52,0),HY(11,0),HY(27,0) ++ vsub HY(51,0),HY(12,0),HY(28,0) ++ vsub HY(50,0),HY(13,0),HY(29,0) ++ vsub HY(49,0),HY(14,0),HY(30,0) ++ vsub HY(48,0),HY(15,0),HY(31,0) ++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, ++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate ++ add r0,r8,32 ++ vsth VX(48,32++),(r0+=r6) REP 16 ++ pop pc ++ ++memclear16: ++ # r0 is address ++ # r1 is number of 16bits values to set to 0 (may overrun past end and clear more than specified) ++ vmov HX(0++,0),0 REP 16 ++ mov r2,32 ++loop: ++ vsth HX(0++,0),(r0+=r2) REP 16 ++ add r0,16*16*2 ++ sub r1,16*16 ++ cmp r1,0 ++ bgt loop ++ b lr ++ ++ ++################################################################################ ++# HEVC VPU Deblock ++# ++# Vertical edges before horizontal ++# Decision 
can change every 4 pixels, but only 8 pixel boundaries are deblocked ++# ++# ARM is responsible for storing beta and tc for each 4 pixels horiz and vert edge. ++# The VPU code works in units of 16x16 blocks. ++# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time). ++# One final horizontal filter is required at the end. ++# PCM is not allowed in this code. ++# ++# ++# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering) ++# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering. ++ ++.set P0,63 ++.set P1,62 ++.set P2,61 ++.set P3,60 ++.set Q0,59 ++.set Q1,58 ++.set Q2,57 ++.set Q3,56 ++ ++.set dp,32 ++.set dq,33 ++.set d,34 ++.set decision,35 ++.set beta,36 ++.set beta2,37 ++.set beta3,38 ++.set ptest,39 ++.set qtest,40 ++.set pqtest,41 ++.set thresh,42 ++.set deltatest, 44 ++.set deltap1, 45 ++.set tc25, 46 ++.set setup,47 ++.set tc,48 ++.set tc25,49 ++.set tc2, 50 ++.set do_filter, 51 ++.set delta, 52 ++.set tc10, 53 ++.set delta0, 54 ++.set delta1, 55 ++.set zeros, 0 ++.set setup_input, 1 ++.set deltaq1, 2 ++ ++ ++ ++# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image. ++# Row has num16 16x16 blocks across ++# Beta goes from 0 to 64 ++# tc goes from 0 to 24 ++# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number] ++# has 8 bytes per edge ++# has 16 bytes per direction ++# has 32 bytes per 16x16 block ++# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4)) ++hevc_deblock_16x16: ++ push r6-r15, lr ++ mov r9,r4 ++ mov r4,r3 ++ mov r13,r2 ++ mov r2,r0 ++ mov r10,r0 ++ subscale4 r0,r1 ++ mov r8,63 ++ mov r6,-3 ++ vmov H(zeros,0),0 ++# r7 is number of blocks still to load ++# r0 is location of current block - 4 * stride ++# r1 is stride ++# r2 is location of current block ++# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical ++# r4 is setup ++# r5 is for temporary calculations ++# r8 holds 63 ++# r6 holds -3 ++# r9 holds the number of 16 high rows to process ++# r10 holds the original img base ++# r11 returns 0 if no filtering was done on the edge ++# r12 saves a copy of this ++# r13 is copy of width ++ ++process_row: ++ # First iteration does not do horizontal filtering on previous ++ mov r7, r13 ++ mov r3,0 ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) # We may wish to prefetch these ++ vstb H(zeros,0),(r4) ++ bl vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 ++ bl vert_filter ++ sub r3,8 ++ b start_deblock_loop ++deblock_loop: ++ # Middle iterations do vertical on current block and horizontal on preceding ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) ++ vstb H(zeros,0),(r4) ++ bl vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl vert_filter ++ sub r3,8 ++ vldb H(setup_input,0), -16(r4) ++ vstb H(zeros,0),-16(r4) ++ bl horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl horz_filter ++ sub r3,8*64 ++ addcmpbeq r12,0,0,skip_save_top ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels 
for the previous block ++skip_save_top: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++start_deblock_loop: ++ # move onto next 16x16 (could do this with circular buffer support instead) ++ add r3,16 ++ and r3,r8 ++ add r4,32 ++ # Perform loop counter operations (may work with an addcmpbgt as well?) ++ add r0,16 ++ add r2,16 ++ sub r7,1 ++ cmp r7,0 # Are there still more blocks to load ++ bgt deblock_loop ++ ++ # Final iteration needs to just do horizontal filtering ++ vldb H(setup_input,0), -16(r4) ++ vstb H(zeros,0),-16(r4) ++ bl horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl horz_filter ++ sub r3,64*8 ++ addcmpbeq r12,0,0,skip_save_top2 ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block ++skip_save_top2: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++ ++# Now look to see if we should do another row ++ sub r9,1 ++ cmp r9,0 ++ bgt start_again ++ pop r6-r15, pc ++start_again: ++ # Need to sort out r0,r2 to point to next row down ++ addscale16 r10,r1 ++ mov r2,r10 ++ subscale4 r0,r2,r1 ++ b process_row ++ ++ ++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered ++# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations ++ ++vert_filter: ++ push lr ++ ++ vmov HX(P3,0), V(16,12)+r3 ++ vmov HX(P2,0), V(16,13)+r3 ++ vmov HX(P1,0), V(16,14)+r3 ++ vmov HX(P0,0), V(16,15)+r3 ++ vmov HX(Q0,0), V(16,16)+r3 ++ vmov HX(Q1,0), V(16,17)+r3 ++ vmov HX(Q2,0), V(16,18)+r3 ++ vmov HX(Q3,0), V(16,19)+r3 ++ ++ bl do_luma_filter ++ ++ vadds V(16,13)+r3, HX(P2,0), 0 ++ vadds V(16,14)+r3, HX(P1,0), 0 ++ vadds V(16,15)+r3, HX(P0,0), 0 ++ # P3 and Q3 never change so don't bother saving back ++ vadds V(16,16)+r3, HX(Q0,0), 0 ++ vadds V(16,17)+r3, HX(Q1,0), 0 ++ vadds V(16,18)+r3, HX(Q2,0), 0 ++ ++ pop pc ++ ++# Filter edge at H(16,0)+r3 ++horz_filter: ++ push lr ++ ++ vmov HX(P3,0), H(12,0)+r3 ++ vmov HX(P2,0), H(13,0)+r3 ++ vmov HX(P1,0), H(14,0)+r3 ++ vmov HX(P0,0), H(15,0)+r3 ++ vmov HX(Q0,0), H(16,0)+r3 ++ vmov HX(Q1,0), H(17,0)+r3 ++ vmov HX(Q2,0), H(18,0)+r3 ++ vmov HX(Q3,0), H(19,0)+r3 ++ ++ bl do_luma_filter ++ ++ vadds H(13,0)+r3, HX(P2,0), 0 ++ vadds H(14,0)+r3, HX(P1,0), 0 ++ vadds H(15,0)+r3, HX(P0,0), 0 ++ # P3 and Q3 never change so don't bother saving back ++ vadds H(16,0)+r3, HX(Q0,0), 0 ++ vadds H(17,0)+r3, HX(Q1,0), 0 ++ vadds H(18,0)+r3, HX(Q2,0), 0 ++ ++ pop pc ++ ++# r4 points to array of beta/tc for each 4 length edge ++do_luma_filter: ++ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8 ++ valtl HX(beta,0),H(setup,0),H(setup,0) ++ valtu HX(tc,0),H(setup,0),H(setup,0) ++ vmul HX(tc25,0), HX(tc,0), 5 ++ vadd HX(tc25,0),HX(tc25,0), 1 ++ vasr HX(tc25,0), HX(tc25,0), 1 ++ ++ # Compute decision ++ vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1 ++ vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1 ++ vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0 ++ vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0 ++ ++ vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1 ++ vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1 ++ vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0 ++ vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0 ++ ++ vadd HX(d,0), HX(dp,0), HX(dq,0) ++ vasr HX(beta2,0),HX(beta,0),2 ++ vasr HX(beta3,0),HX(beta,0),3 ++ ++ # Compute flags that are negative if all conditions pass ++ vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC ++ vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC ++ vsub 
HX(decision,0), HX(decision,0), HX(beta3,0) SETF ++ ++ vdist HX(decision,0), HX(P0,0), HX(Q0,0) IFN ++ vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF ++ vadd HX(decision,0), HX(d,0), HX(d,0) IFN ++ vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF ++ vmov HX(decision,0), 1 IFNN ++ vadd H(decision,0),H(decision,3),0 IFN ++ vadd H(decision,16),H(decision,19),0 IFN ++ vmov -,HX(decision,0) SETF # N marks strong filter ++ vmov HX(decision,0), 1 IFNN # NN marks normal filter ++ ++ vadd HX(do_filter,0), HX(d,3), HX(d,0) ++ vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter ++ vmov HX(decision,0),0 IFNN # Z marks no filter ++ ++ # Expand out decision (currently valid one every 4 pixels) 0...1...2...3 ++ # First extract out even terms ++ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0.1.2.3 ++ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0123 ++ # Now expand back ++ valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233 ++ valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333 ++ ++ # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering ++ ++ # Do a quick check to see if there is anything to do ++ mov r11, 0 # Signal no filtering ++ vmov -,1 IFNZ SUMS r5 ++ cmp r5,0 ++ beq filtering_done ++ mov r11, 1 # Signal some filtering ++ # And whether there is any strong filtering ++ vmov -,1 IFN SUMS r5 ++ cmp r5,0 ++ beq normal_filtering ++ ++ ############################################################################## ++ # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!) ++ vshl HX(tc2,0), HX(tc,0), 1 # Note that in normal filtering tx2 is tc/2, while here it is tc*2 ++ ++ # Take a copy of the original pixels for use in decision calculation ++ vmov HX(P0,32),HX(P0,0) ++ vmov HX(Q0,32),HX(Q0,0) ++ vmov HX(P1,32),HX(P1,0) ++ vmov HX(Q1,32),HX(Q1,0) ++ vmov HX(P2,32),HX(P2,0) ++ vmov HX(Q2,32),HX(Q2,0) ++ ++ vadd -,HX(P2,32),4 CLRA SACC ++ vshl -,HX(P1,32),1 SACC ++ vshl -,HX(P0,32),1 SACC ++ vshl -,HX(Q0,32),1 SACC ++ vshl HX(delta,0),HX(Q1,32),0 SACC ++ vasr HX(delta,0),HX(delta,0), 3 ++ vsub HX(delta,0),HX(delta,0),HX(P0,32) ++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) ++ vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN ++ ++ vadd -,HX(P2,32),2 CLRA SACC ++ vadd -,HX(P1,32),HX(P0,32) SACC ++ vshl HX(delta,0),HX(Q0,32),0 SACC ++ vasr HX(delta,0),HX(delta,0), 2 ++ vsub HX(delta,0),HX(delta,0),HX(P1,32) ++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) ++ vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN ++ ++ vadd -,HX(Q0,32),4 CLRA SACC ++ vadd -,HX(P1,32),HX(P0,32) SACC ++ vmul -,HX(P2,32),3 SACC ++ vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct ++ vasr HX(delta,0),HX(delta,0), 3 ++ vsub HX(delta,0),HX(delta,0),HX(P2,32) ++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) ++ vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN ++ #vmov HX(P2,0),3 IFN ++ ++ # Now reverse all P/Qs ++ ++ vadd -,HX(Q2,32),4 CLRA SACC ++ vshl -,HX(Q1,32),1 SACC ++ vshl -,HX(Q0,32),1 SACC ++ vshl -,HX(P0,32),1 SACC ++ vshl HX(delta,0),HX(P1,32),0 SACC ++ vasr HX(delta,0),HX(delta,0), 3 ++ vsub HX(delta,0),HX(delta,0),HX(Q0,32) ++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) ++ vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN ++ ++ vadd -,HX(Q2,32),2 CLRA SACC ++ vadd -,HX(Q1,32),HX(Q0,32) SACC ++ vshl HX(delta,0),HX(P0,32),0 SACC ++ vasr HX(delta,0),HX(delta,0), 2 ++ vsub HX(delta,0),HX(delta,0),HX(Q1,32) ++ vclamps HX(delta,0), HX(delta,0), 
HX(tc2,0) ++ vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN ++ ++ vadd -,HX(P0,32),4 CLRA SACC ++ vadd -,HX(Q1,32),HX(Q0,32) SACC ++ vmul -,HX(Q2,32),3 SACC ++ vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct ++ vasr HX(delta,0),HX(delta,0), 3 ++ vsub HX(delta,0),HX(delta,0),HX(Q2,32) ++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) ++ vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN ++ ++ ############################################################################## ++ # Normal filtering ++normal_filtering: ++ # Invert the decision flags ++ # make instruction more complicated as assembler has error and loses SETF ++ vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering ++ vmov -, HX(tc10,0) SETF # IFN means normal filtering ++ ++ vmov -,1 IFN SUMS r5 ++ cmp r5,0 ++ beq filtering_done ++ ++ vasr HX(tc2,0), HX(tc,0), 1 ++ vmul HX(tc10,0), HX(tc,0), 10 ++ ++ vasr HX(thresh,0), HX(beta,0), 1 ++ vadd HX(thresh,0), HX(thresh,0), HX(beta,0) ++ vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC ++ ++ vadd HX(ptest,0),HX(dp,3),HX(dp,0) ++ vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel ++ vadd HX(qtest,0),HX(dq,3),HX(dq,0) ++ vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel ++ # Expand ptest and qtest together ++ vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0) # p.p.p.p.q.q.q.q ++ vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........ ++ valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq ++ valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0) ++ valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0) ++ ++ vsub HX(delta0,0), HX(Q0,0), HX(P0,0) ++ vsub HX(delta1,0), HX(Q1,0), HX(P1,0) ++ vmov -,8 CLRA SACC ++ vmul -,HX(delta0,0), 9 SACC ++ vmul HX(delta0,0),HX(delta1,0), r6 SACC ++ vasr HX(delta0,0), HX(delta0,0), 4 ++ vdist HX(deltatest,0), HX(delta0,0), 0 ++ vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something ++ vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later ++ ++ vclamps HX(delta0,0), HX(delta0,0), HX(tc,0) ++ ++ vadd HX(deltap1,0), HX(P2,0), HX(P0,0) ++ vadd HX(deltap1,0), HX(deltap1,0), 1 ++ vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC ++ vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC ++ vasr HX(deltap1,0), HX(deltap1,0), 1 ++ vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0) ++ ++ vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0) ++ vadd HX(deltaq1,0), HX(deltaq1,0), 1 ++ vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC ++ vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0) ++ vrsub -, HX(delta0,0), 0 SACC ++ vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC ++ vasr HX(deltaq1,0), HX(deltaq1,0), 1 ++ vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0) ++ ++ vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN ++ vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN ++ ++ vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1 ++ vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN ++ ++ vmov -,HX(deltatest,0) SETF ++ vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1 ++ vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN ++ ++ #vmov HX(P2,0),1 IFN ++ ++filtering_done: ++ b lr ++ ++ ++hevc_uv_deblock_16x16: ++ push r6-r15, lr ++ mov r14,0 ++ b hevc_uv_start ++hevc_uv_deblock_16x16_with_clear: ++ push r6-r15, lr ++ mov r14,1 ++ b hevc_uv_start ++ ++hevc_uv_start: ++ mov r9,r4 ++ mov r4,r3 ++ mov r13,r2 ++ mov r2,r0 ++ mov r10,r0 ++ subscale4 r0,r1 ++ mov r8,63 ++ mov r6,-3 ++ vmov H(zeros,0),0 ++# r7 is number of blocks still to load ++# r0 is 
location of current block - 4 * stride ++# r1 is stride ++# r2 is location of current block ++# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical ++# r4 is setup ++# r5 is for temporary calculations ++# r8 holds 63 ++# r6 holds -3 ++# r9 holds the number of 16 high rows to process ++# r10 holds the original img base ++# r11 returns 0 if no filtering was done on the edge ++# r12 saves a copy of this ++# r13 is copy of width ++# r14 is 1 if we should clear the old contents, or 0 if not ++ ++uv_process_row: ++ # First iteration does not do horizontal filtering on previous ++ mov r7, r13 ++ mov r3,0 ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) # We may wish to prefetch these ++ cmp r14,1 ++ bne uv_skip0 ++ vadd H(setup_input,0),H(setup_input,4),0 # Rotate by 4 to access V strengths ++ vstb H(zeros,0),(r4) ++uv_skip0: ++ bl uv_vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 ++ bl uv_vert_filter ++ sub r3,8 ++ b uv_start_deblock_loop ++uv_deblock_loop: ++ # Middle iterations do vertical on current block and horizontal on preceding ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) ++ cmp r14,1 ++ bne uv_skip1 ++ vadd H(setup_input,0),H(setup_input,4),0 # Rotate by 4 to access V strengths ++ vstb H(zeros,0),(r4) ++uv_skip1: ++ bl uv_vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl uv_vert_filter ++ sub r3,8 ++ vldb H(setup_input,0), -16(r4) ++ cmp r14,1 ++ bne uv_skip3 ++ vadd H(setup_input,0),H(setup_input,4),0 # Rotate by 4 to access V strengths ++ vstb H(zeros,0),-16(r4) ++uv_skip3: ++ bl uv_horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl uv_horz_filter ++ sub r3,8*64 ++ addcmpbeq r12,0,0,uv_skip_save_top ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block ++uv_skip_save_top: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++uv_start_deblock_loop: ++ # move onto next 16x16 (could do this with circular buffer support instead) ++ add r3,16 ++ and r3,r8 ++ add r4,32 ++ # Perform loop counter operations (may work with an addcmpbgt as well?) 
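++ # (e.g. a single fused addcmpbgt r7,-1,0,uv_deblock_loop - the op already used by the transform loops above - could replace the sub/cmp/bgt sequence below)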
++ add r0,16 ++ add r2,16 ++ sub r7,1 ++ cmp r7,0 # Are there still more blocks to load ++ bgt uv_deblock_loop ++ ++ # Final iteration needs to just do horizontal filtering ++ vldb H(setup_input,0), -16(r4) ++ cmp r14,1 ++ bne uv_skip2 ++ vadd H(setup_input,0),H(setup_input,4),0 # Rotate by 4 to access V strengths ++ vstb H(zeros,0),-16(r4) ++uv_skip2: ++ bl uv_horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl uv_horz_filter ++ sub r3,64*8 ++ addcmpbeq r12,0,0,uv_skip_save_top2 ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block ++uv_skip_save_top2: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++ ++# Now look to see if we should do another row ++ sub r9,1 ++ cmp r9,0 ++ bgt uv_start_again ++ pop r6-r15, pc ++uv_start_again: ++ # Need to sort out r0,r2 to point to next row down ++ addscale16 r10,r1 ++ mov r2,r10 ++ subscale4 r0,r2,r1 ++ b uv_process_row ++ ++ ++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered ++# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations ++ ++uv_vert_filter: ++ push lr ++ ++ vmov HX(P1,0), V(16,14)+r3 ++ vmov HX(P0,0), V(16,15)+r3 ++ vmov HX(Q0,0), V(16,16)+r3 ++ vmov HX(Q1,0), V(16,17)+r3 ++ ++ bl do_chroma_filter ++ ++ vadds V(16,15)+r3, HX(P0,0), 0 ++ vadds V(16,16)+r3, HX(Q0,0), 0 ++ ++ pop pc ++ ++# Filter edge at H(16,0)+r3 ++uv_horz_filter: ++ push lr ++ ++ vmov HX(P1,0), H(14,0)+r3 ++ vmov HX(P0,0), H(15,0)+r3 ++ vmov HX(Q0,0), H(16,0)+r3 ++ vmov HX(Q1,0), H(17,0)+r3 ++ ++ bl do_chroma_filter ++ ++ vadds H(15,0)+r3, HX(P0,0), 0 ++ # P3 and Q3 never change so don't bother saving back ++ vadds H(16,0)+r3, HX(Q0,0), 0 ++ ++ pop pc ++ ++# r4 points to array of beta/tc for each 4 length edge ++do_chroma_filter: ++ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8 ++ valtl HX(tc,0),H(setup,0),H(setup,0) ++ ++ vsub HX(delta,0),HX(Q0,0),HX(P0,0) ++ vshl HX(delta,0),HX(delta,0),2 CLRA SACC ++ vsub -,HX(P1,0),HX(Q1,0) SACC ++ vmov HX(delta,0),4 SACC ++ vasr HX(delta,0),HX(delta,0),3 ++ vclamps HX(delta,0), HX(delta,0), HX(tc,0) ++ vadd HX(P0,0),HX(P0,0),HX(delta,0) ++ vsub HX(Q0,0),HX(Q0,0),HX(delta,0) ++ b lr ++ ++# r0 = list ++# r1 = number ++hevc_run_command_list: ++ push r6-r7, lr ++ mov r6, r0 ++ mov r7, r1 ++loop_cmds: ++ ld r0,(r6) # How to encode r6++? 
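++ # (a post-incrementing load, if the scalar ISA offers one, would fold the add r6,4 into each ld; explicit ld/add pairs are kept below since the encoding is unclear)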
++ add r6,4 ++ ld r1,(r6) ++ add r6,4 ++ ld r2,(r6) ++ add r6,4 ++ ld r3,(r6) ++ add r6,4 ++ ld r4,(r6) ++ add r6,4 ++ ld r5,(r6) ++ add r6,4 ++ bl hevc_trans_16x16 ++ sub r7,1 ++ cmp r7,0 ++ bgt loop_cmds ++ ++ pop r6-r7, pc +diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h +new file mode 100644 +index 0000000000..ee4e357f38 +--- /dev/null ++++ b/libavcodec/rpi_hevc_transform10.h +@@ -0,0 +1,3110 @@ ++static const unsigned char rpi_hevc_transform10 [] = { ++21, ++106, ++0, ++144, ++47, ++1, ++37, ++106, ++0, ++144, ++66, ++1, ++53, ++106, ++0, ++144, ++192, ++4, ++69, ++106, ++0, ++144, ++192, ++4, ++85, ++106, ++0, ++144, ++240, ++5, ++169, ++3, ++62, ++64, ++79, ++64, ++3, ++232, ++32, ++0, ++0, ++0, ++12, ++248, ++0, ++136, ++0, ++0, ++192, ++248, ++0, ++0, ++64, ++232, ++0, ++2, ++0, ++0, ++12, ++248, ++0, ++168, ++0, ++0, ++192, ++248, ++0, ++0, ++0, ++96, ++3, ++232, ++32, ++0, ++0, ++0, ++7, ++232, ++0, ++2, ++0, ++0, ++8, ++232, ++0, ++4, ++0, ++0, ++12, ++248, ++0, ++128, ++0, ++0, ++192, ++8, ++4, ++0, ++4, ++232, ++64, ++0, ++0, ++0, ++5, ++232, ++0, ++2, ++0, ++0, ++128, ++69, ++113, ++66, ++12, ++248, ++0, ++128, ++0, ++0, ++192, ++8, ++4, ++0, ++128, ++69, ++113, ++70, ++128, ++144, ++40, ++0, ++4, ++255, ++48, ++192, ++128, ++3, ++32, ++8, ++16, ++0, ++76, ++254, ++48, ++192, ++9, ++4, ++32, ++8, ++0, ++0, ++4, ++254, ++0, ++144, ++128, ++2, ++0, ++8, ++2, ++0, ++128, ++144, ++23, ++0, ++4, ++255, ++48, ++192, ++128, ++3, ++32, ++8, ++20, ++0, ++76, ++254, ++48, ++192, ++6, ++4, ++32, ++8, ++0, ++0, ++140, ++248, ++44, ++0, ++0, ++0, ++32, ++48, ++4, ++0, ++128, ++69, ++113, ++66, ++242, ++140, ++211, ++192, ++34, ++31, ++41, ++3, ++70, ++192, ++80, ++7, ++164, ++255, ++36, ++204, ++96, ++2, ++0, ++248, ++62, ++0, ++3, ++255, ++55, ++208, ++120, ++3, ++224, ++3, ++190, ++11, ++16, ++139, ++246, ++91, ++0, ++103, ++90, ++0, ++70, ++192, ++80, ++7, ++164, ++255, ++36, ++204, ++224, ++2, ++0, ++248, ++62, ++0, ++3, ++255, ++55, ++208, ++120, ++3, ++224, ++3, ++190, ++11, ++16, ++139, ++246, ++91, ++0, ++103, ++90, ++0, ++225, ++64, ++242, ++64, ++3, ++232, ++128, ++0, ++0, ++0, ++7, ++232, ++0, ++2, ++0, ++0, ++57, ++239, ++224, ++247, ++255, ++255, ++72, ++192, ++95, ++207, ++88, ++122, ++88, ++124, ++137, ++64, ++26, ++64, ++4, ++232, ++64, ++0, ++0, ++0, ++149, ++96, ++161, ++64, ++152, ++64, ++128, ++144, ++35, ++0, ++72, ++232, ++0, ++4, ++0, ++0, ++65, ++232, ++32, ++0, ++0, ++0, ++128, ++144, ++27, ++0, ++4, ++232, ++0, ++2, ++0, ++0, ++101, ++96, ++145, ++64, ++168, ++64, ++128, ++144, ++19, ++0, ++72, ++232, ++0, ++4, ++0, ++0, ++65, ++232, ++32, ++0, ++0, ++0, ++128, ++144, ++11, ++0, ++74, ++232, ++0, ++8, ++0, ++0, ++242, ++140, ++221, ++192, ++57, ++239, ++32, ++8, ++0, ++0, ++41, ++3, ++239, ++3, ++12, ++248, ++0, ++128, ++0, ++0, ++192, ++248, ++4, ++0, ++12, ++248, ++0, ++132, ++64, ++0, ++192, ++248, ++4, ++0, ++0, ++96, ++255, ++159, ++154, ++255, ++0, ++232, ++0, ++4, ++0, ++0, ++255, ++159, ++165, ++255, ++4, ++255, ++48, ++204, ++16, ++3, ++224, ++251, ++62, ++0, ++4, ++255, ++51, ++204, ++128, ++3, ++224, ++251, ++16, ++0, ++76, ++254, ++51, ++204, ++128, ++3, ++224, ++251, ++20, ++0, ++128, ++64, ++6, ++232, ++64, ++0, ++0, ++0, ++140, ++248, ++47, ++0, ++0, ++0, ++224, ++99, ++0, ++0, ++32, ++247, ++240, ++207, ++16, ++3, ++32, ++247, ++176, ++207, ++17, ++19, ++32, ++247, ++112, ++207, ++18, ++35, ++32, ++247, ++48, ++207, ++19, ++51, ++32, ++247, ++240, ++206, ++20, ++67, ++32, ++247, ++176, ++206, ++21, ++83, ++32, 
++247, ++112, ++206, ++22, ++99, ++32, ++247, ++48, ++206, ++23, ++115, ++32, ++247, ++240, ++205, ++24, ++131, ++32, ++247, ++176, ++205, ++25, ++147, ++32, ++247, ++112, ++205, ++26, ++163, ++32, ++247, ++48, ++205, ++27, ++179, ++32, ++247, ++240, ++204, ++28, ++195, ++32, ++247, ++176, ++204, ++29, ++211, ++32, ++247, ++112, ++204, ++30, ++227, ++32, ++247, ++48, ++204, ++31, ++243, ++4, ++255, ++51, ++204, ++128, ++3, ++224, ++251, ++16, ++0, ++76, ++254, ++51, ++204, ++128, ++3, ++224, ++251, ++20, ++0, ++0, ++237, ++32, ++0, ++0, ++0, ++140, ++248, ++47, ++0, ++0, ++0, ++224, ++99, ++0, ++0, ++111, ++3, ++4, ++254, ++0, ++128, ++0, ++4, ++0, ++248, ++0, ++0, ++2, ++232, ++32, ++0, ++0, ++0, ++140, ++248, ++32, ++0, ++0, ++0, ++224, ++35, ++0, ++0, ++64, ++232, ++0, ++2, ++0, ++0, ++193, ++232, ++0, ++1, ++0, ++0, ++1, ++106, ++116, ++30, ++90, ++0, ++169, ++3, ++73, ++64, ++52, ++64, ++45, ++64, ++2, ++64, ++10, ++64, ++64, ++198, ++1, ++7, ++8, ++232, ++63, ++0, ++0, ++0, ++6, ++232, ++253, ++255, ++255, ++255, ++0, ++246, ++0, ++0, ++0, ++4, ++215, ++64, ++3, ++96, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++137, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++129, ++0, ++131, ++102, ++0, ++158, ++67, ++0, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++108, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++100, ++0, ++131, ++102, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++161, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++150, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++3, ++99, ++131, ++71, ++68, ++232, ++32, ++0, ++0, ++0, ++0, ++99, ++2, ++99, ++23, ++102, ++7, ++106, ++127, ++156, ++182, ++255, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++112, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++101, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++25, ++102, ++9, ++106, ++2, ++30, ++41, ++3, ++26, ++87, ++162, ++64, ++64, ++198, ++1, ++23, ++127, ++158, ++103, ++255, ++239, ++3, ++0, ++254, ++0, ++143, ++92, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++143, ++93, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++143, ++94, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++95, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++208, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++209, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++142, ++210, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++0, ++142, ++211, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++107, ++0, ++8, ++255, ++99, ++23, ++0, ++212, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++23, ++0, ++228, 
++192, ++51, ++0, ++0, ++8, ++255, ++227, ++23, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++52, ++0, ++180, ++192, ++51, ++0, ++0, ++8, ++255, ++99, ++52, ++0, ++164, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++52, ++0, ++148, ++192, ++51, ++0, ++0, ++111, ++3, ++239, ++3, ++0, ++254, ++0, ++143, ++12, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++143, ++13, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++143, ++14, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++15, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++16, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++17, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++142, ++18, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++0, ++142, ++19, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++33, ++0, ++8, ++255, ++99, ++3, ++0, ++212, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++3, ++0, ++228, ++192, ++51, ++0, ++0, ++8, ++255, ++227, ++3, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++4, ++0, ++180, ++192, ++51, ++0, ++0, ++8, ++255, ++99, ++4, ++0, ++164, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++4, ++0, ++148, ++192, ++51, ++0, ++0, ++111, ++3, ++32, ++246, ++192, ++11, ++1, ++16, ++32, ++246, ++2, ++137, ++47, ++240, ++40, ++246, ++2, ++140, ++47, ++240, ++128, ++245, ++99, ++140, ++5, ++4, ++0, ++247, ++99, ++140, ++1, ++20, ++88, ++246, ++99, ++140, ++1, ++20, ++0, ++247, ++35, ++136, ++62, ++226, ++32, ++247, ++35, ++136, ++32, ++210, ++0, ++247, ++34, ++136, ++63, ++2, ++208, ++246, ++34, ++136, ++0, ++4, ++0, ++247, ++99, ++136, ++58, ++162, ++32, ++247, ++99, ++136, ++33, ++146, ++0, ++247, ++98, ++136, ++59, ++18, ++208, ++246, ++98, ++136, ++0, ++20, ++0, ++247, ++162, ++136, ++33, ++2, ++88, ++246, ++98, ++137, ++2, ++68, ++88, ++246, ++162, ++137, ++3, ++68, ++208, ++254, ++227, ++136, ++60, ++242, ++192, ++243, ++188, ++11, ++208, ++254, ++227, ++136, ++56, ++178, ++192, ++243, ++188, ++10, ++32, ++255, ++226, ++136, ++38, ++58, ++192, ++243, ++60, ++0, ++208, ++254, ++227, ++136, ++59, ++242, ++192, ++243, ++60, ++128, ++32, ++255, ++226, ++136, ++49, ++58, ++192, ++243, ++60, ++128, ++0, ++255, ++226, ++136, ++34, ++34, ++192, ++243, ++60, ++128, ++32, ++255, ++226, ++136, ++37, ++58, ++192, ++243, ++60, ++128, ++0, ++254, ++192, ++136, ++1, ++4, ++0, ++240, ++0, ++160, ++0, ++255, ++194, ++8, ++0, ++52, ++195, ++243, ++0, ++128, ++0, ++255, ++202, ++40, ++0, ++52, ++195, ++243, ++0, ++128, ++0, ++254, ++0, ++240, ++35, ++10, ++0, ++240, ++60, ++0, ++0, ++254, ++192, ++136, ++1, ++4, ++0, ++240, ++0, ++160, ++0, ++255, ++226, ++140, ++34, ++34, ++195, ++243, ++60, ++0, ++32, ++255, ++227, ++140, ++36, ++58, ++192, ++243, ++60, ++0, ++0, ++254, ++192, ++136, ++0, ++4, ++0, ++240, ++0, ++160, ++16, ++246, ++226, ++136, ++35, ++50, ++16, ++246, ++226, ++136, ++35, ++50, ++32, ++246, ++226, ++136, ++35, ++50, ++32, ++254, ++226, ++136, ++35, ++58, ++192, ++243, ++60, ++0, ++11, ++96, ++0, ++254, ++0, ++240, ++1, ++4, ++0, ++240, ++64, ++115, ++5, ++106, ++0, ++144, ++173, ++1, ++27, ++96, ++0, ++254, ++0, ++240, ++1, ++4, ++0, ++240, ++64, ++147, ++5, ++106, ++0, ++144, ++227, ++0, ++64, ++246, ++163, ++140, ++1, ++4, ++0, ++246, ++192, ++175, ++63, ++2, ++0, ++246, ++192, ++174, ++59, ++2, ++0, ++246, ++128, ++175, ++62, ++2, ++0, ++246, ++128, ++174, ++58, ++2, ++0, ++246, ++64, ++175, ++61, ++2, ++0, ++246, ++64, ++174, ++57, ++2, ++0, ++255, ++43, ++240, ++4, ++212, ++192, ++243, ++128, ++11, ++64, ++254, ++43, ++240, ++1, ++228, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, 
++244, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, ++180, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++141, ++0, ++164, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++191, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++235, ++143, ++52, ++242, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++2, ++212, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++191, ++226, ++192, ++243, ++188, ++10, ++64, ++254, ++43, ++141, ++0, ++180, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++2, ++68, ++32, ++247, ++35, ++141, ++190, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++171, ++143, ++52, ++226, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++4, ++180, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++191, ++226, ++192, ++243, ++188, ++10, ++128, ++253, ++43, ++240, ++3, ++212, ++192, ++243, ++128, ++10, ++64, ++254, ++35, ++141, ++1, ++196, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++189, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++107, ++143, ++52, ++210, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++4, ++148, ++192, ++243, ++128, ++11, ++64, ++254, ++43, ++240, ++1, ++164, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, ++180, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, ++244, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++141, ++0, ++228, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++187, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++235, ++142, ++52, ++178, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++2, ++148, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++187, ++162, ++192, ++243, ++188, ++10, ++64, ++254, ++43, ++141, ++0, ++244, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++2, ++68, ++32, ++247, ++35, ++141, ++186, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++171, ++142, ++52, ++162, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++4, ++244, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++187, ++162, ++192, ++243, ++188, ++10, ++128, ++253, ++43, ++240, ++3, ++148, ++192, ++243, ++128, ++10, ++64, ++254, ++35, ++141, ++1, ++132, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++185, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++107, ++142, ++52, ++146, ++192, ++243, ++60, ++128, ++64, ++255, ++98, ++141, ++0, ++52, ++192, ++243, ++0, ++0, ++0, ++254, ++0, ++240, ++53, ++10, ++0, ++240, ++60, ++0, ++0, ++254, ++0, ++240, ++1, ++4, ++0, ++240, ++64, ++147, ++5, ++106, ++0, ++144, ++177, ++0, ++88, ++246, ++163, ++140, ++1, ++4, ++128, ++245, ++99, ++141, ++10, ++4, ++88, ++246, ++162, ++138, ++1, ++68, ++0, ++247, ++162, ++138, ++36, ++162, ++88, ++254, ++162, ++138, ++3, ++164, ++192, ++243, ++128, ++11, ++0, ++255, ++226, ++137, ++32, ++2, ++195, ++243, ++60, ++0, ++32, ++247, ++226, ++137, ++42, ++114, ++0, ++255, ++34, ++138, ++33, ++18, ++195, ++243, ++60, ++0, ++32, ++247, ++34, ++138, ++42, ++130, ++16, ++246, ++98, ++138, ++40, ++114, ++16, ++246, ++98, ++138, ++41, ++146, ++32, ++246, ++98, ++138, ++41, ++146, ++32, ++246, ++226, ++137, ++41, ++146, ++40, ++246, ++34, ++138, ++41, ++146, ++32, ++247, ++163, ++141, ++63, ++178, ++32, ++247, ++227, ++141, ++62, ++162, ++0, ++254, ++0, ++240, ++8, ++4, ++0, ++240, ++128, ++11, ++128, ++253, ++35, ++240, ++9, ++100, ++192, ++243, ++128, ++10, ++128, ++253, ++163, ++141, 
++128, ++115, ++192, ++243, ++152, ++10, ++88, ++246, ++163, ++141, ++4, ++100, ++208, ++246, ++35, ++139, ++0, ++100, ++32, ++255, ++34, ++139, ++53, ++202, ++192, ++243, ++60, ++128, ++0, ++254, ++0, ++139, ++0, ++4, ++0, ++240, ++0, ++160, ++240, ++246, ++163, ++141, ++48, ++98, ++0, ++247, ++99, ++139, ++63, ++210, ++0, ++247, ++98, ++139, ++1, ++212, ++88, ++254, ++98, ++139, ++1, ++212, ++192, ++243, ++128, ++11, ++32, ++255, ++99, ++139, ++62, ++98, ++192, ++243, ++188, ++10, ++88, ++246, ++98, ++139, ++1, ++212, ++240, ++246, ++98, ++139, ++50, ++210, ++0, ++247, ++163, ++128, ++59, ++146, ++0, ++247, ++160, ++128, ++1, ++36, ++88, ++254, ++160, ++128, ++1, ++36, ++192, ++243, ++128, ++11, ++0, ++247, ++163, ++128, ++58, ++98, ++64, ++255, ++35, ++240, ++0, ++100, ++192, ++243, ++128, ++10, ++64, ++255, ++163, ++128, ++0, ++164, ++192, ++243, ++128, ++10, ++88, ++246, ++160, ++128, ++1, ++36, ++240, ++246, ++160, ++128, ++50, ++34, ++8, ++255, ++227, ++143, ++54, ++242, ++192, ++243, ++60, ++128, ++40, ++255, ++227, ++142, ++54, ++178, ++192, ++243, ++60, ++128, ++0, ++254, ++0, ++240, ++39, ++10, ++0, ++240, ++60, ++128, ++8, ++255, ++163, ++143, ++45, ++226, ++192, ++243, ++60, ++128, ++0, ++254, ++0, ++240, ++44, ++10, ++0, ++240, ++60, ++0, ++0, ++254, ++0, ++240, ++40, ++10, ++0, ++240, ++60, ++128, ++8, ++255, ++163, ++142, ++2, ++162, ++192, ++243, ++60, ++128, ++90, ++0, ++169, ++3, ++14, ++96, ++4, ++31, ++169, ++3, ++30, ++96, ++1, ++31, ++73, ++64, ++52, ++64, ++45, ++64, ++2, ++64, ++10, ++64, ++64, ++198, ++1, ++7, ++8, ++232, ++63, ++0, ++0, ++0, ++6, ++232, ++253, ++255, ++255, ++255, ++0, ++246, ++0, ++0, ++0, ++4, ++215, ++64, ++3, ++96, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++30, ++106, ++137, ++24, ++0, ++255, ++64, ++0, ++0, ++20, ++196, ++243, ++0, ++0, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++158, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++150, ++0, ++131, ++102, ++0, ++158, ++81, ++0, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++30, ++106, ++137, ++24, ++0, ++255, ++64, ++0, ++0, ++20, ++196, ++243, ++0, ++0, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++122, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++114, ++0, ++131, ++102, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++30, ++106, ++139, ++24, ++0, ++255, ++64, ++0, ++0, ++20, ++196, ++243, ++0, ++0, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++128, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++117, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++3, ++99, ++131, ++71, ++68, ++232, ++32, ++0, ++0, ++0, ++0, ++99, ++2, ++99, ++23, ++102, ++7, ++106, ++127, ++156, ++168, ++255, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++30, ++106, ++139, ++24, ++0, ++255, ++64, ++0, ++0, ++20, ++196, ++243, ++0, ++0, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++72, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, 
++61, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++25, ++102, ++9, ++106, ++2, ++30, ++41, ++3, ++26, ++87, ++162, ++64, ++64, ++198, ++1, ++23, ++127, ++158, ++75, ++255, ++239, ++3, ++0, ++254, ++128, ++143, ++94, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++95, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++208, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++209, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++47, ++0, ++8, ++255, ++227, ++23, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++52, ++0, ++180, ++192, ++51, ++0, ++0, ++111, ++3, ++239, ++3, ++0, ++254, ++128, ++143, ++14, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++15, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++16, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++17, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++13, ++0, ++8, ++255, ++227, ++3, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++4, ++0, ++180, ++192, ++51, ++0, ++0, ++111, ++3, ++32, ++246, ++192, ++11, ++1, ++16, ++32, ++246, ++2, ++140, ++47, ++240, ++32, ++247, ++35, ++141, ++63, ++178, ++64, ++254, ++35, ++141, ++2, ++68, ++192, ++243, ++128, ++11, ++32, ++255, ++35, ++240, ++58, ++226, ++192, ++243, ++188, ++10, ++0, ++254, ++0, ++141, ++4, ++4, ++0, ++240, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++240, ++246, ++35, ++141, ++48, ++66, ++0, ++247, ++227, ++143, ++52, ++242, ++32, ++247, ++227, ++142, ++52, ++178, ++90, ++0, ++161, ++3, ++6, ++64, ++23, ++64, ++96, ++8, ++70, ++98, ++97, ++8, ++70, ++98, ++98, ++8, ++70, ++98, ++99, ++8, ++70, ++98, ++100, ++8, ++70, ++98, ++101, ++8, ++70, ++98, ++255, ++159, ++244, ++249, ++23, ++102, ++7, ++106, ++112, ++30, ++33, ++3, ++}; +diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h +new file mode 100644 +index 0000000000..56d5206827 +--- /dev/null ++++ b/libavcodec/rpi_hevc_transform8.h +@@ -0,0 +1,3110 @@ ++static const unsigned char rpi_hevc_transform8 [] = { ++21, ++106, ++0, ++144, ++47, ++1, ++37, ++106, ++0, ++144, ++66, ++1, ++53, ++106, ++0, ++144, ++192, ++4, ++69, ++106, ++0, ++144, ++192, ++4, ++85, ++106, ++0, ++144, ++240, ++5, ++169, ++3, ++62, ++64, ++79, ++64, ++3, ++232, ++32, ++0, ++0, ++0, ++12, ++248, ++0, ++136, ++0, ++0, ++192, ++248, ++0, ++0, ++64, ++232, ++0, ++2, ++0, ++0, ++12, ++248, ++0, ++168, ++0, ++0, ++192, ++248, ++0, ++0, ++0, ++96, ++3, ++232, ++32, ++0, ++0, ++0, ++7, ++232, ++0, ++2, ++0, ++0, ++8, ++232, ++0, ++4, ++0, ++0, ++12, ++248, ++0, ++128, ++0, ++0, ++192, ++8, ++4, ++0, ++4, ++232, ++64, ++0, ++0, ++0, ++5, ++232, ++0, ++8, ++0, ++0, ++128, ++69, ++113, ++66, ++12, ++248, ++0, ++128, ++0, ++0, ++192, ++8, ++4, ++0, ++128, ++69, ++113, ++70, ++128, ++144, ++40, ++0, ++4, ++255, ++48, ++192, ++128, ++3, ++32, ++8, ++16, ++0, ++76, ++254, ++48, ++192, ++9, ++4, ++32, ++8, ++0, ++0, ++4, ++254, ++0, ++144, ++128, ++2, ++0, ++8, ++2, ++0, ++128, ++144, ++23, ++0, ++4, ++255, ++48, ++192, ++128, ++3, ++32, ++8, ++20, ++0, ++76, ++254, ++48, ++192, ++4, ++4, ++32, ++8, ++0, ++0, ++140, ++248, ++44, ++0, ++0, ++0, ++32, ++48, ++4, ++0, ++128, ++69, ++113, ++66, ++242, ++140, ++211, ++192, ++34, ++31, ++41, ++3, ++70, ++192, ++80, ++7, ++164, ++255, ++36, ++204, ++96, ++2, ++0, ++248, ++62, ++0, ++3, ++255, ++55, ++208, ++120, ++3, ++224, ++3, ++190, ++11, ++16, ++139, ++246, ++91, ++0, ++103, ++90, ++0, ++70, ++192, ++80, 
++7, ++164, ++255, ++36, ++204, ++224, ++2, ++0, ++248, ++62, ++0, ++3, ++255, ++55, ++208, ++120, ++3, ++224, ++3, ++190, ++11, ++16, ++139, ++246, ++91, ++0, ++103, ++90, ++0, ++225, ++64, ++242, ++64, ++3, ++232, ++128, ++0, ++0, ++0, ++7, ++232, ++0, ++2, ++0, ++0, ++57, ++239, ++224, ++247, ++255, ++255, ++72, ++192, ++95, ++207, ++88, ++122, ++88, ++124, ++137, ++64, ++26, ++64, ++4, ++232, ++64, ++0, ++0, ++0, ++149, ++96, ++161, ++64, ++152, ++64, ++128, ++144, ++35, ++0, ++72, ++232, ++0, ++4, ++0, ++0, ++65, ++232, ++32, ++0, ++0, ++0, ++128, ++144, ++27, ++0, ++4, ++232, ++0, ++8, ++0, ++0, ++69, ++96, ++145, ++64, ++168, ++64, ++128, ++144, ++19, ++0, ++72, ++232, ++0, ++4, ++0, ++0, ++65, ++232, ++32, ++0, ++0, ++0, ++128, ++144, ++11, ++0, ++74, ++232, ++0, ++8, ++0, ++0, ++242, ++140, ++221, ++192, ++57, ++239, ++32, ++8, ++0, ++0, ++41, ++3, ++239, ++3, ++12, ++248, ++0, ++128, ++0, ++0, ++192, ++248, ++4, ++0, ++12, ++248, ++0, ++132, ++64, ++0, ++192, ++248, ++4, ++0, ++0, ++96, ++255, ++159, ++154, ++255, ++0, ++232, ++0, ++4, ++0, ++0, ++255, ++159, ++165, ++255, ++4, ++255, ++48, ++204, ++16, ++3, ++224, ++251, ++62, ++0, ++4, ++255, ++51, ++204, ++128, ++3, ++224, ++251, ++16, ++0, ++76, ++254, ++51, ++204, ++128, ++3, ++224, ++251, ++20, ++0, ++128, ++64, ++6, ++232, ++64, ++0, ++0, ++0, ++140, ++248, ++47, ++0, ++0, ++0, ++224, ++99, ++0, ++0, ++32, ++247, ++240, ++207, ++16, ++3, ++32, ++247, ++176, ++207, ++17, ++19, ++32, ++247, ++112, ++207, ++18, ++35, ++32, ++247, ++48, ++207, ++19, ++51, ++32, ++247, ++240, ++206, ++20, ++67, ++32, ++247, ++176, ++206, ++21, ++83, ++32, ++247, ++112, ++206, ++22, ++99, ++32, ++247, ++48, ++206, ++23, ++115, ++32, ++247, ++240, ++205, ++24, ++131, ++32, ++247, ++176, ++205, ++25, ++147, ++32, ++247, ++112, ++205, ++26, ++163, ++32, ++247, ++48, ++205, ++27, ++179, ++32, ++247, ++240, ++204, ++28, ++195, ++32, ++247, ++176, ++204, ++29, ++211, ++32, ++247, ++112, ++204, ++30, ++227, ++32, ++247, ++48, ++204, ++31, ++243, ++4, ++255, ++51, ++204, ++128, ++3, ++224, ++251, ++16, ++0, ++76, ++254, ++51, ++204, ++128, ++3, ++224, ++251, ++20, ++0, ++0, ++237, ++32, ++0, ++0, ++0, ++140, ++248, ++47, ++0, ++0, ++0, ++224, ++99, ++0, ++0, ++111, ++3, ++4, ++254, ++0, ++128, ++0, ++4, ++0, ++248, ++0, ++0, ++2, ++232, ++32, ++0, ++0, ++0, ++140, ++248, ++32, ++0, ++0, ++0, ++224, ++35, ++0, ++0, ++64, ++232, ++0, ++2, ++0, ++0, ++193, ++232, ++0, ++1, ++0, ++0, ++1, ++106, ++116, ++30, ++90, ++0, ++169, ++3, ++73, ++64, ++52, ++64, ++45, ++64, ++2, ++64, ++10, ++64, ++64, ++198, ++1, ++7, ++8, ++232, ++63, ++0, ++0, ++0, ++6, ++232, ++253, ++255, ++255, ++255, ++0, ++246, ++0, ++0, ++0, ++4, ++215, ++64, ++3, ++96, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++137, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++129, ++0, ++131, ++102, ++0, ++158, ++67, ++0, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++108, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++100, ++0, ++131, ++102, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++161, ++0, ++188, ++64, ++67, ++232, 
++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++150, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++3, ++99, ++131, ++71, ++68, ++232, ++32, ++0, ++0, ++0, ++0, ++99, ++2, ++99, ++23, ++102, ++7, ++106, ++127, ++156, ++182, ++255, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++112, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++101, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++25, ++102, ++9, ++106, ++2, ++30, ++41, ++3, ++26, ++87, ++162, ++64, ++64, ++198, ++1, ++23, ++127, ++158, ++103, ++255, ++239, ++3, ++0, ++254, ++0, ++143, ++92, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++143, ++93, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++143, ++94, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++95, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++208, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++209, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++142, ++210, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++0, ++142, ++211, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++107, ++0, ++8, ++255, ++99, ++23, ++0, ++212, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++23, ++0, ++228, ++192, ++51, ++0, ++0, ++8, ++255, ++227, ++23, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++52, ++0, ++180, ++192, ++51, ++0, ++0, ++8, ++255, ++99, ++52, ++0, ++164, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++52, ++0, ++148, ++192, ++51, ++0, ++0, ++111, ++3, ++239, ++3, ++0, ++254, ++0, ++143, ++12, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++143, ++13, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++143, ++14, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++15, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++16, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++17, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++142, ++18, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++0, ++142, ++19, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++33, ++0, ++8, ++255, ++99, ++3, ++0, ++212, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++3, ++0, ++228, ++192, ++51, ++0, ++0, ++8, ++255, ++227, ++3, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++4, ++0, ++180, ++192, ++51, ++0, ++0, ++8, ++255, ++99, ++4, ++0, ++164, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++4, ++0, ++148, ++192, ++51, ++0, ++0, ++111, ++3, ++32, ++246, ++192, ++11, ++1, ++16, ++32, ++246, ++2, ++137, ++47, ++240, ++40, ++246, ++2, ++140, ++47, ++240, ++128, ++245, ++99, ++140, ++5, ++4, ++0, ++247, ++99, ++140, ++1, ++20, ++88, ++246, ++99, ++140, ++1, ++20, ++0, ++247, ++35, ++136, ++62, ++226, ++32, ++247, ++35, ++136, ++32, ++210, ++0, ++247, ++34, ++136, ++63, ++2, ++208, ++246, ++34, ++136, ++0, ++4, ++0, ++247, ++99, ++136, ++58, ++162, ++32, ++247, ++99, ++136, ++33, ++146, ++0, ++247, ++98, ++136, ++59, ++18, ++208, ++246, ++98, ++136, ++0, ++20, ++0, ++247, ++162, ++136, ++33, ++2, ++88, ++246, ++98, ++137, ++2, ++68, ++88, ++246, ++162, ++137, ++3, ++68, ++208, ++254, ++227, ++136, ++60, ++242, ++192, ++243, ++188, ++11, ++208, ++254, ++227, ++136, ++56, ++178, ++192, ++243, ++188, ++10, ++32, 
++255, ++226, ++136, ++38, ++58, ++192, ++243, ++60, ++0, ++208, ++254, ++227, ++136, ++59, ++242, ++192, ++243, ++60, ++128, ++32, ++255, ++226, ++136, ++49, ++58, ++192, ++243, ++60, ++128, ++0, ++255, ++226, ++136, ++34, ++34, ++192, ++243, ++60, ++128, ++32, ++255, ++226, ++136, ++37, ++58, ++192, ++243, ++60, ++128, ++0, ++254, ++192, ++136, ++1, ++4, ++0, ++240, ++0, ++160, ++0, ++255, ++194, ++8, ++0, ++52, ++195, ++243, ++0, ++128, ++0, ++255, ++202, ++40, ++0, ++52, ++195, ++243, ++0, ++128, ++0, ++254, ++0, ++240, ++35, ++10, ++0, ++240, ++60, ++0, ++0, ++254, ++192, ++136, ++1, ++4, ++0, ++240, ++0, ++160, ++0, ++255, ++226, ++140, ++34, ++34, ++195, ++243, ++60, ++0, ++32, ++255, ++227, ++140, ++36, ++58, ++192, ++243, ++60, ++0, ++0, ++254, ++192, ++136, ++0, ++4, ++0, ++240, ++0, ++160, ++16, ++246, ++226, ++136, ++35, ++50, ++16, ++246, ++226, ++136, ++35, ++50, ++32, ++246, ++226, ++136, ++35, ++50, ++32, ++254, ++226, ++136, ++35, ++58, ++192, ++243, ++60, ++0, ++11, ++96, ++0, ++254, ++0, ++240, ++1, ++4, ++0, ++240, ++64, ++115, ++5, ++106, ++0, ++144, ++173, ++1, ++27, ++96, ++0, ++254, ++0, ++240, ++1, ++4, ++0, ++240, ++64, ++147, ++5, ++106, ++0, ++144, ++227, ++0, ++64, ++246, ++163, ++140, ++1, ++4, ++0, ++246, ++192, ++175, ++63, ++2, ++0, ++246, ++192, ++174, ++59, ++2, ++0, ++246, ++128, ++175, ++62, ++2, ++0, ++246, ++128, ++174, ++58, ++2, ++0, ++246, ++64, ++175, ++61, ++2, ++0, ++246, ++64, ++174, ++57, ++2, ++0, ++255, ++43, ++240, ++4, ++212, ++192, ++243, ++128, ++11, ++64, ++254, ++43, ++240, ++1, ++228, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, ++244, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, ++180, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++141, ++0, ++164, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++191, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++235, ++143, ++52, ++242, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++2, ++212, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++191, ++226, ++192, ++243, ++188, ++10, ++64, ++254, ++43, ++141, ++0, ++180, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++2, ++68, ++32, ++247, ++35, ++141, ++190, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++171, ++143, ++52, ++226, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++4, ++180, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++191, ++226, ++192, ++243, ++188, ++10, ++128, ++253, ++43, ++240, ++3, ++212, ++192, ++243, ++128, ++10, ++64, ++254, ++35, ++141, ++1, ++196, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++189, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++107, ++143, ++52, ++210, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++4, ++148, ++192, ++243, ++128, ++11, ++64, ++254, ++43, ++240, ++1, ++164, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, ++180, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, ++244, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++141, ++0, ++228, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++187, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++235, ++142, ++52, ++178, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++2, ++148, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++187, ++162, ++192, ++243, ++188, ++10, ++64, ++254, ++43, ++141, ++0, ++244, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++2, ++68, ++32, ++247, ++35, ++141, 
++186, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++171, ++142, ++52, ++162, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++4, ++244, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++187, ++162, ++192, ++243, ++188, ++10, ++128, ++253, ++43, ++240, ++3, ++148, ++192, ++243, ++128, ++10, ++64, ++254, ++35, ++141, ++1, ++132, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++185, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++107, ++142, ++52, ++146, ++192, ++243, ++60, ++128, ++64, ++255, ++98, ++141, ++0, ++52, ++192, ++243, ++0, ++0, ++0, ++254, ++0, ++240, ++53, ++10, ++0, ++240, ++60, ++0, ++0, ++254, ++0, ++240, ++1, ++4, ++0, ++240, ++64, ++147, ++5, ++106, ++0, ++144, ++177, ++0, ++88, ++246, ++163, ++140, ++1, ++4, ++128, ++245, ++99, ++141, ++10, ++4, ++88, ++246, ++162, ++138, ++1, ++68, ++0, ++247, ++162, ++138, ++36, ++162, ++88, ++254, ++162, ++138, ++3, ++164, ++192, ++243, ++128, ++11, ++0, ++255, ++226, ++137, ++32, ++2, ++195, ++243, ++60, ++0, ++32, ++247, ++226, ++137, ++42, ++114, ++0, ++255, ++34, ++138, ++33, ++18, ++195, ++243, ++60, ++0, ++32, ++247, ++34, ++138, ++42, ++130, ++16, ++246, ++98, ++138, ++40, ++114, ++16, ++246, ++98, ++138, ++41, ++146, ++32, ++246, ++98, ++138, ++41, ++146, ++32, ++246, ++226, ++137, ++41, ++146, ++40, ++246, ++34, ++138, ++41, ++146, ++32, ++247, ++163, ++141, ++63, ++178, ++32, ++247, ++227, ++141, ++62, ++162, ++0, ++254, ++0, ++240, ++8, ++4, ++0, ++240, ++128, ++11, ++128, ++253, ++35, ++240, ++9, ++100, ++192, ++243, ++128, ++10, ++128, ++253, ++163, ++141, ++128, ++115, ++192, ++243, ++152, ++10, ++88, ++246, ++163, ++141, ++4, ++100, ++208, ++246, ++35, ++139, ++0, ++100, ++32, ++255, ++34, ++139, ++53, ++202, ++192, ++243, ++60, ++128, ++0, ++254, ++0, ++139, ++0, ++4, ++0, ++240, ++0, ++160, ++240, ++246, ++163, ++141, ++48, ++98, ++0, ++247, ++99, ++139, ++63, ++210, ++0, ++247, ++98, ++139, ++1, ++212, ++88, ++254, ++98, ++139, ++1, ++212, ++192, ++243, ++128, ++11, ++32, ++255, ++99, ++139, ++62, ++98, ++192, ++243, ++188, ++10, ++88, ++246, ++98, ++139, ++1, ++212, ++240, ++246, ++98, ++139, ++50, ++210, ++0, ++247, ++163, ++128, ++59, ++146, ++0, ++247, ++160, ++128, ++1, ++36, ++88, ++254, ++160, ++128, ++1, ++36, ++192, ++243, ++128, ++11, ++0, ++247, ++163, ++128, ++58, ++98, ++64, ++255, ++35, ++240, ++0, ++100, ++192, ++243, ++128, ++10, ++64, ++255, ++163, ++128, ++0, ++164, ++192, ++243, ++128, ++10, ++88, ++246, ++160, ++128, ++1, ++36, ++240, ++246, ++160, ++128, ++50, ++34, ++8, ++255, ++227, ++143, ++54, ++242, ++192, ++243, ++60, ++128, ++40, ++255, ++227, ++142, ++54, ++178, ++192, ++243, ++60, ++128, ++0, ++254, ++0, ++240, ++39, ++10, ++0, ++240, ++60, ++128, ++8, ++255, ++163, ++143, ++45, ++226, ++192, ++243, ++60, ++128, ++0, ++254, ++0, ++240, ++44, ++10, ++0, ++240, ++60, ++0, ++0, ++254, ++0, ++240, ++40, ++10, ++0, ++240, ++60, ++128, ++8, ++255, ++163, ++142, ++2, ++162, ++192, ++243, ++60, ++128, ++90, ++0, ++169, ++3, ++14, ++96, ++4, ++31, ++169, ++3, ++30, ++96, ++1, ++31, ++73, ++64, ++52, ++64, ++45, ++64, ++2, ++64, ++10, ++64, ++64, ++198, ++1, ++7, ++8, ++232, ++63, ++0, ++0, ++0, ++6, ++232, ++253, ++255, ++255, ++255, ++0, ++246, ++0, ++0, ++0, ++4, ++215, ++64, ++3, ++96, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++30, ++106, ++137, ++24, ++0, ++255, ++64, ++0, ++0, ++20, ++196, ++243, ++0, ++0, ++128, 
++240, ++0, ++0, ++132, ++3, ++128, ++144, ++158, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++150, ++0, ++131, ++102, ++0, ++158, ++81, ++0, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++30, ++106, ++137, ++24, ++0, ++255, ++64, ++0, ++0, ++20, ++196, ++243, ++0, ++0, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++122, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++114, ++0, ++131, ++102, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++30, ++106, ++139, ++24, ++0, ++255, ++64, ++0, ++0, ++20, ++196, ++243, ++0, ++0, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++128, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++117, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++3, ++99, ++131, ++71, ++68, ++232, ++32, ++0, ++0, ++0, ++0, ++99, ++2, ++99, ++23, ++102, ++7, ++106, ++127, ++156, ++168, ++255, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++30, ++106, ++139, ++24, ++0, ++255, ++64, ++0, ++0, ++20, ++196, ++243, ++0, ++0, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++72, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++61, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++25, ++102, ++9, ++106, ++2, ++30, ++41, ++3, ++26, ++87, ++162, ++64, ++64, ++198, ++1, ++23, ++127, ++158, ++75, ++255, ++239, ++3, ++0, ++254, ++128, ++143, ++94, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++95, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++208, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++209, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++47, ++0, ++8, ++255, ++227, ++23, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++52, ++0, ++180, ++192, ++51, ++0, ++0, ++111, ++3, ++239, ++3, ++0, ++254, ++128, ++143, ++14, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++15, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++16, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++17, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++13, ++0, ++8, ++255, ++227, ++3, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++4, ++0, ++180, ++192, ++51, ++0, ++0, ++111, ++3, ++32, ++246, ++192, ++11, ++1, ++16, ++32, ++246, ++2, ++140, ++47, ++240, ++32, ++247, ++35, ++141, ++63, ++178, ++64, ++254, ++35, ++141, ++2, ++68, ++192, ++243, ++128, ++11, ++32, ++255, ++35, ++240, ++58, ++226, ++192, ++243, ++188, ++10, ++0, ++254, ++0, ++141, ++4, ++4, ++0, ++240, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++240, ++246, ++35, ++141, ++48, ++66, ++0, ++247, ++227, ++143, ++52, ++242, ++32, ++247, ++227, ++142, ++52, ++178, ++90, ++0, ++161, ++3, ++6, ++64, ++23, ++64, ++96, ++8, ++70, ++98, ++97, ++8, ++70, ++98, ++98, ++8, ++70, ++98, ++99, ++8, ++70, ++98, ++100, ++8, ++70, ++98, ++101, ++8, ++70, ++98, ++255, ++159, ++244, ++249, ++23, ++102, ++7, ++106, ++112, ++30, ++33, ++3, ++}; +diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c 
+new file mode 100644 +index 0000000000..00bd911a86 +--- /dev/null ++++ b/libavcodec/rpi_hevcdec.c +@@ -0,0 +1,5630 @@ ++/* ++ * HEVC video Decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2012 - 2013 Mickael Raulet ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * Copyright (C) 2012 - 2013 Wassim Hamidouche ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/attributes.h" ++#include "libavutil/common.h" ++#include "libavutil/display.h" ++#include "libavutil/internal.h" ++#include "libavutil/mastering_display_metadata.h" ++#include "libavutil/md5.h" ++#include "libavutil/opt.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/stereo3d.h" ++ ++#include "bswapdsp.h" ++#include "bytestream.h" ++#include "cabac_functions.h" ++#include "golomb.h" ++#include "hevc.h" ++#include "rpi_hevc_data.h" ++#include "rpi_hevc_parse.h" ++#include "rpi_hevcdec.h" ++#include "profiles.h" ++ ++#include "rpi_qpu.h" ++#include "rpi_hevc_shader.h" ++#include "rpi_hevc_shader_cmd.h" ++#include "rpi_hevc_shader_template.h" ++#include "rpi_zc.h" ++#include "libavutil/rpi_sand_fns.h" ++ ++#include "pthread.h" ++#include "libavutil/atomic.h" ++ ++#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards ++ ++#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff)) ++ ++#ifndef av_mod_uintp2 ++static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p) ++{ ++ return a & ((1 << p) - 1); ++} ++# define av_mod_uintp2 av_mod_uintp2_c ++#endif ++ ++const uint8_t ff_hevc_rpi_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; ++static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first); ++ ++#define MC_DUMMY_X (-32) ++#define MC_DUMMY_Y (-32) ++ ++// UV & Y both have min 4x4 pred (no 2x2 chroma) ++// Allow for even spread +1 for setup, +1 for rounding ++// As we have load sharing this can (in theory) be exceeded so we have to ++// check after each CTU, but it is a good base size ++ ++// Worst case (all 4x4) commands per CTU ++#define QPU_Y_CMD_PER_CTU_MAX (16 * 16) ++#define QPU_C_CMD_PER_CTU_MAX (8 * 8) ++ ++#define QPU_C_COMMANDS (((HEVC_RPI_MAX_WIDTH * 64) / (4 * 4)) / 4 + 2 * QPU_N_MAX) ++#define QPU_Y_COMMANDS (((HEVC_RPI_MAX_WIDTH * 64) / (4 * 4)) + 2 * QPU_N_MAX) ++ ++// The QPU code for UV blocks only works up to a block width of 8 ++#define RPI_CHROMA_BLOCK_WIDTH 8 ++ ++#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) ++ ++ ++// Actual filter goes -ve, +ve, +ve, -ve using these values ++static const uint32_t rpi_filter_coefs[8] = { ++ ENCODE_COEFFS( 0, 64, 0, 0), ++ ENCODE_COEFFS( 2, 58, 10, 2), ++ ENCODE_COEFFS( 4, 
54, 16, 2), ++ ENCODE_COEFFS( 6, 46, 28, 4), ++ ENCODE_COEFFS( 4, 36, 36, 4), ++ ENCODE_COEFFS( 4, 28, 46, 6), ++ ENCODE_COEFFS( 2, 16, 54, 4), ++ ENCODE_COEFFS( 2, 10, 58, 2) ++}; ++ ++// Function arrays by QPU ++ ++static const int * const inter_pred_setup_c_qpu[12] = { ++ mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, ++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, ++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn ++}; ++ ++static const int * const inter_pred_setup_c10_qpu[12] = { ++ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, ++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, ++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn ++}; ++ ++static const int * const inter_pred_setup_y_qpu[12] = { ++ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, ++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, ++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn ++}; ++ ++static const int * const inter_pred_setup_y10_qpu[12] = { ++ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, ++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, ++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn ++}; ++ ++static const int * const inter_pred_sync_qpu[12] = { ++ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3, ++ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7, ++ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11 ++}; ++ ++static const int * const inter_pred_sync10_qpu[12] = { ++ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3, ++ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7, ++ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11 ++}; ++ ++static const int * const inter_pred_exit_c_qpu[12] = { ++ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, ++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, ++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn ++}; ++ ++static const int * const inter_pred_exit_c10_qpu[12] = { ++ mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, ++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, ++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn ++}; ++ ++static const int * const inter_pred_exit_y_qpu[12] = { ++ mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, ++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, ++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn ++}; ++ ++static const int * const inter_pred_exit_y10_qpu[12] = { ++ mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, ++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, ++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn ++}; ++ ++typedef struct ipe_chan_info_s ++{ ++ const uint8_t bit_depth; ++ const uint8_t n; ++ const int * const * setup_fns; ++ const int * const * sync_fns; ++ const int * const * exit_fns; ++} ipe_chan_info_t; ++ ++typedef struct ipe_init_info_s ++{ ++ ipe_chan_info_t luma; ++ ipe_chan_info_t chroma; ++} ipe_init_info_t; ++ ++static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16 ++ { // 8 ++ .luma = {8, QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu}, ++ .chroma = {8, QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu} ++ }, ++ { // 9 ++ .luma = {0}, ++ .chroma = {0} ++ }, ++ { // 10 ++ .luma = {10, QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, 
inter_pred_exit_y10_qpu}, ++ .chroma = {10, QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu} ++ } ++ ++}; ++ ++static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici) ++{ ++ const unsigned int n = ici->n; ++ const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3; // Round down to word ++ ++ ipe->n = n; ++ ipe->max_fill = q1_size - ipe->min_gap; ++ for(unsigned int i = 0; i < n; i++) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ q->qpu_mc_curr = q->qpu_mc_base = ++ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size); ++ q->code_setup = qpu_fn(ici->setup_fns[i]); ++ q->code_sync = qpu_fn(ici->sync_fns[i]); ++ q->code_exit = qpu_fn(ici->exit_fns[i]); ++ } ++} ++ ++static void rpi_hevc_qpu_set_fns(HEVCRpiContext * const s, const unsigned int bit_depth) ++{ ++ av_assert0(bit_depth >= 8 && bit_depth <= 16); ++ ++ rpi_hevc_qpu_init_fn(&s->qpu, bit_depth); ++} ++ ++// Unsigned Trivial MOD ++static inline unsigned int utmod(const unsigned int x, const unsigned int n) ++{ ++ return x >= n ? x - n : x; ++} ++ ++// returns pq->job_n++ ++static inline unsigned int pass_queue_inc_job_n(HEVCRpiPassQueue * const pq) ++{ ++ unsigned int const x2 = pq->job_n; ++ pq->job_n = utmod(x2 + 1, RPI_MAX_JOBS); ++ return x2; ++} ++ ++static void pass_queue_init(HEVCRpiPassQueue * const pq, HEVCRpiContext * const s, HEVCRpiWorkerFn * const worker, sem_t * const psem_out, const int n) ++{ ++ pq->terminate = 0; ++ pq->job_n = 0; ++ pq->context = s; ++ pq->worker = worker; ++ pq->psem_out = psem_out; ++ pq->pass_n = n; ++ pq->started = 0; ++ sem_init(&pq->sem_in, 0, 0); ++} ++ ++static void pass_queue_kill(HEVCRpiPassQueue * const pq) ++{ ++ sem_destroy(&pq->sem_in); ++} ++ ++static inline void rpi_sem_wait(sem_t * const sem) ++{ ++ while (sem_wait(sem) != 0) { ++ av_assert0(errno == EINTR); ++ } ++} ++ ++static void pass_queue_submit_job(HEVCRpiPassQueue * const pq) ++{ ++ sem_post(&pq->sem_in); ++} ++ ++static inline void pass_queue_do_all(HEVCRpiContext * const s, HEVCRpiJob * const jb) ++{ ++ // Do the various passes - common with the worker code ++ for (unsigned int i = 0; i != RPI_PASSES; ++i) { ++ s->passq[i].worker(s, jb); ++ } ++} ++ ++ ++#if 0 ++static void dump_jbc(const HEVCRpiJobCtl *const jbc, const char * const func) ++{ ++ int x; ++ sem_getvalue((sem_t *)&jbc->sem_out, &x); ++ printf("%s: jbc: in=%d, out=%d, sum=%d\n", func, jbc->offload_in, jbc->offload_out, x); ++} ++#endif ++ ++ ++static HEVCRpiJob * job_alloc(HEVCRpiJobCtl * const jbc, HEVCRpiLocalContext * const lc) ++{ ++ HEVCRpiJob * jb; ++ HEVCRpiJobGlobal * const jbg = jbc->jbg; ++ ++ pthread_mutex_lock(&jbg->lock); ++ // Check local 1st ++ if ((jb = jbc->jb1) != NULL) ++ { ++ // Only 1 - very easy :-) ++ jbc->jb1 = NULL; ++ } ++ else ++ { ++ // Now look for global free chain ++ if ((jb = jbg->free1) != NULL) ++ { ++ // Found one - unlink it ++ jbg->free1 = jb->next; ++ jb->next = NULL; ++ } ++ else ++ { ++ // Out of places to look - wait for one to become free - add to Qs ++ ++ // Global ++ // If "good" lc then add after the last "good" el in the chain ++ // otherwise add to the tail ++ if (jbg->wait_tail == NULL || jbg->wait_tail->last_progress_good || !lc->last_progress_good) ++ { ++ // Add to end as we had to wait last time or wait Q empty ++ if ((lc->jw_prev = jbg->wait_tail) == NULL) ++ jbg->wait_head = lc; ++ else ++ lc->jw_prev->jw_next = lc; ++ lc->jw_next = NULL; ++ jbg->wait_tail = lc; ++ } ++ else ++ { ++ // This is a "good" lc that we need to 
poke into the middle ++ // of the Q ++ // We know that the Q isn't empty and there is at least one ++ // !last_progress_good el in it from the previous test ++ ++ HEVCRpiLocalContext * const p = jbg->wait_good; // Insert after ++ ++ if (p == NULL) ++ { ++ // No current good els - add to head ++ lc->jw_next = jbg->wait_head; ++ jbg->wait_head = lc; ++ } ++ else ++ { ++ lc->jw_next = p->jw_next; ++ p->jw_next = lc; ++ } ++ ++ lc->jw_next->jw_prev = lc; ++ lc->jw_prev = p; ++ } ++ ++ // If "good" then we are now the last good waiting el ++ if (lc->last_progress_good) ++ jbg->wait_good = lc; ++ ++ // Local ++ if ((lc->ljw_prev = jbc->lcw_tail) == NULL) ++ jbc->lcw_head = lc; ++ else ++ lc->ljw_prev->ljw_next = lc; ++ lc->ljw_next = NULL; ++ jbc->lcw_tail = lc; ++ } ++ } ++ ++ pthread_mutex_unlock(&jbg->lock); ++ ++ if (jb == NULL) // Need to wait ++ { ++ rpi_sem_wait(&lc->jw_sem); ++ jb = lc->jw_job; // Set by free code ++ } ++ ++ return jb; ++} ++ ++ ++static void job_free(HEVCRpiJobCtl * const jbc0, HEVCRpiJob * const jb) ++{ ++ HEVCRpiJobGlobal * const jbg = jbc0->jbg; // This jbc only used to find jbg so we can get the lock ++ HEVCRpiJobCtl * jbc = jb->jbc_local; ++ HEVCRpiLocalContext * lc = NULL; ++ ++ pthread_mutex_lock(&jbg->lock); ++ ++ if (jbc != NULL) ++ { ++ av_assert1(jbc->jb1 == NULL); ++ ++ // Release to Local if nothing waiting there ++ if ((lc = jbc->lcw_head) == NULL) ++ jbc->jb1 = jb; ++ } ++ else ++ { ++ // Release to global if nothing waiting there ++ if ((lc = jbg->wait_head) == NULL) ++ { ++ jb->next = jbg->free1; ++ jbg->free1 = jb; ++ } ++ else ++ { ++ // ? seems somehow mildly ugly... ++ jbc = lc->context->jbc; ++ } ++ } ++ ++ if (lc != NULL) ++ { ++ // Something was waiting ++ ++ // Unlink ++ // Global ++ if (lc->jw_next == NULL) ++ jbg->wait_tail = lc->jw_prev; ++ else ++ lc->jw_next->jw_prev = lc->jw_prev; ++ ++ if (lc->jw_prev == NULL) ++ jbg->wait_head = lc->jw_next; ++ else ++ lc->jw_prev->jw_next = lc->jw_next; ++ ++ // Local ++ if (lc->ljw_next == NULL) ++ jbc->lcw_tail = lc->ljw_prev; ++ else ++ lc->ljw_next->ljw_prev = lc->ljw_prev; ++ ++ if (lc->ljw_prev == NULL) ++ jbc->lcw_head = lc->ljw_next; ++ else ++ lc->ljw_prev->ljw_next = lc->ljw_next; ++ ++ // Update good if required ++ if (jbg->wait_good == lc) ++ jbg->wait_good = lc->jw_prev; ++ ++ // Prod ++ lc->jw_job = jb; ++ sem_post(&lc->jw_sem); ++ } ++ ++ pthread_mutex_unlock(&jbg->lock); ++} ++ ++static void job_lc_kill(HEVCRpiLocalContext * const lc) ++{ ++ sem_destroy(&lc->jw_sem); ++} ++ ++static void job_lc_init(HEVCRpiLocalContext * const lc) ++{ ++ lc->jw_next = NULL; ++ lc->jw_prev = NULL; ++ lc->ljw_next = NULL; ++ lc->ljw_prev = NULL; ++ lc->jw_job = NULL; ++ sem_init(&lc->jw_sem, 0, 0); ++} ++ ++// Returns: ++// 0 if we have waited for MV or expect to wait for recon ++// 1 if we haven't waited for MV & do not need to wait for recon ++static int progress_good(const HEVCRpiContext *const s, const HEVCRpiJob * const jb) ++{ ++ if (jb->waited) // reset by rpi_begin ++ return 0; ++ for (unsigned int i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) ++ { ++ if (jb->progress_req[i] >= 0 && s->DPB[i].tf.progress != NULL && ++ ((volatile int *)(s->DPB[i].tf.progress->data))[0] < jb->progress_req[i]) ++ return 0; ++ } ++ return 1; ++} ++ ++// Submit job if it is full (indicated by having ctu_ts_last set >= 0) ++static inline void worker_submit_job(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc) ++{ ++ HEVCRpiJobCtl *const jbc = s->jbc; ++ HEVCRpiJob * const jb = lc->jb0; ++ ++ av_assert1(jb != 
NULL); ++ ++ if (jb->ctu_ts_last < 0) { ++ return; ++ } ++ ++ lc->last_progress_good = progress_good(s, jb); ++ jb->waited = !lc->last_progress_good; ++ lc->jb0 = NULL; ++ ++ if (s->offload_recon) ++ { ++ pthread_mutex_lock(&jbc->in_lock); ++ jbc->offloadq[jbc->offload_in] = jb; ++ jbc->offload_in = utmod(jbc->offload_in + 1, RPI_MAX_JOBS); ++ pthread_mutex_unlock(&jbc->in_lock); ++ ++ pass_queue_submit_job(s->passq + 0); // Consumes job eventually ++ } ++ else ++ { ++ pass_queue_do_all(s, jb); // Consumes job before return ++ } ++} ++ ++ ++// Call worker_pass0_ready to wait until the s->pass0_job slot becomes ++// available to receive the next job. ++// ++// Now safe against multiple callers - needed for tiles ++// "normal" and WPP will only call here one at a time ++static inline void worker_pass0_ready(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ HEVCRpiJobCtl * const jbc = s->jbc; ++ ++ // It is legit for us to already have a job allocated - do nothing in this case ++ if (lc->jb0 != NULL) ++ return; ++ ++ if (s->offload_recon) ++ rpi_sem_wait(&jbc->sem_out); // This sem will stop this frame grabbing too much ++ ++ lc->jb0 = job_alloc(jbc, lc); ++ ++ rpi_begin(s, lc->jb0, lc->ts); ++} ++ ++// Free up a job without submission ++static void worker_free(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ HEVCRpiJobCtl * const jbc = s->jbc; ++ HEVCRpiJob * const jb = lc->jb0; ++ ++ if (jb == NULL) { ++ return; ++ } ++ ++ lc->jb0 = NULL; ++ ++ job_free(jbc, jb); ++ ++ // If offload then poke sem_out too ++ if (s->offload_recon) { ++ sem_post(&jbc->sem_out); ++ } ++} ++ ++ ++// Call this to wait for all jobs to have completed at the end of a frame ++// Slightly icky as there is no clean way to wait for a sem to count up ++// Not reentrant - call on main thread only ++static void worker_wait(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ HEVCRpiJobCtl * const jbc = s->jbc; ++ int i = 0; ++ ++ // We shouldn't reach here with an unsubmitted job ++ av_assert1(lc->jb0 == NULL); ++ ++ // If no offload then there can't be anything to wait for ++ if (!s->offload_recon) { ++ return; ++ } ++ ++ if (sem_getvalue(&jbc->sem_out, &i) == 0 && i < RPI_MAX_JOBS) ++ { ++ for (i = 0; i != RPI_MAX_JOBS; ++i) { ++ rpi_sem_wait(&jbc->sem_out); ++ } ++ for (i = 0; i != RPI_MAX_JOBS; ++i) { ++ sem_post(&jbc->sem_out); ++ } ++ } ++} ++ ++static void * pass_worker(void *arg) ++{ ++ HEVCRpiPassQueue *const pq = (HEVCRpiPassQueue *)arg; ++ HEVCRpiContext *const s = pq->context; ++ ++ for (;;) ++ { ++ rpi_sem_wait(&pq->sem_in); ++ ++ if (pq->terminate) ++ break; ++ ++ pq->worker(s, s->jbc->offloadq[pass_queue_inc_job_n(pq)]); ++ // * should really set jb->passes_done here ++ ++ sem_post(pq->psem_out); ++ } ++ return NULL; ++} ++ ++static void pass_queues_start_all(HEVCRpiContext *const s) ++{ ++ unsigned int i; ++ HEVCRpiPassQueue * const pqs = s->passq; ++ ++ for (i = 0; i != RPI_PASSES; ++i) ++ { ++ av_assert0(pthread_create(&pqs[i].thread, NULL, pass_worker, pqs + i) == 0); ++ pqs[i].started = 1; ++ } ++} ++ ++static void pass_queues_term_all(HEVCRpiContext *const s) ++{ ++ unsigned int i; ++ HEVCRpiPassQueue * const pqs = s->passq; ++ ++ for (i = 0; i != RPI_PASSES; ++i) ++ pqs[i].terminate = 1; ++ for (i = 0; i != RPI_PASSES; ++i) ++ { ++ if (pqs[i].started) ++ sem_post(&pqs[i].sem_in); ++ } ++ for (i = 0; i != RPI_PASSES; ++i) ++ { ++ if (pqs[i].started) { ++ pthread_join(pqs[i].thread, NULL); ++ pqs[i].started = 0; ++ } ++ } ++} ++ ++static void 
pass_queues_kill_all(HEVCRpiContext *const s) ++{ ++ unsigned int i; ++ HEVCRpiPassQueue * const pqs = s->passq; ++ ++ for (i = 0; i != RPI_PASSES; ++i) ++ pass_queue_kill(pqs + i); ++} ++ ++ ++static void worker_pic_free_one(HEVCRpiJob * const jb) ++{ ++ // Free coeff stuff - allocation not the same for all buffers ++ HEVCRpiCoeffsEnv * const cf = &jb->coeffs; ++ ++ if (cf->s[0].buf != NULL) ++ av_freep(&cf->mptr); ++ if (cf->s[2].buf != NULL) ++ gpu_free(&cf->gptr); ++ memset(cf, 0, sizeof(*cf)); ++} ++ ++static int worker_pic_alloc_one(HEVCRpiJob * const jb, const unsigned int coeff_count) ++{ ++ HEVCRpiCoeffsEnv * const cf = &jb->coeffs; ++ ++ if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0) ++ goto fail; ++ cf->s[2].buf = (int16_t *)cf->gptr.arm; ++ cf->s[3].buf = cf->s[2].buf + coeff_count; ++ ++ // Must be 64 byte aligned for our zero zapping code so over-allocate & ++ // round ++ if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL) ++ goto fail; ++ cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63); ++ return 0; ++ ++fail: ++ av_log(NULL, AV_LOG_ERROR, "%s: Allocation failed\n", __func__); ++ worker_pic_free_one(jb); ++ return -1; ++} ++ ++static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf) ++{ ++ unsigned int i; ++ for (i = 0; i != 4; ++i) { ++ cf->s[i].n = 0; ++ } ++} ++ ++int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n) ++{ ++ HEVCRpiCoeffEnv *const cfe = jb->coeffs.s + buf_no; ++ int16_t * const coeffs = (buf_no != 3) ? cfe->buf + cfe->n : cfe->buf - (cfe->n + n); ++ cfe->n += n; ++ return coeffs; ++} ++ ++void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int val, const int field) ++{ ++ if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) { ++ HEVCRpiContext *const fs = ref->tf.owner[field]->priv_data; ++ HEVCRpiFrameProgressState * const pstate = fs->progress_states + field; ++ sem_t * sem = NULL; ++ ++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); ++ if (((volatile int *)ref->tf.progress->data)[field] < val) { ++ HEVCRpiFrameProgressWait * const pwait = &jb->progress_wait; ++ ++ av_assert1(pwait->req == -1 && pwait->next == NULL); ++ jb->waited = 1; // Remember that we had to wait for later scheduling ++ ++ pwait->req = val; ++ pwait->next = NULL; ++ if (pstate->first == NULL) ++ pstate->first = pwait; ++ else ++ pstate->last->next = pwait; ++ pstate->last = pwait; ++ sem = &pwait->sem; ++ } ++ pthread_mutex_unlock(&pstate->lock); ++ ++ if (sem != NULL) { ++ rpi_sem_wait(sem); ++ } ++ } ++} ++ ++void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field) ++{ ++ HEVCRpiFrameProgressState *const pstate = s->progress_states + field; ++ ++ ((int *)s->ref->tf.progress->data)[field] = val; ++ ++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); ++ { ++ HEVCRpiFrameProgressWait ** ppwait = &pstate->first; ++ HEVCRpiFrameProgressWait * pwait; ++ ++ while ((pwait = *ppwait) != NULL) { ++ if (pwait->req > val) ++ { ++ ppwait = &pwait->next; ++ pstate->last = pwait; ++ } ++ else ++ { ++ *ppwait = pwait->next; ++ pwait->req = -1; ++ pwait->next = NULL; ++ sem_post(&pwait->sem); ++ } ++ } ++ } ++ pthread_mutex_unlock(&pstate->lock); ++} ++ ++static void ff_hevc_rpi_progress_init_state(HEVCRpiFrameProgressState * const pstate) ++{ ++ pstate->first = NULL; ++ pstate->last = NULL; ++ pthread_mutex_init(&pstate->lock, NULL); ++} ++ 
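The frame-progress pair above (`ff_hevc_rpi_progress_wait_field()` / `ff_hevc_rpi_progress_signal_field()`) is a monitor-style wait/signal: a monotonically increasing progress value is published under a mutex, waiters park themselves on a linked list with a per-waiter semaphore, and the signaller wakes every waiter whose requested value has been reached. A minimal self-contained sketch of that idiom follows; all names in it (`progress_t`, `waiter_t`, `progress_wait`, `progress_signal`) are invented for illustration and are not part of the patch.

```c
/* Minimal sketch of the progress wait/signal idiom; hypothetical names. */
#include <pthread.h>
#include <semaphore.h>

typedef struct waiter_s {
    int req;                  /* progress value this waiter needs */
    sem_t sem;                /* posted once progress >= req */
    struct waiter_s *next;
} waiter_t;

typedef struct {
    int val;                  /* current progress, e.g. CTU rows decoded */
    pthread_mutex_t lock;
    waiter_t *first;          /* singly-linked list of parked waiters */
} progress_t;

static void progress_wait(progress_t *p, waiter_t *w, int req)
{
    sem_init(&w->sem, 0, 0);
    pthread_mutex_lock(&p->lock);
    if (p->val < req) {
        /* Not there yet - park on the list, then sleep on our sem */
        w->req = req;
        w->next = p->first;
        p->first = w;
        pthread_mutex_unlock(&p->lock);
        while (sem_wait(&w->sem) != 0)
            ;                 /* retry on EINTR, as rpi_sem_wait() does */
    } else {
        pthread_mutex_unlock(&p->lock);
    }
    sem_destroy(&w->sem);
}

static void progress_signal(progress_t *p, int val)
{
    pthread_mutex_lock(&p->lock);
    p->val = val;
    for (waiter_t **pp = &p->first; *pp != NULL;) {
        waiter_t *w = *pp;
        if (w->req <= val) {  /* satisfied: unlink first, then wake */
            *pp = w->next;
            sem_post(&w->sem);
        } else {
            pp = &w->next;
        }
    }
    pthread_mutex_unlock(&p->lock);
}
```

The code in the patch is richer than this sketch - it keeps a tail pointer so the wait list stays in FIFO order, re-checks the progress value through a volatile pointer before parking, and records `jb->waited` for later scheduling decisions - but the lock/park/post shape is the same.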
++static void ff_hevc_rpi_progress_init_wait(HEVCRpiFrameProgressWait * const pwait) ++{ ++ pwait->req = -1; ++ pwait->next = NULL; ++ sem_init(&pwait->sem, 0, 0); ++} ++ ++static void ff_hevc_rpi_progress_kill_state(HEVCRpiFrameProgressState * const pstate) ++{ ++ av_assert1(pstate->first == NULL); ++ pthread_mutex_destroy(&pstate->lock); ++} ++ ++static void ff_hevc_rpi_progress_kill_wait(HEVCRpiFrameProgressWait * const pwait) ++{ ++ sem_destroy(&pwait->sem); ++} ++ ++ ++/** ++ * NOTE: Each function hls_foo corresponds to the function foo in the ++ * specification (HLS stands for High Level Syntax). ++ */ ++ ++/** ++ * Section 5.7 ++ */ ++ ++/* free everything allocated by pic_arrays_init() */ ++static void pic_arrays_free(HEVCRpiContext *s) ++{ ++#ifdef RPI_DEBLOCK_VPU ++ { ++ int i; ++ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) { ++ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i; ++ ++ if (dvq->vpu_cmds_arm) { ++ gpu_free(&dvq->deblock_vpu_gmem); ++ dvq->vpu_cmds_arm = 0; ++ } ++ } ++ } ++#endif ++ av_freep(&s->sao); ++ av_freep(&s->deblock); ++ ++ av_freep(&s->skip_flag); ++ av_freep(&s->tab_ct_depth); ++ ++ av_freep(&s->tab_ipm); ++ av_freep(&s->cbf_luma); ++ av_freep(&s->is_pcm); ++ ++ av_freep(&s->qp_y_tab); ++ av_freep(&s->tab_slice_address); ++ av_freep(&s->filter_slice_edges); ++ ++ av_freep(&s->horizontal_bs); ++ av_freep(&s->vertical_bs); ++ ++ av_freep(&s->sh.entry_point_offset); ++ av_freep(&s->sh.size); ++ av_freep(&s->sh.offset); ++ ++ av_buffer_pool_uninit(&s->tab_mvf_pool); ++ av_buffer_pool_uninit(&s->rpl_tab_pool); ++} ++ ++/* allocate arrays that depend on frame dimensions */ ++static int pic_arrays_init(HEVCRpiContext *s, const HEVCRpiSPS *sps) ++{ ++ int log2_min_cb_size = sps->log2_min_cb_size; ++ int width = sps->width; ++ int height = sps->height; ++ int pic_size_in_ctb = ((width >> log2_min_cb_size) + 1) * ++ ((height >> log2_min_cb_size) + 1); ++ int ctb_count = sps->ctb_width * sps->ctb_height; ++ int min_pu_size = sps->min_pu_width * sps->min_pu_height; ++ ++#ifdef RPI_DEBLOCK_VPU ++ { ++ int i; ++ s->enable_rpi_deblock = !sps->sao_enabled; ++ s->setup_width = (sps->width+15) / 16; ++ s->setup_height = (sps->height+15) / 16; ++ s->uv_setup_width = ( (sps->width >> ctx_hshift(s, 1)) + 15) / 16; ++ s->uv_setup_height = ( (sps->height >> ctx_vshift(s, 1)) + 15) / 16; ++ ++ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) ++ { ++ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i; ++ const unsigned int cmd_size = (sizeof(*dvq->vpu_cmds_arm) * 3 + 15) & ~15; ++ const unsigned int y_size = (sizeof(*dvq->y_setup_arm) * s->setup_width * s->setup_height + 15) & ~15; ++ const unsigned int uv_size = (sizeof(*dvq->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height + 15) & ~15; ++ const unsigned int total_size = cmd_size + y_size + uv_size; ++ int p_vc; ++ uint8_t * p_arm; ++#if RPI_VPU_DEBLOCK_CACHED ++ gpu_malloc_cached(total_size, &dvq->deblock_vpu_gmem); ++#else ++ gpu_malloc_uncached(total_size, &dvq->deblock_vpu_gmem); ++#endif ++ p_vc = dvq->deblock_vpu_gmem.vc; ++ p_arm = dvq->deblock_vpu_gmem.arm; ++ ++ // Zap all ++ memset(p_arm, 0, dvq->deblock_vpu_gmem.numbytes); ++ ++ // Subdivide ++ dvq->vpu_cmds_arm = (void*)p_arm; ++ dvq->vpu_cmds_vc = p_vc; ++ ++ p_arm += cmd_size; ++ p_vc += cmd_size; ++ ++ dvq->y_setup_arm = (void*)p_arm; ++ dvq->y_setup_vc = (void*)p_vc; ++ ++ p_arm += y_size; ++ p_vc += y_size; ++ ++ dvq->uv_setup_arm = (void*)p_arm; ++ dvq->uv_setup_vc = (void*)p_vc; ++ } ++ ++ s->dvq_n = 0; ++ s->dvq = s->dvq_ents + s->dvq_n; ++ } ++#endif 
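The `RPI_DEBLOCK_VPU` block just above carves one GPU allocation into three 16-byte-aligned sub-buffers (VPU commands, luma setup, chroma setup), advancing an ARM-side pointer and a VideoCore bus address in lockstep. A hypothetical sketch of that round-up-and-subdivide idiom follows; `ALIGN16`, `gmem_t` and `carve16()` are invented names for illustration, not part of the patch.

```c
/* Hypothetical sketch of the 16-byte round-up and sub-allocation idiom. */
#include <stddef.h>
#include <stdint.h>

#define ALIGN16(x) (((x) + 15) & ~(size_t)15)  /* round a size up to 16 */

typedef struct {
    uint8_t *arm;  /* CPU-side (ARM) mapping of the GPU buffer */
    uint32_t vc;   /* matching VideoCore bus address of the same bytes */
} gmem_t;

/* Hand out the next 16-byte-aligned chunk, advancing both views in step
 * so that arm and vc always describe the same offset into the buffer. */
static uint8_t *carve16(gmem_t *g, size_t size, uint32_t *vc_out)
{
    uint8_t *const p = g->arm;
    *vc_out = g->vc;
    g->arm += ALIGN16(size);
    g->vc  += (uint32_t)ALIGN16(size);
    return p;
}
```

`pic_arrays_init()` does this by hand: it pre-aligns `cmd_size`, `y_size` and `uv_size` with `(x + 15) & ~15`, sums them into `total_size`, makes a single `gpu_malloc_cached()`/`gpu_malloc_uncached()` call, zeroes the lot, and then steps `p_arm`/`p_vc` past each chunk to place the three sub-buffers.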
++ ++ s->bs_width = (width >> 2) + 1; ++ s->bs_height = (height >> 2) + 1; ++ ++ s->sao = av_mallocz_array(ctb_count, sizeof(*s->sao)); ++ s->deblock = av_mallocz_array(ctb_count, sizeof(*s->deblock)); ++ if (!s->sao || !s->deblock) ++ goto fail; ++ ++ s->skip_flag = av_malloc_array(sps->min_cb_height, sps->min_cb_width); ++ s->tab_ct_depth = av_malloc_array(sps->min_cb_height, sps->min_cb_width); ++ if (!s->skip_flag || !s->tab_ct_depth) ++ goto fail; ++ ++ s->cbf_luma = av_malloc_array(sps->min_tb_width, sps->min_tb_height); ++ s->tab_ipm = av_mallocz(min_pu_size); ++ s->is_pcm = av_malloc_array(sps->min_pu_width + 1, sps->min_pu_height + 1); ++ if (!s->tab_ipm || !s->cbf_luma || !s->is_pcm) ++ goto fail; ++ ++ s->filter_slice_edges = av_mallocz(ctb_count); ++ s->tab_slice_address = av_malloc_array(pic_size_in_ctb, ++ sizeof(*s->tab_slice_address)); ++ s->qp_y_tab = av_malloc_array(pic_size_in_ctb, ++ sizeof(*s->qp_y_tab)); ++ if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address) ++ goto fail; ++ ++ s->horizontal_bs = av_mallocz_array(s->bs_width, s->bs_height); ++ s->vertical_bs = av_mallocz_array(s->bs_width, s->bs_height); ++ if (!s->horizontal_bs || !s->vertical_bs) ++ goto fail; ++ ++ s->tab_mvf_pool = av_buffer_pool_init(min_pu_size * sizeof(MvField), ++ av_buffer_allocz); ++ s->rpl_tab_pool = av_buffer_pool_init(ctb_count * sizeof(RefPicListTab), ++ av_buffer_allocz); ++ if (!s->tab_mvf_pool || !s->rpl_tab_pool) ++ goto fail; ++ ++ return 0; ++ ++fail: ++ pic_arrays_free(s); ++ return AVERROR(ENOMEM); ++} ++ ++static void default_pred_weight_table(HEVCRpiContext * const s) ++{ ++ unsigned int i; ++ s->sh.luma_log2_weight_denom = 0; ++ s->sh.chroma_log2_weight_denom = 0; ++ for (i = 0; i < s->sh.nb_refs[L0]; i++) { ++ s->sh.luma_weight_l0[i] = 1; ++ s->sh.luma_offset_l0[i] = 0; ++ s->sh.chroma_weight_l0[i][0] = 1; ++ s->sh.chroma_offset_l0[i][0] = 0; ++ s->sh.chroma_weight_l0[i][1] = 1; ++ s->sh.chroma_offset_l0[i][1] = 0; ++ } ++ for (i = 0; i < s->sh.nb_refs[L1]; i++) { ++ s->sh.luma_weight_l1[i] = 1; ++ s->sh.luma_offset_l1[i] = 0; ++ s->sh.chroma_weight_l1[i][0] = 1; ++ s->sh.chroma_offset_l1[i][0] = 0; ++ s->sh.chroma_weight_l1[i][1] = 1; ++ s->sh.chroma_offset_l1[i][1] = 0; ++ } ++} ++ ++static int pred_weight_table(HEVCRpiContext *s, GetBitContext *gb) ++{ ++ int i = 0; ++ int j = 0; ++ uint8_t luma_weight_l0_flag[16]; ++ uint8_t chroma_weight_l0_flag[16]; ++ uint8_t luma_weight_l1_flag[16]; ++ uint8_t chroma_weight_l1_flag[16]; ++ int luma_log2_weight_denom; ++ ++ luma_log2_weight_denom = get_ue_golomb_long(gb); ++ if (luma_log2_weight_denom < 0 || luma_log2_weight_denom > 7) ++ av_log(s->avctx, AV_LOG_ERROR, "luma_log2_weight_denom %d is invalid\n", luma_log2_weight_denom); ++ s->sh.luma_log2_weight_denom = av_clip_uintp2(luma_log2_weight_denom, 3); ++ if (ctx_cfmt(s) != 0) { ++ int delta = get_se_golomb(gb); ++ s->sh.chroma_log2_weight_denom = av_clip_uintp2(s->sh.luma_log2_weight_denom + delta, 3); ++ } ++ ++ for (i = 0; i < s->sh.nb_refs[L0]; i++) { ++ luma_weight_l0_flag[i] = get_bits1(gb); ++ if (!luma_weight_l0_flag[i]) { ++ s->sh.luma_weight_l0[i] = 1 << s->sh.luma_log2_weight_denom; ++ s->sh.luma_offset_l0[i] = 0; ++ } ++ } ++ if (ctx_cfmt(s) != 0) { ++ for (i = 0; i < s->sh.nb_refs[L0]; i++) ++ chroma_weight_l0_flag[i] = get_bits1(gb); ++ } else { ++ for (i = 0; i < s->sh.nb_refs[L0]; i++) ++ chroma_weight_l0_flag[i] = 0; ++ } ++ for (i = 0; i < s->sh.nb_refs[L0]; i++) { ++ if (luma_weight_l0_flag[i]) { ++ int delta_luma_weight_l0 = 
get_se_golomb(gb); ++ s->sh.luma_weight_l0[i] = (1 << s->sh.luma_log2_weight_denom) + delta_luma_weight_l0; ++ s->sh.luma_offset_l0[i] = get_se_golomb(gb); ++ } ++ if (chroma_weight_l0_flag[i]) { ++ for (j = 0; j < 2; j++) { ++ int delta_chroma_weight_l0 = get_se_golomb(gb); ++ int delta_chroma_offset_l0 = get_se_golomb(gb); ++ ++ if ( (int8_t)delta_chroma_weight_l0 != delta_chroma_weight_l0 ++ || delta_chroma_offset_l0 < -(1<<17) || delta_chroma_offset_l0 > (1<<17)) { ++ return AVERROR_INVALIDDATA; ++ } ++ ++ s->sh.chroma_weight_l0[i][j] = (1 << s->sh.chroma_log2_weight_denom) + delta_chroma_weight_l0; ++ s->sh.chroma_offset_l0[i][j] = av_clip((delta_chroma_offset_l0 - ((128 * s->sh.chroma_weight_l0[i][j]) ++ >> s->sh.chroma_log2_weight_denom) + 128), -128, 127); ++ } ++ } else { ++ s->sh.chroma_weight_l0[i][0] = 1 << s->sh.chroma_log2_weight_denom; ++ s->sh.chroma_offset_l0[i][0] = 0; ++ s->sh.chroma_weight_l0[i][1] = 1 << s->sh.chroma_log2_weight_denom; ++ s->sh.chroma_offset_l0[i][1] = 0; ++ } ++ } ++ if (s->sh.slice_type == HEVC_SLICE_B) { ++ for (i = 0; i < s->sh.nb_refs[L1]; i++) { ++ luma_weight_l1_flag[i] = get_bits1(gb); ++ if (!luma_weight_l1_flag[i]) { ++ s->sh.luma_weight_l1[i] = 1 << s->sh.luma_log2_weight_denom; ++ s->sh.luma_offset_l1[i] = 0; ++ } ++ } ++ if (ctx_cfmt(s) != 0) { ++ for (i = 0; i < s->sh.nb_refs[L1]; i++) ++ chroma_weight_l1_flag[i] = get_bits1(gb); ++ } else { ++ for (i = 0; i < s->sh.nb_refs[L1]; i++) ++ chroma_weight_l1_flag[i] = 0; ++ } ++ for (i = 0; i < s->sh.nb_refs[L1]; i++) { ++ if (luma_weight_l1_flag[i]) { ++ int delta_luma_weight_l1 = get_se_golomb(gb); ++ s->sh.luma_weight_l1[i] = (1 << s->sh.luma_log2_weight_denom) + delta_luma_weight_l1; ++ s->sh.luma_offset_l1[i] = get_se_golomb(gb); ++ } ++ if (chroma_weight_l1_flag[i]) { ++ for (j = 0; j < 2; j++) { ++ int delta_chroma_weight_l1 = get_se_golomb(gb); ++ int delta_chroma_offset_l1 = get_se_golomb(gb); ++ ++ if ( (int8_t)delta_chroma_weight_l1 != delta_chroma_weight_l1 ++ || delta_chroma_offset_l1 < -(1<<17) || delta_chroma_offset_l1 > (1<<17)) { ++ return AVERROR_INVALIDDATA; ++ } ++ ++ s->sh.chroma_weight_l1[i][j] = (1 << s->sh.chroma_log2_weight_denom) + delta_chroma_weight_l1; ++ s->sh.chroma_offset_l1[i][j] = av_clip((delta_chroma_offset_l1 - ((128 * s->sh.chroma_weight_l1[i][j]) ++ >> s->sh.chroma_log2_weight_denom) + 128), -128, 127); ++ } ++ } else { ++ s->sh.chroma_weight_l1[i][0] = 1 << s->sh.chroma_log2_weight_denom; ++ s->sh.chroma_offset_l1[i][0] = 0; ++ s->sh.chroma_weight_l1[i][1] = 1 << s->sh.chroma_log2_weight_denom; ++ s->sh.chroma_offset_l1[i][1] = 0; ++ } ++ } ++ } ++ return 0; ++} ++ ++static int decode_lt_rps(HEVCRpiContext *s, LongTermRPS *rps, GetBitContext *gb) ++{ ++ const HEVCRpiSPS *sps = s->ps.sps; ++ int max_poc_lsb = 1 << sps->log2_max_poc_lsb; ++ int prev_delta_msb = 0; ++ unsigned int nb_sps = 0, nb_sh; ++ int i; ++ ++ rps->nb_refs = 0; ++ if (!sps->long_term_ref_pics_present_flag) ++ return 0; ++ ++ if (sps->num_long_term_ref_pics_sps > 0) ++ nb_sps = get_ue_golomb_long(gb); ++ nb_sh = get_ue_golomb_long(gb); ++ ++ if (nb_sps > sps->num_long_term_ref_pics_sps) ++ return AVERROR_INVALIDDATA; ++ if (nb_sh + (uint64_t)nb_sps > FF_ARRAY_ELEMS(rps->poc)) ++ return AVERROR_INVALIDDATA; ++ ++ rps->nb_refs = nb_sh + nb_sps; ++ ++ for (i = 0; i < rps->nb_refs; i++) { ++ uint8_t delta_poc_msb_present; ++ ++ if (i < nb_sps) { ++ uint8_t lt_idx_sps = 0; ++ ++ if (sps->num_long_term_ref_pics_sps > 1) ++ lt_idx_sps = get_bits(gb, 
av_ceil_log2(sps->num_long_term_ref_pics_sps)); ++ ++ rps->poc[i] = sps->lt_ref_pic_poc_lsb_sps[lt_idx_sps]; ++ rps->used[i] = sps->used_by_curr_pic_lt_sps_flag[lt_idx_sps]; ++ } else { ++ rps->poc[i] = get_bits(gb, sps->log2_max_poc_lsb); ++ rps->used[i] = get_bits1(gb); ++ } ++ ++ delta_poc_msb_present = get_bits1(gb); ++ if (delta_poc_msb_present) { ++ int64_t delta = get_ue_golomb_long(gb); ++ int64_t poc; ++ ++ if (i && i != nb_sps) ++ delta += prev_delta_msb; ++ ++ poc = rps->poc[i] + s->poc - delta * max_poc_lsb - s->sh.pic_order_cnt_lsb; ++ if (poc != (int32_t)poc) ++ return AVERROR_INVALIDDATA; ++ rps->poc[i] = poc; ++ prev_delta_msb = delta; ++ } ++ } ++ ++ return 0; ++} ++ ++static void export_stream_params(AVCodecContext *avctx, const HEVCRpiParamSets *ps, ++ const HEVCRpiSPS *sps) ++{ ++ const HEVCRpiVPS *vps = (const HEVCRpiVPS*)ps->vps_list[sps->vps_id]->data; ++ const HEVCWindow *ow = &sps->output_window; ++ unsigned int num = 0, den = 0; ++ ++ avctx->pix_fmt = sps->pix_fmt; ++ avctx->coded_width = sps->width; ++ avctx->coded_height = sps->height; ++ avctx->width = sps->width - ow->left_offset - ow->right_offset; ++ avctx->height = sps->height - ow->top_offset - ow->bottom_offset; ++ avctx->has_b_frames = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics; ++ avctx->profile = sps->ptl.general_ptl.profile_idc; ++ avctx->level = sps->ptl.general_ptl.level_idc; ++ ++ ff_set_sar(avctx, sps->vui.sar); ++ ++ if (sps->vui.video_signal_type_present_flag) ++ avctx->color_range = sps->vui.video_full_range_flag ? AVCOL_RANGE_JPEG ++ : AVCOL_RANGE_MPEG; ++ else ++ avctx->color_range = AVCOL_RANGE_MPEG; ++ ++ if (sps->vui.colour_description_present_flag) { ++ avctx->color_primaries = sps->vui.colour_primaries; ++ avctx->color_trc = sps->vui.transfer_characteristic; ++ avctx->colorspace = sps->vui.matrix_coeffs; ++ } else { ++ avctx->color_primaries = AVCOL_PRI_UNSPECIFIED; ++ avctx->color_trc = AVCOL_TRC_UNSPECIFIED; ++ avctx->colorspace = AVCOL_SPC_UNSPECIFIED; ++ } ++ ++ if (vps->vps_timing_info_present_flag) { ++ num = vps->vps_num_units_in_tick; ++ den = vps->vps_time_scale; ++ } else if (sps->vui.vui_timing_info_present_flag) { ++ num = sps->vui.vui_num_units_in_tick; ++ den = sps->vui.vui_time_scale; ++ } ++ ++ if (num != 0 && den != 0) ++ av_reduce(&avctx->framerate.den, &avctx->framerate.num, ++ num, den, 1 << 30); ++} ++ ++static enum AVPixelFormat get_format(HEVCRpiContext *s, const HEVCRpiSPS *sps) ++{ ++ enum AVPixelFormat pix_fmts[4], *fmt = pix_fmts; ++ ++ // Admit to no h/w formats ++ ++ *fmt++ = sps->pix_fmt; ++ *fmt = AV_PIX_FMT_NONE; ++ ++ return pix_fmts[0] == AV_PIX_FMT_NONE ? 
AV_PIX_FMT_NONE: ff_thread_get_format(s->avctx, pix_fmts); ++} ++ ++static int is_sps_supported(const HEVCRpiSPS * const sps) ++{ ++ return av_rpi_is_sand_format(sps->pix_fmt) && ++ sps->width <= HEVC_RPI_MAX_WIDTH && ++ sps->height <= HEVC_RPI_MAX_HEIGHT; ++} ++ ++static int set_sps(HEVCRpiContext * const s, const HEVCRpiSPS * const sps, ++ const enum AVPixelFormat pix_fmt) ++{ ++ int ret; ++ ++ pic_arrays_free(s); ++ s->ps.sps = NULL; ++ s->ps.vps = NULL; ++ ++ if (sps == NULL) ++ return 0; ++ ++ if (!is_sps_supported(sps)) ++ return AVERROR_DECODER_NOT_FOUND; ++ ++ ret = pic_arrays_init(s, sps); ++ if (ret < 0) ++ goto fail; ++ ++ export_stream_params(s->avctx, &s->ps, sps); ++ ++ s->avctx->pix_fmt = pix_fmt; ++ ++ ff_hevc_rpi_pred_init(&s->hpc, sps->bit_depth); ++ ff_hevc_rpi_dsp_init (&s->hevcdsp, sps->bit_depth); ++ ff_videodsp_init (&s->vdsp, sps->bit_depth); ++ ++ // * We don't support cross_component_prediction_enabled_flag but as that ++ // must be 0 unless we have 4:4:4 there is no point testing for it as we ++ // only deal with sand which is never 4:4:4 ++ // [support wouldn't be hard] ++ ++ rpi_hevc_qpu_set_fns(s, sps->bit_depth); ++ ++ av_freep(&s->sao_pixel_buffer_h[0]); ++ av_freep(&s->sao_pixel_buffer_v[0]); ++ ++ if (sps->sao_enabled) ++ { ++ const unsigned int c_count = (ctx_cfmt(s) != 0) ? 3 : 1; ++ unsigned int c_idx; ++ size_t vsize[3] = {0}; ++ size_t hsize[3] = {0}; ++ ++ for(c_idx = 0; c_idx < c_count; c_idx++) { ++ int w = sps->width >> ctx_hshift(s, c_idx); ++ int h = sps->height >> ctx_vshift(s, c_idx); ++ // ctb height & width are a min of 8 so this must be a multiple of 16 ++ // so no point rounding up! ++ hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift; ++ vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift; ++ } ++ ++ // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2] ++ // when we have plaited chroma ++ s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]); ++ s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]); ++ s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0]; ++ s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1]; ++ s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0]; ++ s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1]; ++ } ++ ++ s->ps.sps = sps; ++ s->ps.vps = (HEVCRpiVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data; ++ ++ return 0; ++ ++fail: ++ pic_arrays_free(s); ++ s->ps.sps = NULL; ++ return ret; ++} ++ ++static int hls_slice_header(HEVCRpiContext *s) ++{ ++ GetBitContext *gb = &s->HEVClc->gb; ++ SliceHeader *sh = &s->sh; ++ int i, ret; ++ ++ // Coded parameters ++ sh->first_slice_in_pic_flag = get_bits1(gb); ++ if ((IS_IDR(s) || IS_BLA(s)) && sh->first_slice_in_pic_flag) { ++ s->seq_decode = (s->seq_decode + 1) & 0xff; ++ s->max_ra = INT_MAX; ++ if (IS_IDR(s)) ++ ff_hevc_rpi_clear_refs(s); ++ } ++ sh->no_output_of_prior_pics_flag = 0; ++ if (IS_IRAP(s)) ++ sh->no_output_of_prior_pics_flag = get_bits1(gb); ++ ++ sh->pps_id = get_ue_golomb_long(gb); ++ if (sh->pps_id >= HEVC_MAX_PPS_COUNT || !s->ps.pps_list[sh->pps_id]) { ++ av_log(s->avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", sh->pps_id); ++ return AVERROR_INVALIDDATA; ++ } ++ if (!sh->first_slice_in_pic_flag && ++ s->ps.pps != (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data) { ++ av_log(s->avctx, AV_LOG_ERROR, "PPS changed between slices.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ s->ps.pps = (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data; ++ if (s->nal_unit_type == 
HEVC_NAL_CRA_NUT && s->last_eos == 1) ++ sh->no_output_of_prior_pics_flag = 1; ++ ++ if (s->ps.sps != (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) { ++ const HEVCRpiSPS *sps = (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data; ++ const HEVCRpiSPS *last_sps = s->ps.sps; ++ ++ if (last_sps && IS_IRAP(s) && s->nal_unit_type != HEVC_NAL_CRA_NUT) { ++ if (sps->width != last_sps->width || sps->height != last_sps->height || ++ sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering != ++ last_sps->temporal_layer[last_sps->max_sub_layers - 1].max_dec_pic_buffering) ++ sh->no_output_of_prior_pics_flag = 0; ++ } ++ ff_hevc_rpi_clear_refs(s); ++ ++ ret = set_sps(s, sps, get_format(s, sps)); ++ if (ret < 0) ++ return ret; ++ ++ s->seq_decode = (s->seq_decode + 1) & 0xff; ++ s->max_ra = INT_MAX; ++ } ++ ++ sh->dependent_slice_segment_flag = 0; ++ if (!sh->first_slice_in_pic_flag) { ++ int slice_address_length; ++ ++ if (s->ps.pps->dependent_slice_segments_enabled_flag) ++ sh->dependent_slice_segment_flag = get_bits1(gb); ++ ++ slice_address_length = av_ceil_log2(s->ps.sps->ctb_width * ++ s->ps.sps->ctb_height); ++ sh->slice_segment_addr = get_bitsz(gb, slice_address_length); ++ if (sh->slice_segment_addr >= s->ps.sps->ctb_width * s->ps.sps->ctb_height) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Invalid slice segment address: %u.\n", ++ sh->slice_segment_addr); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (!sh->dependent_slice_segment_flag) { ++ sh->slice_addr = sh->slice_segment_addr; ++ s->slice_idx++; ++ } ++ } else { ++ sh->slice_segment_addr = sh->slice_addr = 0; ++ s->slice_idx = 0; ++ s->slice_initialized = 0; ++ } ++ ++ if (!sh->dependent_slice_segment_flag) { ++ s->slice_initialized = 0; ++ ++ for (i = 0; i < s->ps.pps->num_extra_slice_header_bits; i++) ++ skip_bits(gb, 1); // slice_reserved_undetermined_flag[] ++ ++ sh->slice_type = get_ue_golomb_long(gb); ++ if (!(sh->slice_type == HEVC_SLICE_I || ++ sh->slice_type == HEVC_SLICE_P || ++ sh->slice_type == HEVC_SLICE_B)) { ++ av_log(s->avctx, AV_LOG_ERROR, "Unknown slice type: %d.\n", ++ sh->slice_type); ++ return AVERROR_INVALIDDATA; ++ } ++ if (IS_IRAP(s) && sh->slice_type != HEVC_SLICE_I) { ++ av_log(s->avctx, AV_LOG_ERROR, "Inter slices in an IRAP frame.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ // when flag is not present, picture is inferred to be output ++ sh->pic_output_flag = 1; ++ if (s->ps.pps->output_flag_present_flag) ++ sh->pic_output_flag = get_bits1(gb); ++ ++ if (s->ps.sps->separate_colour_plane_flag) ++ sh->colour_plane_id = get_bits(gb, 2); ++ ++ if (!IS_IDR(s)) { ++ int poc, pos; ++ ++ sh->pic_order_cnt_lsb = get_bits(gb, s->ps.sps->log2_max_poc_lsb); ++ poc = ff_hevc_rpi_compute_poc(s->ps.sps, s->pocTid0, sh->pic_order_cnt_lsb, s->nal_unit_type); ++ if (!sh->first_slice_in_pic_flag && poc != s->poc) { ++ av_log(s->avctx, AV_LOG_WARNING, ++ "Ignoring POC change between slices: %d -> %d\n", s->poc, poc); ++ if (s->avctx->err_recognition & AV_EF_EXPLODE) ++ return AVERROR_INVALIDDATA; ++ poc = s->poc; ++ } ++ s->poc = poc; ++ ++ sh->short_term_ref_pic_set_sps_flag = get_bits1(gb); ++ pos = get_bits_left(gb); ++ if (!sh->short_term_ref_pic_set_sps_flag) { ++ ret = ff_hevc_rpi_decode_short_term_rps(gb, s->avctx, &sh->slice_rps, s->ps.sps, 1); ++ if (ret < 0) ++ return ret; ++ ++ sh->short_term_rps = &sh->slice_rps; ++ } else { ++ int numbits, rps_idx; ++ ++ if (!s->ps.sps->nb_st_rps) { ++ av_log(s->avctx, AV_LOG_ERROR, "No ref lists in the SPS.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ numbits = 
av_ceil_log2(s->ps.sps->nb_st_rps); ++ rps_idx = numbits > 0 ? get_bits(gb, numbits) : 0; ++ sh->short_term_rps = &s->ps.sps->st_rps[rps_idx]; ++ } ++ sh->short_term_ref_pic_set_size = pos - get_bits_left(gb); ++ ++ pos = get_bits_left(gb); ++ ret = decode_lt_rps(s, &sh->long_term_rps, gb); ++ if (ret < 0) { ++ av_log(s->avctx, AV_LOG_WARNING, "Invalid long term RPS.\n"); ++ if (s->avctx->err_recognition & AV_EF_EXPLODE) ++ return AVERROR_INVALIDDATA; ++ } ++ sh->long_term_ref_pic_set_size = pos - get_bits_left(gb); ++ ++ if (s->ps.sps->sps_temporal_mvp_enabled_flag) ++ sh->slice_temporal_mvp_enabled_flag = get_bits1(gb); ++ else ++ sh->slice_temporal_mvp_enabled_flag = 0; ++ } else { ++ s->sh.short_term_rps = NULL; ++ s->poc = 0; ++ } ++ ++ /* 8.3.1 */ ++ if (sh->first_slice_in_pic_flag && s->temporal_id == 0 && ++ s->nal_unit_type != HEVC_NAL_TRAIL_N && ++ s->nal_unit_type != HEVC_NAL_TSA_N && ++ s->nal_unit_type != HEVC_NAL_STSA_N && ++ s->nal_unit_type != HEVC_NAL_RADL_N && ++ s->nal_unit_type != HEVC_NAL_RADL_R && ++ s->nal_unit_type != HEVC_NAL_RASL_N && ++ s->nal_unit_type != HEVC_NAL_RASL_R) ++ s->pocTid0 = s->poc; ++ ++ if (s->ps.sps->sao_enabled) { ++ sh->slice_sample_adaptive_offset_flag[0] = get_bits1(gb); ++ if (ctx_cfmt(s) != 0) { ++ sh->slice_sample_adaptive_offset_flag[1] = ++ sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb); ++ } ++ } else { ++ sh->slice_sample_adaptive_offset_flag[0] = 0; ++ sh->slice_sample_adaptive_offset_flag[1] = 0; ++ sh->slice_sample_adaptive_offset_flag[2] = 0; ++ } ++ ++ sh->nb_refs[L0] = sh->nb_refs[L1] = 0; ++ if (sh->slice_type == HEVC_SLICE_P || sh->slice_type == HEVC_SLICE_B) { ++ int nb_refs; ++ ++ sh->nb_refs[L0] = s->ps.pps->num_ref_idx_l0_default_active; ++ if (sh->slice_type == HEVC_SLICE_B) ++ sh->nb_refs[L1] = s->ps.pps->num_ref_idx_l1_default_active; ++ ++ if (get_bits1(gb)) { // num_ref_idx_active_override_flag ++ sh->nb_refs[L0] = get_ue_golomb_long(gb) + 1; ++ if (sh->slice_type == HEVC_SLICE_B) ++ sh->nb_refs[L1] = get_ue_golomb_long(gb) + 1; ++ } ++ if (sh->nb_refs[L0] > HEVC_MAX_REFS || sh->nb_refs[L1] > HEVC_MAX_REFS) { ++ av_log(s->avctx, AV_LOG_ERROR, "Too many refs: %d/%d.\n", ++ sh->nb_refs[L0], sh->nb_refs[L1]); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sh->rpl_modification_flag[0] = 0; ++ sh->rpl_modification_flag[1] = 0; ++ nb_refs = ff_hevc_rpi_frame_nb_refs(s); ++ if (!nb_refs) { ++ av_log(s->avctx, AV_LOG_ERROR, "Zero refs for a frame with P or B slices.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (s->ps.pps->lists_modification_present_flag && nb_refs > 1) { ++ sh->rpl_modification_flag[0] = get_bits1(gb); ++ if (sh->rpl_modification_flag[0]) { ++ for (i = 0; i < sh->nb_refs[L0]; i++) ++ sh->list_entry_lx[0][i] = get_bits(gb, av_ceil_log2(nb_refs)); ++ } ++ ++ if (sh->slice_type == HEVC_SLICE_B) { ++ sh->rpl_modification_flag[1] = get_bits1(gb); ++ if (sh->rpl_modification_flag[1] == 1) ++ for (i = 0; i < sh->nb_refs[L1]; i++) ++ sh->list_entry_lx[1][i] = get_bits(gb, av_ceil_log2(nb_refs)); ++ } ++ } ++ ++ if (sh->slice_type == HEVC_SLICE_B) ++ sh->mvd_l1_zero_flag = get_bits1(gb); ++ ++ if (s->ps.pps->cabac_init_present_flag) ++ sh->cabac_init_flag = get_bits1(gb); ++ else ++ sh->cabac_init_flag = 0; ++ ++ sh->collocated_ref_idx = 0; ++ if (sh->slice_temporal_mvp_enabled_flag) { ++ sh->collocated_list = L0; ++ if (sh->slice_type == HEVC_SLICE_B) ++ sh->collocated_list = !get_bits1(gb); ++ ++ if (sh->nb_refs[sh->collocated_list] > 1) { ++ sh->collocated_ref_idx = get_ue_golomb_long(gb); ++ if 
(sh->collocated_ref_idx >= sh->nb_refs[sh->collocated_list]) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Invalid collocated_ref_idx: %d.\n", ++ sh->collocated_ref_idx); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ } ++ ++ if ((s->ps.pps->weighted_pred_flag && sh->slice_type == HEVC_SLICE_P) || ++ (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B)) { ++ int ret = pred_weight_table(s, gb); ++ if (ret < 0) ++ return ret; ++ } ++ else ++ { ++ // Give us unit weights ++ default_pred_weight_table(s); ++ } ++ ++ sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); ++ if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Invalid number of merging MVP candidates: %d.\n", ++ sh->max_num_merge_cand); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ ++ sh->slice_qp_delta = get_se_golomb(gb); ++ ++ if (s->ps.pps->pic_slice_level_chroma_qp_offsets_present_flag) { ++ sh->slice_cb_qp_offset = get_se_golomb(gb); ++ sh->slice_cr_qp_offset = get_se_golomb(gb); ++ } else { ++ sh->slice_cb_qp_offset = 0; ++ sh->slice_cr_qp_offset = 0; ++ } ++ ++ if (s->ps.pps->chroma_qp_offset_list_enabled_flag) ++ sh->cu_chroma_qp_offset_enabled_flag = get_bits1(gb); ++ else ++ sh->cu_chroma_qp_offset_enabled_flag = 0; ++ ++ if (s->ps.pps->deblocking_filter_control_present_flag) { ++ int deblocking_filter_override_flag = 0; ++ ++ if (s->ps.pps->deblocking_filter_override_enabled_flag) ++ deblocking_filter_override_flag = get_bits1(gb); ++ ++ if (deblocking_filter_override_flag) { ++ sh->disable_deblocking_filter_flag = get_bits1(gb); ++ if (!sh->disable_deblocking_filter_flag) { ++ int beta_offset_div2 = get_se_golomb(gb); ++ int tc_offset_div2 = get_se_golomb(gb) ; ++ if (beta_offset_div2 < -6 || beta_offset_div2 > 6 || ++ tc_offset_div2 < -6 || tc_offset_div2 > 6) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Invalid deblock filter offsets: %d, %d\n", ++ beta_offset_div2, tc_offset_div2); ++ return AVERROR_INVALIDDATA; ++ } ++ sh->beta_offset = beta_offset_div2 * 2; ++ sh->tc_offset = tc_offset_div2 * 2; ++ } ++ } else { ++ sh->disable_deblocking_filter_flag = s->ps.pps->disable_dbf; ++ sh->beta_offset = s->ps.pps->beta_offset; ++ sh->tc_offset = s->ps.pps->tc_offset; ++ } ++ } else { ++ sh->disable_deblocking_filter_flag = 0; ++ sh->beta_offset = 0; ++ sh->tc_offset = 0; ++ } ++ ++ if (s->ps.pps->seq_loop_filter_across_slices_enabled_flag && ++ (sh->slice_sample_adaptive_offset_flag[0] || ++ sh->slice_sample_adaptive_offset_flag[1] || ++ !sh->disable_deblocking_filter_flag)) { ++ sh->slice_loop_filter_across_slices_enabled_flag = get_bits1(gb); ++ } else { ++ sh->slice_loop_filter_across_slices_enabled_flag = s->ps.pps->seq_loop_filter_across_slices_enabled_flag; ++ } ++ } else if (!s->slice_initialized) { ++ av_log(s->avctx, AV_LOG_ERROR, "Independent slice segment missing.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sh->num_entry_point_offsets = 0; ++ if (s->ps.pps->tiles_enabled_flag || s->ps.pps->entropy_coding_sync_enabled_flag) { ++ unsigned num_entry_point_offsets = get_ue_golomb_long(gb); ++ // It would be possible to bound this tighter but this here is simpler ++ if (num_entry_point_offsets > get_bits_left(gb)) { ++ av_log(s->avctx, AV_LOG_ERROR, "num_entry_point_offsets %d is invalid\n", num_entry_point_offsets); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sh->num_entry_point_offsets = num_entry_point_offsets; ++ if (sh->num_entry_point_offsets > 0) { ++ int offset_len = get_ue_golomb_long(gb) + 1; ++ ++ if (offset_len < 1 || offset_len > 32) { ++ 
sh->num_entry_point_offsets = 0; ++ av_log(s->avctx, AV_LOG_ERROR, "offset_len %d is invalid\n", offset_len); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ av_freep(&sh->entry_point_offset); ++ av_freep(&sh->offset); ++ av_freep(&sh->size); ++ sh->entry_point_offset = av_malloc_array(sh->num_entry_point_offsets, sizeof(unsigned)); ++ sh->offset = av_malloc_array(sh->num_entry_point_offsets, sizeof(int)); ++ sh->size = av_malloc_array(sh->num_entry_point_offsets, sizeof(int)); ++ if (!sh->entry_point_offset || !sh->offset || !sh->size) { ++ sh->num_entry_point_offsets = 0; ++ av_log(s->avctx, AV_LOG_ERROR, "Failed to allocate memory\n"); ++ return AVERROR(ENOMEM); ++ } ++ for (i = 0; i < sh->num_entry_point_offsets; i++) { ++ unsigned val = get_bits_long(gb, offset_len); ++ sh->entry_point_offset[i] = val + 1; // +1 to get the size ++ } ++ if (s->threads_number > 1 && (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)) { ++ s->enable_parallel_tiles = 0; // TODO: you can enable tiles in parallel here ++ s->threads_number = 1; ++ } else ++ s->enable_parallel_tiles = 0; ++ } else ++ s->enable_parallel_tiles = 0; ++ } ++ ++ if (s->ps.pps->slice_header_extension_present_flag) { ++ unsigned int length = get_ue_golomb_long(gb); ++ if (length*8LL > get_bits_left(gb)) { ++ av_log(s->avctx, AV_LOG_ERROR, "too many slice_header_extension_data_bytes\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ for (i = 0; i < length; i++) ++ skip_bits(gb, 8); // slice_header_extension_data_byte ++ } ++ ++ // Inferred parameters ++ sh->slice_qp = 26U + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta; ++ if (sh->slice_qp > 51 || ++ sh->slice_qp < -s->ps.sps->qp_bd_offset) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "The slice_qp %d is outside the valid range " ++ "[%d, 51].\n", ++ sh->slice_qp, ++ -s->ps.sps->qp_bd_offset); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sh->slice_ctb_addr_rs = sh->slice_segment_addr; ++ ++ if (!s->sh.slice_ctb_addr_rs && s->sh.dependent_slice_segment_flag) { ++ av_log(s->avctx, AV_LOG_ERROR, "Impossible slice segment.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (get_bits_left(gb) < 0) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Overread slice header by %d bits\n", -get_bits_left(gb)); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ s->slice_initialized = 1; ++ return 0; ++} ++ ++static void hls_sao_param(const HEVCRpiContext *s, HEVCRpiLocalContext * const lc, const int rx, const int ry) ++{ ++ SAOParams * const sao = s->sao + rx + ry * s->ps.sps->ctb_width; ++ int c_idx, i; ++ ++ if (s->sh.slice_sample_adaptive_offset_flag[0] || ++ s->sh.slice_sample_adaptive_offset_flag[1]) { ++ if (lc->ctb_left_flag) ++ { ++ const int sao_merge_left_flag = ff_hevc_rpi_sao_merge_flag_decode(lc); ++ if (sao_merge_left_flag) { ++ *sao = sao[-1]; ++ return; ++ } ++ } ++ if (lc->ctb_up_flag) ++ { ++ const int sao_merge_up_flag = ff_hevc_rpi_sao_merge_flag_decode(lc); ++ if (sao_merge_up_flag) { ++ *sao = sao[-(int)s->ps.sps->ctb_width]; ++ return; ++ } ++ } ++ } ++ ++ for (c_idx = 0; c_idx < (ctx_cfmt(s) != 0 ? 3 : 1); c_idx++) { ++ const unsigned int log2_sao_offset_scale = c_idx == 0 ? 
s->ps.pps->log2_sao_offset_scale_luma : ++ s->ps.pps->log2_sao_offset_scale_chroma; ++ int offset_abs[4]; ++ char offset_sign[4] = {0}; ++ ++ if (!s->sh.slice_sample_adaptive_offset_flag[c_idx]) { ++ sao->type_idx[c_idx] = SAO_NOT_APPLIED; ++ continue; ++ } ++ ++ if (c_idx == 2) { ++ sao->type_idx[2] = sao->type_idx[1]; ++ sao->eo_class[2] = sao->eo_class[1]; ++ } else { ++ sao->type_idx[c_idx] = ff_hevc_rpi_sao_type_idx_decode(lc); ++ } ++ ++ // ** Could use BY22 here quite plausibly - this is all bypass stuff ++ // though only per CTB so not very timing critical ++ ++ if (sao->type_idx[c_idx] == SAO_NOT_APPLIED) ++ continue; ++ ++ for (i = 0; i < 4; i++) ++ offset_abs[i] = ff_hevc_rpi_sao_offset_abs_decode(s, lc); ++ ++ if (sao->type_idx[c_idx] == SAO_BAND) { ++ for (i = 0; i < 4; i++) { ++ if (offset_abs[i] != 0) ++ offset_sign[i] = ff_hevc_rpi_sao_offset_sign_decode(lc); ++ } ++ sao->band_position[c_idx] = ff_hevc_rpi_sao_band_position_decode(lc); ++ } else if (c_idx != 2) { ++ sao->eo_class[c_idx] = ff_hevc_rpi_sao_eo_class_decode(lc); ++ } ++ ++ // Inferred parameters ++ sao->offset_val[c_idx][0] = 0; ++ for (i = 0; i < 4; i++) { ++ sao->offset_val[c_idx][i + 1] = offset_abs[i] << log2_sao_offset_scale; ++ if (sao->type_idx[c_idx] == SAO_EDGE) { ++ if (i > 1) ++ sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1]; ++ } else if (offset_sign[i]) { ++ sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1]; ++ } ++ } ++ } ++} ++ ++ ++static int hls_cross_component_pred(HEVCRpiLocalContext * const lc, const int idx) { ++ int log2_res_scale_abs_plus1 = ff_hevc_rpi_log2_res_scale_abs(lc, idx); ++ ++ if (log2_res_scale_abs_plus1 != 0) { ++ int res_scale_sign_flag = ff_hevc_rpi_res_scale_sign_flag(lc, idx); ++ lc->tu.res_scale_val = (1 << (log2_res_scale_abs_plus1 - 1)) * ++ (1 - 2 * res_scale_sign_flag); ++ } else { ++ lc->tu.res_scale_val = 0; ++ } ++ ++ ++ return 0; ++} ++ ++static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCRpiJob * const jb) ++{ ++ return jb->intra.cmds + jb->intra.n++; ++} ++ ++static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx) ++{ ++ // If rpi_enabled then sand - U & V done on U call ++ if (c_idx <= 1) ++ { ++ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0); ++ cmd->type = RPI_PRED_INTRA; ++ cmd->size = log2_trafo_size; ++ cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right; ++ cmd->c_idx = c_idx; ++ cmd->i_pred.x = x0; ++ cmd->i_pred.y = y0; ++ cmd->i_pred.mode = c_idx ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; ++ } ++} ++ ++static int hls_transform_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, ++ int xBase, int yBase, int cb_xBase, int cb_yBase, ++ int log2_cb_size, int log2_trafo_size, ++ int blk_idx, int cbf_luma, int *cbf_cb, int *cbf_cr) ++{ ++// const int log2_trafo_size_c = log2_trafo_size - s->ps.sps->hshift[1]; ++ const int log2_trafo_size_c = log2_trafo_size - ctx_hshift(s, 1); ++ int i; ++ ++ if (lc->cu.pred_mode == MODE_INTRA) { ++ int trafo_size = 1 << log2_trafo_size; ++ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0, trafo_size, trafo_size); ++ do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0); ++ } ++ ++ if (cbf_luma || cbf_cb[0] || cbf_cr[0] || ++ (ctx_cfmt(s) == 2 && (cbf_cb[1] || cbf_cr[1]))) { ++ int scan_idx = SCAN_DIAG; ++ int scan_idx_c = SCAN_DIAG; ++ int cbf_chroma = cbf_cb[0] || cbf_cr[0] || ++ (ctx_cfmt(s) == 2 && ++ (cbf_cb[1] || cbf_cr[1])); ++ ++ if (s->ps.pps->cu_qp_delta_enabled_flag && !lc->tu.is_cu_qp_delta_coded) { ++ lc->tu.cu_qp_delta = ff_hevc_rpi_cu_qp_delta_abs(lc); ++ if (lc->tu.cu_qp_delta != 0) ++ if (ff_hevc_rpi_cu_qp_delta_sign_flag(lc) == 1) ++ lc->tu.cu_qp_delta = -lc->tu.cu_qp_delta; ++ lc->tu.is_cu_qp_delta_coded = 1; ++ ++ if (lc->tu.cu_qp_delta < -(26 + s->ps.sps->qp_bd_offset / 2) || ++ lc->tu.cu_qp_delta > (25 + s->ps.sps->qp_bd_offset / 2)) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "The cu_qp_delta %d is outside the valid range " ++ "[%d, %d].\n", ++ lc->tu.cu_qp_delta, ++ -(26 + s->ps.sps->qp_bd_offset / 2), ++ (25 + s->ps.sps->qp_bd_offset / 2)); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ ff_hevc_rpi_set_qPy(s, lc, cb_xBase, cb_yBase, log2_cb_size); ++ } ++ ++ if (!lc->tu.is_cu_chroma_qp_offset_coded && cbf_chroma && ++ !lc->cu.cu_transquant_bypass_flag) { ++ int cu_chroma_qp_offset_flag = ff_hevc_rpi_cu_chroma_qp_offset_flag(lc); ++ if (cu_chroma_qp_offset_flag) { ++ int cu_chroma_qp_offset_idx = 0; ++ if (s->ps.pps->chroma_qp_offset_list_len_minus1 > 0) { ++ cu_chroma_qp_offset_idx = ff_hevc_rpi_cu_chroma_qp_offset_idx(s, lc); ++ av_log(s->avctx, AV_LOG_ERROR, ++ "cu_chroma_qp_offset_idx not yet tested.\n"); ++ } ++ lc->tu.cu_qp_offset_cb = s->ps.pps->cb_qp_offset_list[cu_chroma_qp_offset_idx]; ++ lc->tu.cu_qp_offset_cr = s->ps.pps->cr_qp_offset_list[cu_chroma_qp_offset_idx]; ++ } ++ lc->tu.is_cu_chroma_qp_offset_coded = 1; ++ } ++ ++ if (lc->cu.pred_mode == MODE_INTRA && log2_trafo_size < 4) { ++ if (lc->tu.intra_pred_mode >= 6 && ++ lc->tu.intra_pred_mode <= 14) { ++ scan_idx = SCAN_VERT; ++ } else if (lc->tu.intra_pred_mode >= 22 && ++ lc->tu.intra_pred_mode <= 30) { ++ scan_idx = SCAN_HORIZ; ++ } ++ ++ if (lc->tu.intra_pred_mode_c >= 6 && ++ lc->tu.intra_pred_mode_c <= 14) { ++ scan_idx_c = SCAN_VERT; ++ } else if (lc->tu.intra_pred_mode_c >= 22 && ++ lc->tu.intra_pred_mode_c <= 30) { ++ scan_idx_c = SCAN_HORIZ; ++ } ++ } ++ ++ lc->tu.cross_pf = 0; ++ ++ if (cbf_luma) ++ ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0, log2_trafo_size, scan_idx, 0); ++ if (ctx_cfmt(s) != 0 && (log2_trafo_size > 2 || ctx_cfmt(s) == 3)) { ++ const int trafo_size_h = 1 << (log2_trafo_size_c + ctx_hshift(s, 1)); ++ const int trafo_size_v = 1 << (log2_trafo_size_c + ctx_vshift(s, 1)); ++ lc->tu.cross_pf = (s->ps.pps->cross_component_prediction_enabled_flag && cbf_luma && ++ (lc->cu.pred_mode == MODE_INTER || ++ (lc->tu.chroma_mode_c == 4))); ++ ++ if (lc->tu.cross_pf) { ++ hls_cross_component_pred(lc, 0); ++ } ++ for (i = 0; i < (ctx_cfmt(s) == 2 ? 
2 : 1); i++) { ++ if (lc->cu.pred_mode == MODE_INTRA) { ++ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); ++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1); ++ } ++ if (cbf_cb[i]) ++ ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0 + (i << log2_trafo_size_c), ++ log2_trafo_size_c, scan_idx_c, 1); ++ else ++ if (lc->tu.cross_pf) { ++ const ptrdiff_t stride = frame_stride1(s->frame, 1); ++ const int hshift = ctx_hshift(s, 1); ++ const int vshift = ctx_vshift(s, 1); ++ int16_t * const coeffs_y = (int16_t*)lc->edge_emu_buffer; ++ int16_t * const coeffs = (int16_t*)lc->edge_emu_buffer2; ++ int size = 1 << log2_trafo_size_c; ++ ++ uint8_t *dst = &s->frame->data[1][(y0 >> vshift) * stride + ++ ((x0 >> hshift) << s->ps.sps->pixel_shift)]; ++ for (i = 0; i < (size * size); i++) { ++ coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); ++ } ++ s->hevcdsp.add_residual[log2_trafo_size_c-2](dst, coeffs, stride); ++ } ++ } ++ ++ if (lc->tu.cross_pf) { ++ hls_cross_component_pred(lc, 1); ++ } ++ for (i = 0; i < (ctx_cfmt(s) == 2 ? 2 : 1); i++) { ++ if (lc->cu.pred_mode == MODE_INTRA) { ++ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); ++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2); ++ } ++ if (cbf_cr[i]) ++ ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0 + (i << log2_trafo_size_c), ++ log2_trafo_size_c, scan_idx_c, 2); ++ else ++ if (lc->tu.cross_pf) { ++ ptrdiff_t stride = frame_stride1(s->frame, 2); ++ const int hshift = ctx_hshift(s, 2); ++ const int vshift = ctx_vshift(s, 2); ++ int16_t *coeffs_y = (int16_t*)lc->edge_emu_buffer; ++ int16_t *coeffs = (int16_t*)lc->edge_emu_buffer2; ++ const int size = 1 << log2_trafo_size_c; ++ ++ uint8_t *dst = &s->frame->data[2][(y0 >> vshift) * stride + ++ ((x0 >> hshift) << s->ps.sps->pixel_shift)]; ++ for (i = 0; i < (size * size); i++) { ++ coeffs[i] = ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); ++ } ++ s->hevcdsp.add_residual[log2_trafo_size_c-2](dst, coeffs, stride); ++ } ++ } ++ } else if (ctx_cfmt(s) != 0 && blk_idx == 3) { ++ int trafo_size_h = 1 << (log2_trafo_size + 1); ++ int trafo_size_v = 1 << (log2_trafo_size + ctx_vshift(s, 1)); ++ for (i = 0; i < (ctx_cfmt(s) == 2 ? 2 : 1); i++) { ++ if (lc->cu.pred_mode == MODE_INTRA) { ++ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase + (i << log2_trafo_size), ++ trafo_size_h, trafo_size_v); ++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1); ++ } ++ if (cbf_cb[i]) ++ ff_hevc_rpi_hls_residual_coding(s, lc, xBase, yBase + (i << log2_trafo_size), ++ log2_trafo_size, scan_idx_c, 1); ++ } ++ for (i = 0; i < (ctx_cfmt(s) == 2 ? 
2 : 1); i++) { ++ if (lc->cu.pred_mode == MODE_INTRA) { ++ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase + (i << log2_trafo_size), ++ trafo_size_h, trafo_size_v); ++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2); ++ } ++ if (cbf_cr[i]) ++ ff_hevc_rpi_hls_residual_coding(s, lc, xBase, yBase + (i << log2_trafo_size), ++ log2_trafo_size, scan_idx_c, 2); ++ } ++ } ++ } else if (ctx_cfmt(s) != 0 && lc->cu.pred_mode == MODE_INTRA) { ++ if (log2_trafo_size > 2 || ctx_cfmt(s) == 3) { ++ int trafo_size_h = 1 << (log2_trafo_size_c + ctx_hshift(s, 1)); ++ int trafo_size_v = 1 << (log2_trafo_size_c + ctx_vshift(s, 1)); ++ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0, trafo_size_h, trafo_size_v); ++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0, 1); ++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0, 2); ++ if (ctx_cfmt(s) == 2) { ++ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0 + (1 << log2_trafo_size_c), ++ trafo_size_h, trafo_size_v); ++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1); ++ do_intra_pred(s, lc, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2); ++ } ++ } else if (blk_idx == 3) { ++ int trafo_size_h = 1 << (log2_trafo_size + 1); ++ int trafo_size_v = 1 << (log2_trafo_size + ctx_vshift(s, 1)); ++ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase, ++ trafo_size_h, trafo_size_v); ++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase, 1); ++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase, 2); ++ if (ctx_cfmt(s) == 2) { ++ ff_hevc_rpi_set_neighbour_available(s, lc, xBase, yBase + (1 << (log2_trafo_size)), ++ trafo_size_h, trafo_size_v); ++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1); ++ do_intra_pred(s, lc, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2); ++ } ++ } ++ } ++ ++ return 0; ++} ++ ++static void set_deblocking_bypass(const HEVCRpiContext * const s, const int x0, const int y0, const int log2_cb_size) ++{ ++ int cb_size = 1 << log2_cb_size; ++ int log2_min_pu_size = s->ps.sps->log2_min_pu_size; ++ ++ int min_pu_width = s->ps.sps->min_pu_width; ++ int x_end = FFMIN(x0 + cb_size, s->ps.sps->width); ++ int y_end = FFMIN(y0 + cb_size, s->ps.sps->height); ++ int i, j; ++ ++ for (j = (y0 >> log2_min_pu_size); j < (y_end >> log2_min_pu_size); j++) ++ for (i = (x0 >> log2_min_pu_size); i < (x_end >> log2_min_pu_size); i++) ++ s->is_pcm[i + j * min_pu_width] = 2; ++} ++ ++static int hls_transform_tree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, ++ int xBase, int yBase, int cb_xBase, int cb_yBase, ++ int log2_cb_size, int log2_trafo_size, ++ int trafo_depth, int blk_idx, ++ const int *base_cbf_cb, const int *base_cbf_cr) ++{ ++ uint8_t split_transform_flag; ++ int cbf_cb[2]; ++ int cbf_cr[2]; ++ int ret; ++ ++ cbf_cb[0] = base_cbf_cb[0]; ++ cbf_cb[1] = base_cbf_cb[1]; ++ cbf_cr[0] = base_cbf_cr[0]; ++ cbf_cr[1] = base_cbf_cr[1]; ++ ++ if (lc->cu.intra_split_flag) { ++ if (trafo_depth == 1) { ++ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[blk_idx]; ++ if (ctx_cfmt(s) == 3) { ++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[blk_idx]; ++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[blk_idx]; ++ } else { ++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0]; ++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0]; ++ } ++ } ++ } else { ++ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[0]; ++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0]; ++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0]; ++ } ++ 
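++ // Whether this TU is split further is coded explicitly only when size and ++ // depth leave a real choice; otherwise split_transform_flag is inferred ++ // below (forced for oversize TUs, for intra NxN at depth 0, and for inter ++ // non-2Nx2N partitions when max_transform_hierarchy_depth_inter is 0).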
++ if (log2_trafo_size <= s->ps.sps->log2_max_trafo_size && ++ log2_trafo_size > s->ps.sps->log2_min_tb_size && ++ trafo_depth < lc->cu.max_trafo_depth && ++ !(lc->cu.intra_split_flag && trafo_depth == 0)) { ++ split_transform_flag = ff_hevc_rpi_split_transform_flag_decode(lc, log2_trafo_size); ++ } else { ++ int inter_split = s->ps.sps->max_transform_hierarchy_depth_inter == 0 && ++ lc->cu.pred_mode == MODE_INTER && ++ lc->cu.part_mode != PART_2Nx2N && ++ trafo_depth == 0; ++ ++ split_transform_flag = log2_trafo_size > s->ps.sps->log2_max_trafo_size || ++ (lc->cu.intra_split_flag && trafo_depth == 0) || ++ inter_split; ++ } ++ ++ if (ctx_cfmt(s) != 0 && (log2_trafo_size > 2 || ctx_cfmt(s) == 3)) { ++ if (trafo_depth == 0 || cbf_cb[0]) { ++ cbf_cb[0] = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth); ++ if (ctx_cfmt(s) == 2 && (!split_transform_flag || log2_trafo_size == 3)) { ++ cbf_cb[1] = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth); ++ } ++ } ++ ++ if (trafo_depth == 0 || cbf_cr[0]) { ++ cbf_cr[0] = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth); ++ if (ctx_cfmt(s) == 2 && (!split_transform_flag || log2_trafo_size == 3)) { ++ cbf_cr[1] = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth); ++ } ++ } ++ } ++ ++ if (split_transform_flag) { ++ const int trafo_size_split = 1 << (log2_trafo_size - 1); ++ const int x1 = x0 + trafo_size_split; ++ const int y1 = y0 + trafo_size_split; ++ ++#define SUBDIVIDE(x, y, idx) \ ++do { \ ++ ret = hls_transform_tree(s, lc, x, y, x0, y0, cb_xBase, cb_yBase, log2_cb_size, \ ++ log2_trafo_size - 1, trafo_depth + 1, idx, \ ++ cbf_cb, cbf_cr); \ ++ if (ret < 0) \ ++ return ret; \ ++} while (0) ++ ++ SUBDIVIDE(x0, y0, 0); ++ SUBDIVIDE(x1, y0, 1); ++ SUBDIVIDE(x0, y1, 2); ++ SUBDIVIDE(x1, y1, 3); ++ ++#undef SUBDIVIDE ++ } else { ++ int min_tu_size = 1 << s->ps.sps->log2_min_tb_size; ++ int log2_min_tu_size = s->ps.sps->log2_min_tb_size; ++ int min_tu_width = s->ps.sps->min_tb_width; ++ int cbf_luma = 1; ++ ++ if (lc->cu.pred_mode == MODE_INTRA || trafo_depth != 0 || ++ cbf_cb[0] || cbf_cr[0] || ++ (ctx_cfmt(s) == 2 && (cbf_cb[1] || cbf_cr[1]))) { ++ cbf_luma = ff_hevc_rpi_cbf_luma_decode(lc, trafo_depth); ++ } ++ ++ ret = hls_transform_unit(s, lc, x0, y0, xBase, yBase, cb_xBase, cb_yBase, ++ log2_cb_size, log2_trafo_size, ++ blk_idx, cbf_luma, cbf_cb, cbf_cr); ++ if (ret < 0) ++ return ret; ++ // TODO: store cbf_luma somewhere else ++ if (cbf_luma) { ++ int i, j; ++ for (i = 0; i < (1 << log2_trafo_size); i += min_tu_size) ++ for (j = 0; j < (1 << log2_trafo_size); j += min_tu_size) { ++ int x_tu = (x0 + j) >> log2_min_tu_size; ++ int y_tu = (y0 + i) >> log2_min_tu_size; ++ s->cbf_luma[y_tu * min_tu_width + x_tu] = 1; ++ } ++ } ++ if (!s->sh.disable_deblocking_filter_flag) { ++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_trafo_size); ++ if (s->ps.pps->transquant_bypass_enable_flag && ++ lc->cu.cu_transquant_bypass_flag) ++ set_deblocking_bypass(s, x0, y0, log2_trafo_size); ++ } ++ } ++ return 0; ++} ++ ++ ++static int pcm_extract(const HEVCRpiContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size) ++{ ++ GetBitContext gb; ++ int ret; ++ ++ ret = init_get_bits(&gb, pcm, length); ++ if (ret < 0) ++ return ret; ++ ++ s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0), ++ frame_stride1(s->frame, 0), ++ cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); ++ ++ s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> ctx_hshift(s, 1), y0 >> ctx_vshift(s, 1)), ++ s->frame->linesize[1], ++ cb_size 
>> ctx_hshift(s, 1), ++ cb_size >> ctx_vshift(s, 1), ++ &gb, s->ps.sps->pcm.bit_depth_chroma); ++ ++ return 0; ++} ++ ++ ++// x * 2^(y*2) ++static inline unsigned int xyexp2(const unsigned int x, const unsigned int y) ++{ ++ return x << (y * 2); ++} ++ ++static int hls_pcm_sample(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, unsigned int log2_cb_size) ++{ ++ // Length in bits ++ const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) + ++ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 1)) + ++ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 2)); ++ ++ const uint8_t * const pcm = skip_bytes(&lc->cc, (length + 7) >> 3); ++ ++ if (!s->sh.disable_deblocking_filter_flag) ++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size); ++ ++ // Copy coeffs ++ { ++ const int blen = (length + 7) >> 3; ++ // Round allocated bytes up to nearest 32 to avoid alignment confusion ++ // Allocation is in int16_t s ++ // As we are only using 1 byte per sample and the coeff buffer allows 2 per ++ // sample this rounding doesn't affect the total size we need to allocate for ++ // the coeff buffer ++ int16_t * const coeffs = rpi_alloc_coeff_buf(lc->jb0, 0, ((blen + 31) & ~31) >> 1); ++ memcpy(coeffs, pcm, blen); ++ ++ // Our coeff stash assumes that any partially allocated 64byte lump ++ // is zeroed so make that true. ++ { ++ uint8_t * const eopcm = (uint8_t *)coeffs + blen; ++ if ((-(intptr_t)eopcm & 63) != 0) ++ memset(eopcm, 0, -(intptr_t)eopcm & 63); ++ } ++ ++ // Add command ++ { ++ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0); ++ cmd->type = RPI_PRED_I_PCM; ++ cmd->size = log2_cb_size; ++ cmd->i_pcm.src = coeffs; ++ cmd->i_pcm.x = x0; ++ cmd->i_pcm.y = y0; ++ cmd->i_pcm.src_len = length; ++ } ++ return 0; ++ } ++} ++ ++ ++static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCFrame * const ref, ++ const Mv * const mv, const int y0, const int height) ++{ ++ if (s->threads_type == FF_THREAD_FRAME) { ++ const int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9); ++ ++ // Progress has to be attached to current job as the actual wait ++ // is in worker_core which can't use lc ++ int16_t *const pr = lc->jb0->progress_req + ref->dpb_no; ++ if (*pr < y) { ++ *pr = y; ++ } ++ } ++} ++ ++static void hevc_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const int x0, const int y0, const int nPbW, ++ const int nPbH, const int log2_cb_size, const int part_idx, ++ const int merge_idx, MvField * const mv) ++{ ++ enum InterPredIdc inter_pred_idc = PRED_L0; ++ int mvp_flag; ++ ++ ff_hevc_rpi_set_neighbour_available(s, lc, x0, y0, nPbW, nPbH); ++ mv->pred_flag = 0; ++ if (s->sh.slice_type == HEVC_SLICE_B) ++ inter_pred_idc = ff_hevc_rpi_inter_pred_idc_decode(lc, nPbW, nPbH); ++ ++ if (inter_pred_idc != PRED_L1) { ++ if (s->sh.nb_refs[L0]) ++ mv->ref_idx[0]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L0]); ++ ++ mv->pred_flag = PF_L0; ++ ff_hevc_rpi_hls_mvd_coding(lc); ++ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); ++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, ++ part_idx, merge_idx, mv, mvp_flag, 0); ++ mv->mv[0].x += lc->pu.mvd.x; ++ mv->mv[0].y += lc->pu.mvd.y; ++ } ++ ++ if (inter_pred_idc != PRED_L0) { ++ if (s->sh.nb_refs[L1]) ++ mv->ref_idx[1]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L1]); ++ ++ if (s->sh.mvd_l1_zero_flag == 1 && inter_pred_idc == PRED_BI) { ++ 
AV_ZERO32(&lc->pu.mvd); ++ } else { ++ ff_hevc_rpi_hls_mvd_coding(lc); ++ } ++ ++ mv->pred_flag += PF_L1; ++ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); ++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, ++ part_idx, merge_idx, mv, mvp_flag, 1); ++ mv->mv[1].x += lc->pu.mvd.x; ++ mv->mv[1].y += lc->pu.mvd.y; ++ } ++} ++ ++ ++static HEVCRpiInterPredQ * ++rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn) ++{ ++ HEVCRpiInterPredQ * yp = ipe->q + ipe->curr; ++ HEVCRpiInterPredQ * ypt = yp + 1; ++ for (unsigned int i = 1; i != ipe->n_grp; ++i, ++ypt) { ++ if (ypt->load < yp->load) ++ yp = ypt; ++ } ++ ++ yp->load += load_val; ++ ipe->used_grp = 1; ++ yp->qpu_mc_curr->data[-1] = fn; // Link is always last el of previous cmd ++ ++ return yp; ++} ++ ++ ++static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe) ++{ ++ for (unsigned int i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ const unsigned int qfill = (char *)q->qpu_mc_curr - (char *)q->qpu_mc_base; ++ ++ q->qpu_mc_curr->data[-1] = q->code_sync; ++ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(q->qpu_mc_curr->data + 1); ++ q->load = (qfill >> 7); // Have a mild preference for emptier Qs to balance memory usage ++ } ++} ++ ++// Returns 0 on success, -1 if Q is dangerously full ++static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe) ++{ ++ if (!ipe->used_grp) ++ return 0; ++ ++ if ((ipe->curr += ipe->n_grp) >= ipe->n) ++ { ++ ipe->curr = 0; ++ rpi_inter_pred_sync(ipe); ++ } ++ ipe->used = 1; ++ ipe->used_grp = 0; ++ ++ for (unsigned int i = 0; i != ipe->n_grp; ++i) { ++ HEVCRpiInterPredQ * const q = ipe->q + i + ipe->curr; ++ if ((char *)q->qpu_mc_curr - (char *)q->qpu_mc_base > ipe->max_fill) { ++ return -1; ++ } ++ } ++ return 0; ++} ++ ++static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe) ++{ ++ unsigned int i; ++ ++ ipe->curr = 0; ++ ipe->used = 0; ++ ipe->used_grp = 0; ++ for (i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ q->qpu_mc_curr = q->qpu_mc_base; ++ q->load = 0; ++ q->last_l0 = NULL; ++ q->last_l1 = NULL; ++ } ++} ++ ++static void rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe, ++ const unsigned int n_max, const unsigned int n_grp, ++ const unsigned int total_size, const unsigned int min_gap) ++{ ++ memset(ipe, 0, sizeof(*ipe)); ++ av_assert0((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) != NULL); ++ ipe->n_grp = n_grp; ++ ipe->min_gap = min_gap; ++ ++ gpu_malloc_cached(total_size, &ipe->gptr); ++} ++ ++ ++#if RPI_QPU_EMU_Y ++#define get_mc_address_y(f) ((f)->data[0]) ++#else ++#define get_mc_address_y(f) get_vc_address_y(f) ++#endif ++#if RPI_QPU_EMU_C ++#define get_mc_address_u(f) ((f)->data[1]) ++#else ++#define get_mc_address_u(f) get_vc_address_u(f) ++#endif ++ ++static inline int offset_depth_adj(const HEVCRpiContext *const s, const int wt) ++{ ++ return s->ps.sps->high_precision_offsets_enabled_flag ? 
wt : ++ wt << (s->ps.sps->bit_depth - 8); ++} ++ ++static void ++rpi_pred_y(const HEVCRpiContext *const s, HEVCRpiJob * const jb, ++ const int x0, const int y0, ++ const int nPbW, const int nPbH, ++ const Mv *const mv, ++ const int weight_mul, ++ const int weight_offset, ++ AVFrame *const src_frame) ++{ ++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); ++ const unsigned int mx = mv->x & 3; ++ const unsigned int my = mv->y & 3; ++ const unsigned int my_mx = (my << 8) | mx; ++ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; ++ const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame); ++ qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off; ++ const uint32_t wo = PACK2(offset_depth_adj(s, weight_offset) * 2 + 1, weight_mul); ++ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); ++ ++ if (my_mx == 0) ++ { ++ const int x1 = x0 + (mv->x >> 2); ++ const int y1 = y0 + (mv->y >> 2); ++ const int bh = nPbH; ++ ++ for (int start_x = 0; start_x < nPbW; start_x += 16) ++ { ++ const int bw = FFMIN(nPbW - start_x, 16); ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00; ++ ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ts->y_pred1_x0y0; ++ ++ if (nPbW > 8) ++ ++ts->y_pred1_wgt8; ++ else ++ ++ts->y_pred1_wle8; ++ ++ if (nPbH > 16) ++ ++ts->y_pred1_hgt16; ++ else ++ ++ts->y_pred1_hle16; ++ } ++#endif ++ ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src_vc_address_y; ++ cmd_y->w = bw; ++ cmd_y->h = bh; ++ cmd_y->wo1 = wo; ++ cmd_y->dst_addr = dst_addr + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } ++ } ++ else ++ { ++ const int x1_m3 = x0 + (mv->x >> 2) - 3; ++ const int y1_m3 = y0 + (mv->y >> 2) - 3; ++ const unsigned int bh = nPbH; ++ int start_x = 0; ++ ++#if 1 ++ // As Y-pred operates on two independent 8-wide src blocks we can merge ++ // this pred with the previous one if the previous one is 8 pel wide, ++ // the same height as the current block, immediately to the left of our ++ // current dest block and mono-pred.
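++ // The merge fills in the second (L1) source slot of the previous command: ++ // widen it by our width, pack this block's mx/my into the spare half of ++ // mymx21 and set wo2, so a single QPU command covers both 8-wide blocks.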
++ ++ qpu_mc_pred_y_p_t *const last_y8_p = jb->last_y8_p; ++ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr) ++ { ++ const int bw = FFMIN(nPbW, 8); ++ qpu_mc_src_t *const last_y8_src2 = jb->last_y8_l1; ++ ++ last_y8_src2->x = x1_m3; ++ last_y8_src2->y = y1_m3; ++ last_y8_src2->base = src_vc_address_y; ++ last_y8_p->w += bw; ++ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21); ++ last_y8_p->wo2 = wo; ++ ++ jb->last_y8_p = NULL; ++ jb->last_y8_l1 = NULL; ++ start_x = bw; ++#if RPI_TSTATS ++ ++s->tstats.y_pred1_y8_merge; ++#endif ++ } ++#endif ++ ++ for (; start_x < nPbW; start_x += 16) ++ { ++ const int bw = FFMIN(nPbW - start_x, 16); ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ if (mx == 0 && my == 0) ++ ++ts->y_pred1_x0y0; ++ else if (mx == 0) ++ ++ts->y_pred1_x0; ++ else if (my == 0) ++ ++ts->y_pred1_y0; ++ else ++ ++ts->y_pred1_xy; ++ ++ if (nPbW > 8) ++ ++ts->y_pred1_wgt8; ++ else ++ ++ts->y_pred1_wle8; ++ ++ if (nPbH > 16) ++ ++ts->y_pred1_hgt16; ++ else ++ ++ts->y_pred1_hle16; ++ } ++#endif ++ src1->x = x1_m3 + start_x; ++ src1->y = y1_m3; ++ src1->base = src_vc_address_y; ++ if (bw <= 8) ++ { ++ src2->x = MC_DUMMY_X; ++ src2->y = MC_DUMMY_Y; ++#if RPI_QPU_EMU_Y ++ src2->base = s->qpu_dummy_frame_emu; ++#else ++ src2->base = s->qpu_dummy_frame_qpu; ++#endif ++ } ++ else ++ { ++ src2->x = x1_m3 + start_x + 8; ++ src2->y = y1_m3; ++ src2->base = src_vc_address_y; ++ } ++ cmd_y->w = bw; ++ cmd_y->h = bh; ++ cmd_y->mymx21 = my2_mx2_my_mx; ++ cmd_y->wo1 = wo; ++ cmd_y->wo2 = wo; ++ cmd_y->dst_addr = dst_addr + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ ++ if (bw == 8) { ++ jb->last_y8_l1 = src2; ++ jb->last_y8_p = cmd_y; ++ } ++ } ++ } ++} ++ ++static void ++rpi_pred_y_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const int x0, const int y0, ++ const int nPbW, const int nPbH, ++ const struct MvField *const mv_field, ++ const AVFrame *const src_frame, ++ const AVFrame *const src_frame2) ++{ ++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); ++ const Mv * const mv = mv_field->mv + 0; ++ const Mv * const mv2 = mv_field->mv + 1; ++ ++ const unsigned int mx = mv->x & 3; ++ const unsigned int my = mv->y & 3; ++ const unsigned int my_mx = (my<<8) | mx; ++ const unsigned int mx2 = mv2->x & 3; ++ const unsigned int my2 = mv2->y & 3; ++ const unsigned int my2_mx2 = (my2<<8) | mx2; ++ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; ++ const unsigned int ref_idx0 = mv_field->ref_idx[0]; ++ const unsigned int ref_idx1 = mv_field->ref_idx[1]; ++ const uint32_t wt_offset = ++ offset_depth_adj(s, s->sh.luma_offset_l0[ref_idx0] + s->sh.luma_offset_l1[ref_idx1]) + 1; ++ const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]); ++ const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]); ++ ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); ++ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off; ++ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame); ++ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2); ++ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip; ++ ++ if (my2_mx2_my_mx == 0) ++ { ++ const int 
x1 = x0 + (mv->x >> 2); ++ const int y1 = y0 + (mv->y >> 2); ++ const int x2 = x0 + (mv2->x >> 2); ++ const int y2 = y0 + (mv2->y >> 2); ++ const int bh = nPbH; ++ ++ // Can do chunks a full 16 wide if we don't want the H filter ++ for (int start_x=0; start_x < nPbW; start_x += 16) ++ { ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ts->y_pred2_x0y0; ++ ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; ++ } ++#endif ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src1_base; ++ src2->x = x2 + start_x; ++ src2->y = y2; ++ src2->base = src2_base; ++ cmd_y->w = FFMIN(nPbW - start_x, 16); ++ cmd_y->h = bh; ++ cmd_y->mymx21 = 0; ++ cmd_y->wo1 = wo1; ++ cmd_y->wo2 = wo2; ++ cmd_y->dst_addr = dst + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } ++ } ++ else ++ { ++ // Filter requires a run-up of 3 ++ const int x1 = x0 + (mv->x >> 2) - 3; ++ const int y1 = y0 + (mv->y >> 2) - 3; ++ const int x2 = x0 + (mv2->x >> 2) - 3; ++ const int y2 = y0 + (mv2->y >> 2) - 3; ++ const int bh = nPbH; ++ ++ for (int start_x=0; start_x < nPbW; start_x += 8) ++ { // B blocks work 8 at a time ++ // B weights aren't doubled as the QPU code does the same ++ // amount of work as it does for P ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ const unsigned int mmx = mx | mx2; ++ const unsigned int mmy = my | my2; ++ if (mmx == 0 && mmy == 0) ++ ++ts->y_pred2_x0y0; ++ else if (mmx == 0) ++ ++ts->y_pred2_x0; ++ else if (mmy == 0) ++ ++ts->y_pred2_y0; ++ else ++ ++ts->y_pred2_xy; ++ ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; ++ } ++#endif ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src1_base; ++ src2->x = x2 + start_x; ++ src2->y = y2; ++ src2->base = src2_base; ++ cmd_y->w = FFMIN(nPbW - start_x, 8); ++ cmd_y->h = bh; ++ cmd_y->mymx21 = my2_mx2_my_mx; ++ cmd_y->wo1 = wo1; ++ cmd_y->wo2 = wo2; ++ cmd_y->dst_addr = dst + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } ++ } ++} ++ ++// h/v shifts fixed at one as that is all the qasm copes with ++static void ++rpi_pred_c(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const unsigned int lx, const int x0_c, const int y0_c, ++ const int nPbW_c, const int nPbH_c, ++ const Mv * const mv, ++ const int16_t * const c_weights, ++ const int16_t * const c_offsets, ++ AVFrame * const src_frame) ++{ ++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); ++ const int hshift = 1; // = s->ps.sps->hshift[1]; ++ const int vshift = 1; // = s->ps.sps->vshift[1]; ++ ++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; ++ const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame); ++ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)]; ++ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << 
(1 - vshift)]; ++ const uint32_t wo_u = PACK2(offset_depth_adj(s, c_offsets[0]) * 2 + 1, c_weights[0]); ++ const uint32_t wo_v = PACK2(offset_depth_adj(s, c_offsets[1]) * 2 + 1, c_weights[1]); ++ qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; ++ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; ++ const unsigned int bh = nPbH_c; ++ const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1; ++ ++ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) ++ { ++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn); ++ qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p; ++ qpu_mc_src_t ** const plast_lx = (lx == 0) ? &cp->last_l0 : &cp->last_l1; ++ qpu_mc_src_t * const last_lx = *plast_lx; ++ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); ++ ++ last_lx->x = x1_c + start_x; ++ last_lx->y = y1_c; ++ last_lx->base = src_base_u; ++ cmd_c->h = bh; ++ cmd_c->w = bw; ++ cmd_c->coeffs_x = x_coeffs; ++ cmd_c->coeffs_y = y_coeffs; ++ cmd_c->wo_u = wo_u; ++ cmd_c->wo_v = wo_v; ++ cmd_c->dst_addr_c = dst_base_u + (start_x << xshl); ++ *plast_lx = &cmd_c->next_src; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1); ++ } ++ return; ++} ++ ++// h/v shifts fixed at one as that is all the qasm copes with ++static void ++rpi_pred_c_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const int x0_c, const int y0_c, ++ const int nPbW_c, const int nPbH_c, ++ const struct MvField * const mv_field, ++ const int16_t * const c_weights, ++ const int16_t * const c_offsets, ++ const int16_t * const c_weights2, ++ const int16_t * const c_offsets2, ++ AVFrame * const src_frame, ++ AVFrame * const src_frame2) ++{ ++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); ++ const int hshift = 1; // s->ps.sps->hshift[1]; ++ const int vshift = 1; // s->ps.sps->vshift[1]; ++ const Mv * const mv = mv_field->mv + 0; ++ const Mv * const mv2 = mv_field->mv + 1; ++ ++ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift); ++ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift); ++ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; ++ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector ++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; ++ ++ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift); ++ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift); ++ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; ++ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector ++ ++ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1; ++ const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1; ++ ++ const uint32_t wo_u2 = PACK2(offset_depth_adj(s, c_offsets[0] + c_offsets2[0]) + 1, c_weights2[0]); ++ const uint32_t wo_v2 = PACK2(offset_depth_adj(s, c_offsets[1] + c_offsets2[1]) + 1, c_weights2[1]); ++ ++ const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; ++ const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame); ++ const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2); ++ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; ++ const unsigned int bh = nPbH_c; ++ ++ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) ++ { ++ const unsigned int bw = FFMIN(nPbW_c-start_x, 
RPI_CHROMA_BLOCK_WIDTH); ++ ++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx); ++ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b; ++ qpu_mc_src_t * const src_l0 = cp->last_l0; ++ qpu_mc_src_t * const src_l1 = cp->last_l1; ++ ++ src_l0->x = x1_c + start_x; ++ src_l0->y = y1_c; ++ src_l0->base = src1_base; ++ src_l1->x = x2_c + start_x; ++ src_l1->y = y2_c; ++ src_l1->base = src2_base; ++ ++ u[0].h = bh; ++ u[0].w = bw; ++ u[0].coeffs_x1 = coefs0_x; ++ u[0].coeffs_y1 = coefs0_y; ++ u[0].weight_u1 = c_weights[0]; // Weight L0 U ++ u[0].weight_v1 = c_weights[1]; // Weight L0 V ++ u[0].coeffs_x2 = coefs1_x; ++ u[0].coeffs_y2 = coefs1_y; ++ u[0].wo_u2 = wo_u2; ++ u[0].wo_v2 = wo_v2; ++ u[0].dst_addr_c = dst_base_u + (start_x << xshl); ++ ++ cp->last_l0 = &u[0].next_src1; ++ cp->last_l1 = &u[0].next_src2; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); ++ } ++} ++ ++ ++static void hls_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const int x0, const int y0, ++ const int nPbW, const int nPbH, ++ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx) ++{ ++ HEVCRpiJob * const jb = lc->jb0; ++ ++ int merge_idx = 0; ++ struct MvField current_mv = {{{ 0 }}}; ++ ++ int min_pu_width = s->ps.sps->min_pu_width; ++ ++ MvField * const tab_mvf = s->ref->tab_mvf; ++ const RefPicList *const refPicList = s->ref->refPicList; ++ const HEVCFrame *ref0 = NULL, *ref1 = NULL; ++ int log2_min_cb_size = s->ps.sps->log2_min_cb_size; ++ int min_cb_width = s->ps.sps->min_cb_width; ++ int x_cb = x0 >> log2_min_cb_size; ++ int y_cb = y0 >> log2_min_cb_size; ++ int x_pu, y_pu; ++ int i, j; ++ const int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb); ++ ++ if (!skip_flag) ++ lc->pu.merge_flag = ff_hevc_rpi_merge_flag_decode(lc); ++ ++ if (skip_flag || lc->pu.merge_flag) { ++ if (s->sh.max_num_merge_cand > 1) ++ merge_idx = ff_hevc_rpi_merge_idx_decode(s, lc); ++ else ++ merge_idx = 0; ++ ++ ff_hevc_rpi_luma_mv_merge_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, ++ partIdx, merge_idx, ¤t_mv); ++ } else { ++ hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, ++ partIdx, merge_idx, ¤t_mv); ++ } ++ ++ x_pu = x0 >> s->ps.sps->log2_min_pu_size; ++ y_pu = y0 >> s->ps.sps->log2_min_pu_size; ++ ++ for (j = 0; j < nPbH >> s->ps.sps->log2_min_pu_size; j++) ++ for (i = 0; i < nPbW >> s->ps.sps->log2_min_pu_size; i++) ++ tab_mvf[(y_pu + j) * min_pu_width + x_pu + i] = current_mv; ++ ++ if (current_mv.pred_flag & PF_L0) { ++ ref0 = refPicList[0].ref[current_mv.ref_idx[0]]; ++ if (!ref0) ++ return; ++ hevc_await_progress(s, lc, ref0, ¤t_mv.mv[0], y0, nPbH); ++ } ++ if (current_mv.pred_flag & PF_L1) { ++ ref1 = refPicList[1].ref[current_mv.ref_idx[1]]; ++ if (!ref1) ++ return; ++ hevc_await_progress(s, lc, ref1, ¤t_mv.mv[1], y0, nPbH); ++ } ++ ++ if (current_mv.pred_flag == PF_L0) { ++ const int x0_c = x0 >> ctx_hshift(s, 1); ++ const int y0_c = y0 >> ctx_vshift(s, 1); ++ const int nPbW_c = nPbW >> ctx_hshift(s, 1); ++ const int nPbH_c = nPbH >> ctx_vshift(s, 1); ++ ++ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.mv + 0, ++ s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]], ++ ref0->frame); ++ ++ if (ctx_cfmt(s) != 0) { ++ rpi_pred_c(s, jb, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0, ++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]], ++ ref0->frame); ++ return; ++ } ++ } else if (current_mv.pred_flag == PF_L1) { ++ const int x0_c = 
x0 >> ctx_hshift(s, 1); ++ const int y0_c = y0 >> ctx_vshift(s, 1); ++ const int nPbW_c = nPbW >> ctx_hshift(s, 1); ++ const int nPbH_c = nPbH >> ctx_vshift(s, 1); ++ ++ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.mv + 1, ++ s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]], ++ ref1->frame); ++ ++ if (ctx_cfmt(s) != 0) { ++ rpi_pred_c(s, jb, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1, ++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]], ++ ref1->frame); ++ return; ++ } ++ } else if (current_mv.pred_flag == PF_BI) { ++ const int x0_c = x0 >> ctx_hshift(s, 1); ++ const int y0_c = y0 >> ctx_vshift(s, 1); ++ const int nPbW_c = nPbW >> ctx_hshift(s, 1); ++ const int nPbH_c = nPbH >> ctx_vshift(s, 1); ++ ++ rpi_pred_y_b(s, jb, x0, y0, nPbW, nPbH, ¤t_mv, ref0->frame, ref1->frame); ++ ++ if (ctx_cfmt(s) != 0) { ++ rpi_pred_c_b(s, jb, x0_c, y0_c, nPbW_c, nPbH_c, ++ ¤t_mv, ++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], ++ s->sh.chroma_offset_l0[current_mv.ref_idx[0]], ++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], ++ s->sh.chroma_offset_l1[current_mv.ref_idx[1]], ++ ref0->frame, ++ ref1->frame); ++ return; ++ } ++ } ++} ++ ++/** ++ * 8.4.1 ++ */ ++static int luma_intra_pred_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int pu_size, ++ int prev_intra_luma_pred_flag) ++{ ++ int x_pu = x0 >> s->ps.sps->log2_min_pu_size; ++ int y_pu = y0 >> s->ps.sps->log2_min_pu_size; ++ int min_pu_width = s->ps.sps->min_pu_width; ++ int size_in_pus = pu_size >> s->ps.sps->log2_min_pu_size; ++ int x0b = av_mod_uintp2(x0, s->ps.sps->log2_ctb_size); ++ int y0b = av_mod_uintp2(y0, s->ps.sps->log2_ctb_size); ++ ++ int cand_up = (lc->ctb_up_flag || y0b) ? ++ s->tab_ipm[(y_pu - 1) * min_pu_width + x_pu] : INTRA_DC; ++ int cand_left = (lc->ctb_left_flag || x0b) ? 
++ s->tab_ipm[y_pu * min_pu_width + x_pu - 1] : INTRA_DC; ++ ++ int y_ctb = (y0 >> (s->ps.sps->log2_ctb_size)) << (s->ps.sps->log2_ctb_size); ++ ++ MvField *tab_mvf = s->ref->tab_mvf; ++ int intra_pred_mode; ++ int candidate[3]; ++ int i, j; ++ ++ // intra_pred_mode prediction does not cross vertical CTB boundaries ++ if ((y0 - 1) < y_ctb) ++ cand_up = INTRA_DC; ++ ++ if (cand_left == cand_up) { ++ if (cand_left < 2) { ++ candidate[0] = INTRA_PLANAR; ++ candidate[1] = INTRA_DC; ++ candidate[2] = INTRA_ANGULAR_26; ++ } else { ++ candidate[0] = cand_left; ++ candidate[1] = 2 + ((cand_left - 2 - 1 + 32) & 31); ++ candidate[2] = 2 + ((cand_left - 2 + 1) & 31); ++ } ++ } else { ++ candidate[0] = cand_left; ++ candidate[1] = cand_up; ++ if (candidate[0] != INTRA_PLANAR && candidate[1] != INTRA_PLANAR) { ++ candidate[2] = INTRA_PLANAR; ++ } else if (candidate[0] != INTRA_DC && candidate[1] != INTRA_DC) { ++ candidate[2] = INTRA_DC; ++ } else { ++ candidate[2] = INTRA_ANGULAR_26; ++ } ++ } ++ ++ if (prev_intra_luma_pred_flag) { ++ intra_pred_mode = candidate[lc->pu.mpm_idx]; ++ } else { ++ if (candidate[0] > candidate[1]) ++ FFSWAP(uint8_t, candidate[0], candidate[1]); ++ if (candidate[0] > candidate[2]) ++ FFSWAP(uint8_t, candidate[0], candidate[2]); ++ if (candidate[1] > candidate[2]) ++ FFSWAP(uint8_t, candidate[1], candidate[2]); ++ ++ intra_pred_mode = lc->pu.rem_intra_luma_pred_mode; ++ for (i = 0; i < 3; i++) ++ if (intra_pred_mode >= candidate[i]) ++ intra_pred_mode++; ++ } ++ ++ /* write the intra prediction units into the mv array */ ++ if (!size_in_pus) ++ size_in_pus = 1; ++ for (i = 0; i < size_in_pus; i++) { ++ memset(&s->tab_ipm[(y_pu + i) * min_pu_width + x_pu], ++ intra_pred_mode, size_in_pus); ++ ++ for (j = 0; j < size_in_pus; j++) { ++ tab_mvf[(y_pu + j) * min_pu_width + x_pu + i].pred_flag = PF_INTRA; ++ } ++ } ++ ++ return intra_pred_mode; ++} ++ ++static av_always_inline void set_ct_depth(const HEVCRpiContext * const s, int x0, int y0, ++ int log2_cb_size, int ct_depth) ++{ ++ int length = (1 << log2_cb_size) >> s->ps.sps->log2_min_cb_size; ++ int x_cb = x0 >> s->ps.sps->log2_min_cb_size; ++ int y_cb = y0 >> s->ps.sps->log2_min_cb_size; ++ int y; ++ ++ for (y = 0; y < length; y++) ++ memset(&s->tab_ct_depth[(y_cb + y) * s->ps.sps->min_cb_width + x_cb], ++ ct_depth, length); ++} ++ ++static const uint8_t tab_mode_idx[] = { ++ 0, 1, 2, 2, 2, 2, 3, 5, 7, 8, 10, 12, 13, 15, 17, 18, 19, 20, ++ 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31}; ++ ++static void intra_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, ++ const int log2_cb_size) ++{ ++ static const uint8_t intra_chroma_table[4] = { 0, 26, 10, 1 }; ++ uint8_t prev_intra_luma_pred_flag[4]; ++ int split = lc->cu.part_mode == PART_NxN; ++ int pb_size = (1 << log2_cb_size) >> split; ++ int side = split + 1; ++ int chroma_mode; ++ int i, j; ++ ++ for (i = 0; i < side; i++) ++ for (j = 0; j < side; j++) ++ prev_intra_luma_pred_flag[2 * i + j] = ff_hevc_rpi_prev_intra_luma_pred_flag_decode(lc); ++ ++ for (i = 0; i < side; i++) { ++ for (j = 0; j < side; j++) { ++ if (prev_intra_luma_pred_flag[2 * i + j]) ++ lc->pu.mpm_idx = ff_hevc_rpi_mpm_idx_decode(lc); ++ else ++ lc->pu.rem_intra_luma_pred_mode = ff_hevc_rpi_rem_intra_luma_pred_mode_decode(lc); ++ ++ lc->pu.intra_pred_mode[2 * i + j] = ++ luma_intra_pred_mode(s, lc, x0 + pb_size * j, y0 + pb_size * i, pb_size, ++ prev_intra_luma_pred_flag[2 * i + j]); ++ } ++ } ++ ++ if (ctx_cfmt(s) == 3) { ++ for 
(i = 0; i < side; i++) { ++ for (j = 0; j < side; j++) { ++ lc->pu.chroma_mode_c[2 * i + j] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); ++ if (chroma_mode != 4) { ++ if (lc->pu.intra_pred_mode[2 * i + j] == intra_chroma_table[chroma_mode]) ++ lc->pu.intra_pred_mode_c[2 * i + j] = 34; ++ else ++ lc->pu.intra_pred_mode_c[2 * i + j] = intra_chroma_table[chroma_mode]; ++ } else { ++ lc->pu.intra_pred_mode_c[2 * i + j] = lc->pu.intra_pred_mode[2 * i + j]; ++ } ++ } ++ } ++ } else if (ctx_cfmt(s) == 2) { ++ int mode_idx; ++ lc->pu.chroma_mode_c[0] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); ++ if (chroma_mode != 4) { ++ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode]) ++ mode_idx = 34; ++ else ++ mode_idx = intra_chroma_table[chroma_mode]; ++ } else { ++ mode_idx = lc->pu.intra_pred_mode[0]; ++ } ++ lc->pu.intra_pred_mode_c[0] = tab_mode_idx[mode_idx]; ++ } else if (ctx_cfmt(s) != 0) { ++ chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); ++ if (chroma_mode != 4) { ++ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode]) ++ lc->pu.intra_pred_mode_c[0] = 34; ++ else ++ lc->pu.intra_pred_mode_c[0] = intra_chroma_table[chroma_mode]; ++ } else { ++ lc->pu.intra_pred_mode_c[0] = lc->pu.intra_pred_mode[0]; ++ } ++ } ++} ++ ++static void intra_prediction_unit_default_value(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ int x0, int y0, ++ int log2_cb_size) ++{ ++ int pb_size = 1 << log2_cb_size; ++ int size_in_pus = pb_size >> s->ps.sps->log2_min_pu_size; ++ int min_pu_width = s->ps.sps->min_pu_width; ++ MvField *tab_mvf = s->ref->tab_mvf; ++ int x_pu = x0 >> s->ps.sps->log2_min_pu_size; ++ int y_pu = y0 >> s->ps.sps->log2_min_pu_size; ++ int j, k; ++ ++ if (size_in_pus == 0) ++ size_in_pus = 1; ++ for (j = 0; j < size_in_pus; j++) ++ memset(&s->tab_ipm[(y_pu + j) * min_pu_width + x_pu], INTRA_DC, size_in_pus); ++ if (lc->cu.pred_mode == MODE_INTRA) ++ for (j = 0; j < size_in_pus; j++) ++ for (k = 0; k < size_in_pus; k++) ++ tab_mvf[(y_pu + j) * min_pu_width + x_pu + k].pred_flag = PF_INTRA; ++} ++ ++static int hls_coding_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int log2_cb_size) ++{ ++ int cb_size = 1 << log2_cb_size; ++ int log2_min_cb_size = s->ps.sps->log2_min_cb_size; ++ int length = cb_size >> log2_min_cb_size; ++ int min_cb_width = s->ps.sps->min_cb_width; ++ int x_cb = x0 >> log2_min_cb_size; ++ int y_cb = y0 >> log2_min_cb_size; ++ int idx = log2_cb_size - 2; ++ int qp_block_mask = (1<<(s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_qp_delta_depth)) - 1; ++ int x, y, ret; ++ ++ lc->cu.x = x0; ++ lc->cu.y = y0; ++ lc->cu.pred_mode = MODE_INTRA; ++ lc->cu.part_mode = PART_2Nx2N; ++ lc->cu.intra_split_flag = 0; ++ ++ SAMPLE_CTB(s->skip_flag, x_cb, y_cb) = 0; ++ for (x = 0; x < 4; x++) ++ lc->pu.intra_pred_mode[x] = 1; ++ if (s->ps.pps->transquant_bypass_enable_flag) { ++ lc->cu.cu_transquant_bypass_flag = ff_hevc_rpi_cu_transquant_bypass_flag_decode(lc); ++ if (lc->cu.cu_transquant_bypass_flag) ++ set_deblocking_bypass(s, x0, y0, log2_cb_size); ++ } else ++ lc->cu.cu_transquant_bypass_flag = 0; ++ ++ if (s->sh.slice_type != HEVC_SLICE_I) { ++ uint8_t skip_flag = ff_hevc_rpi_skip_flag_decode(s, lc, x0, y0, x_cb, y_cb); ++ ++ x = y_cb * min_cb_width + x_cb; ++ for (y = 0; y < length; y++) { ++ memset(&s->skip_flag[x], skip_flag, length); ++ x += min_cb_width; ++ } ++ lc->cu.pred_mode = skip_flag ? 
MODE_SKIP : MODE_INTER; ++ } else { ++ x = y_cb * min_cb_width + x_cb; ++ for (y = 0; y < length; y++) { ++ memset(&s->skip_flag[x], 0, length); ++ x += min_cb_width; ++ } ++ } ++ ++ if (SAMPLE_CTB(s->skip_flag, x_cb, y_cb)) { ++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx); ++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); ++ ++ if (!s->sh.disable_deblocking_filter_flag) ++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size); ++ } else { ++ int pcm_flag = 0; ++ ++ if (s->sh.slice_type != HEVC_SLICE_I) ++ lc->cu.pred_mode = ff_hevc_rpi_pred_mode_decode(lc); ++ if (lc->cu.pred_mode != MODE_INTRA || ++ log2_cb_size == s->ps.sps->log2_min_cb_size) { ++ lc->cu.part_mode = ff_hevc_rpi_part_mode_decode(s, lc, log2_cb_size); ++ lc->cu.intra_split_flag = lc->cu.part_mode == PART_NxN && ++ lc->cu.pred_mode == MODE_INTRA; ++ } ++ ++ if (lc->cu.pred_mode == MODE_INTRA) { ++ if (lc->cu.part_mode == PART_2Nx2N && s->ps.sps->pcm_enabled_flag && ++ log2_cb_size >= s->ps.sps->pcm.log2_min_pcm_cb_size && ++ log2_cb_size <= s->ps.sps->pcm.log2_max_pcm_cb_size) { ++ pcm_flag = ff_hevc_rpi_pcm_flag_decode(lc); ++ } ++ if (pcm_flag) { ++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); ++ ret = hls_pcm_sample(s, lc, x0, y0, log2_cb_size); ++ if (s->ps.sps->pcm.loop_filter_disable_flag) ++ { ++ set_deblocking_bypass(s, x0, y0, log2_cb_size); ++ } ++ ++ if (ret < 0) ++ return ret; ++ } else { ++ intra_prediction_unit(s, lc, x0, y0, log2_cb_size); ++ } ++ } else { ++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); ++ switch (lc->cu.part_mode) { ++ case PART_2Nx2N: ++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx); ++ break; ++ case PART_2NxN: ++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 2, log2_cb_size, 0, idx); ++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1, idx); ++ break; ++ case PART_Nx2N: ++ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size, log2_cb_size, 0, idx - 1); ++ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1, idx - 1); ++ break; ++ case PART_2NxnU: ++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4, log2_cb_size, 0, idx); ++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4, cb_size, cb_size * 3 / 4, log2_cb_size, 1, idx); ++ break; ++ case PART_2NxnD: ++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size * 3 / 4, log2_cb_size, 0, idx); ++ hls_prediction_unit(s, lc, x0, y0 + cb_size * 3 / 4, cb_size, cb_size / 4, log2_cb_size, 1, idx); ++ break; ++ case PART_nLx2N: ++ hls_prediction_unit(s, lc, x0, y0, cb_size / 4, cb_size, log2_cb_size, 0, idx - 2); ++ hls_prediction_unit(s, lc, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1, idx - 2); ++ break; ++ case PART_nRx2N: ++ hls_prediction_unit(s, lc, x0, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 0, idx - 2); ++ hls_prediction_unit(s, lc, x0 + cb_size * 3 / 4, y0, cb_size / 4, cb_size, log2_cb_size, 1, idx - 2); ++ break; ++ case PART_NxN: ++ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size / 2, log2_cb_size, 0, idx - 1); ++ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size / 2, log2_cb_size, 1, idx - 1); ++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2, idx - 1); ++ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3, idx - 1); ++ break; ++ } ++ } ++ ++ if (!pcm_flag) { ++ int 
rqt_root_cbf = 1; ++ ++ if (lc->cu.pred_mode != MODE_INTRA && ++ !(lc->cu.part_mode == PART_2Nx2N && lc->pu.merge_flag)) { ++ rqt_root_cbf = ff_hevc_rpi_no_residual_syntax_flag_decode(lc); ++ } ++ if (rqt_root_cbf) { ++ const static int cbf[2] = { 0 }; ++ lc->cu.max_trafo_depth = lc->cu.pred_mode == MODE_INTRA ? ++ s->ps.sps->max_transform_hierarchy_depth_intra + lc->cu.intra_split_flag : ++ s->ps.sps->max_transform_hierarchy_depth_inter; ++ ret = hls_transform_tree(s, lc, x0, y0, x0, y0, x0, y0, ++ log2_cb_size, ++ log2_cb_size, 0, 0, cbf, cbf); ++ if (ret < 0) ++ return ret; ++ } else { ++ if (!s->sh.disable_deblocking_filter_flag) ++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size); ++ } ++ } ++ } ++ ++ if (s->ps.pps->cu_qp_delta_enabled_flag && lc->tu.is_cu_qp_delta_coded == 0) ++ ff_hevc_rpi_set_qPy(s, lc, x0, y0, log2_cb_size); ++ ++ x = y_cb * min_cb_width + x_cb; ++ for (y = 0; y < length; y++) { ++ memset(&s->qp_y_tab[x], lc->qp_y, length); ++ x += min_cb_width; ++ } ++ ++ if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 && ++ ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0) { ++ lc->qPy_pred = lc->qp_y; ++ } ++ ++ set_ct_depth(s, x0, y0, log2_cb_size, lc->ct_depth); ++ ++ return 0; ++} ++ ++// Returns: ++// < 0 Error ++// 0 More data wanted ++// 1 EoSlice / EoPicture ++static int hls_coding_quadtree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, ++ const int log2_cb_size, const int cb_depth) ++{ ++ const int cb_size = 1 << log2_cb_size; ++ int ret; ++ int split_cu; ++ ++ lc->ct_depth = cb_depth; ++ if (x0 + cb_size <= s->ps.sps->width && ++ y0 + cb_size <= s->ps.sps->height && ++ log2_cb_size > s->ps.sps->log2_min_cb_size) { ++ split_cu = ff_hevc_rpi_split_coding_unit_flag_decode(s, lc, cb_depth, x0, y0); ++ } else { ++ split_cu = (log2_cb_size > s->ps.sps->log2_min_cb_size); ++ } ++ if (s->ps.pps->cu_qp_delta_enabled_flag && ++ log2_cb_size >= s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_qp_delta_depth) { ++ lc->tu.is_cu_qp_delta_coded = 0; ++ lc->tu.cu_qp_delta = 0; ++ } ++ ++ lc->tu.is_cu_chroma_qp_offset_coded = !(s->sh.cu_chroma_qp_offset_enabled_flag && ++ log2_cb_size >= s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_chroma_qp_offset_depth); ++ lc->tu.cu_qp_offset_cb = 0; ++ lc->tu.cu_qp_offset_cr = 0; ++ ++ if (split_cu) { ++ int qp_block_mask = (1<<(s->ps.sps->log2_ctb_size - s->ps.pps->diff_cu_qp_delta_depth)) - 1; ++ const int cb_size_split = cb_size >> 1; ++ const int x1 = x0 + cb_size_split; ++ const int y1 = y0 + cb_size_split; ++ ++ int more_data = 0; ++ ++ more_data = hls_coding_quadtree(s, lc, x0, y0, log2_cb_size - 1, cb_depth + 1); ++ if (more_data < 0) ++ return more_data; ++ ++ if (more_data && x1 < s->ps.sps->width) { ++ more_data = hls_coding_quadtree(s, lc, x1, y0, log2_cb_size - 1, cb_depth + 1); ++ if (more_data < 0) ++ return more_data; ++ } ++ if (more_data && y1 < s->ps.sps->height) { ++ more_data = hls_coding_quadtree(s, lc, x0, y1, log2_cb_size - 1, cb_depth + 1); ++ if (more_data < 0) ++ return more_data; ++ } ++ if (more_data && x1 < s->ps.sps->width && ++ y1 < s->ps.sps->height) { ++ more_data = hls_coding_quadtree(s, lc, x1, y1, log2_cb_size - 1, cb_depth + 1); ++ if (more_data < 0) ++ return more_data; ++ } ++ ++ if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 && ++ ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0) ++ lc->qPy_pred = lc->qp_y; ++ ++ if (more_data) ++ return ((x1 + cb_size_split) < s->ps.sps->width || ++ (y1 + cb_size_split) < s->ps.sps->height); ++ else ++ return 0; ++ } else { ++ ret = hls_coding_unit(s, lc, x0, y0, log2_cb_size); ++ if (ret < 0) ++ return ret; ++ if ((!((x0 + cb_size) % ++ (1 << (s->ps.sps->log2_ctb_size))) || ++ (x0 + cb_size >=
s->ps.sps->width)) && ++ (!((y0 + cb_size) % ++ (1 << (s->ps.sps->log2_ctb_size))) || ++ (y0 + cb_size >= s->ps.sps->height))) { ++ int end_of_slice_flag = ff_hevc_rpi_end_of_slice_flag_decode(lc); ++ return !end_of_slice_flag; ++ } else { ++ return 1; ++ } ++ } ++ ++ return 0; // NEVER ++} ++ ++static void hls_decode_neighbour(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const int x_ctb, const int y_ctb, const int ctb_addr_ts) ++{ ++ const int ctb_size = 1 << s->ps.sps->log2_ctb_size; ++ const int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; ++ const int ctb_addr_in_slice = ctb_addr_rs - s->sh.slice_addr; // slice_addr = RS addr of start of slice ++ const int idxX = s->ps.pps->col_idxX[x_ctb >> s->ps.sps->log2_ctb_size]; ++ ++ s->tab_slice_address[ctb_addr_rs] = s->sh.slice_addr; ++ ++ lc->end_of_tiles_x = idxX + 1 >= s->ps.pps->num_tile_columns ? s->ps.sps->width : ++ (s->ps.pps->col_bd[idxX + 1] << s->ps.sps->log2_ctb_size); ++ ++ if (ctb_addr_ts == 0 || s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[ctb_addr_ts - 1] || ++ (s->ps.pps->entropy_coding_sync_enabled_flag && (x_ctb >> s->ps.sps->log2_ctb_size) == s->ps.pps->col_bd[idxX])) ++ { ++// lc->first_qp_group = 1; ++ lc->qPy_pred = s->sh.slice_qp; ++ } ++ ++ lc->end_of_tiles_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height); ++ ++ lc->boundary_flags = 0; ++ ++ if (x_ctb <= 0 || s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - 1]]) ++ lc->boundary_flags |= BOUNDARY_LEFT_TILE; ++ if (x_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - 1]) ++ lc->boundary_flags |= BOUNDARY_LEFT_SLICE; ++ if (y_ctb <= 0 || s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]]) ++ lc->boundary_flags |= BOUNDARY_UPPER_TILE; ++ if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - s->ps.sps->ctb_width]) ++ lc->boundary_flags |= BOUNDARY_UPPER_SLICE; ++ ++ lc->ctb_left_flag = (lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0; ++ lc->ctb_up_flag = (lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0; ++ lc->ctb_up_left_flag = (lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 && ++ (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width); ++ ++ lc->ctb_up_right_flag = ((y_ctb > 0) && (x_ctb + ctb_size) < lc->end_of_tiles_x && ++ (ctb_addr_in_slice+1 >= s->ps.sps->ctb_width) && ++ (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1 - s->ps.sps->ctb_width]])); ++} ++ ++ ++static void rpi_execute_dblk_cmds(HEVCRpiContext * const s, HEVCRpiJob * const jb) ++{ ++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; ++ const unsigned int x0 = FFMAX(jb->bounds.x, ctb_size) - ctb_size; ++ const unsigned int y0 = FFMAX(jb->bounds.y, ctb_size) - ctb_size; ++ const unsigned int bound_r = jb->bounds.x + jb->bounds.w; ++ const unsigned int bound_b = jb->bounds.y + jb->bounds.h; ++ const int x_end = (bound_r >= s->ps.sps->width); ++ const int y_end = (bound_b >= s->ps.sps->height); ++ const unsigned int xr = bound_r - (x_end ? 0 : ctb_size); ++ const unsigned int yb = bound_b - (y_end ? 
0 : ctb_size); ++ unsigned int x, y; ++ ++ for (y = y0; y < yb; y += ctb_size ) { ++ for (x = x0; x < xr; x += ctb_size ) { ++ ff_hevc_rpi_hls_filter(s, x, y, ctb_size); ++ } ++ } ++ ++ // Flush (SAO) ++ if (y > y0) { ++ const int tile_end = y_end || ++ s->ps.pps->tile_id[jb->ctu_ts_last] != s->ps.pps->tile_id[jb->ctu_ts_last + 1]; ++ const unsigned int xl = x0 > ctb_size ? x0 - ctb_size : 0; ++ const unsigned int yt = y0 > ctb_size ? y0 - ctb_size : 0; ++ const unsigned int yb = tile_end ? bound_b : y - ctb_size; ++ ++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ xl, yt, bound_r - xl, yb - yt, ++ ctx_vshift(s, 1), 1, 1); ++ rpi_cache_flush_finish(rfe); ++ } ++ ++ // Signal ++ if (s->threads_type == FF_THREAD_FRAME && x_end && y0 > 0) { ++ ff_hevc_rpi_progress_signal_recon(s, y_end ? INT_MAX : y0 - 1); ++ } ++ ++ // Job done now ++ // ? Move outside this fn ++ job_free(s->jbc, jb); ++} ++ ++ ++// I-pred, transform_and_add for all blocks types done here ++// All ARM ++static void rpi_execute_pred_cmds(HEVCRpiContext * const s, HEVCRpiJob * const jb) ++{ ++ unsigned int i; ++ HEVCRpiIntraPredEnv * const iap = &jb->intra; ++ const HEVCPredCmd *cmd = iap->cmds; ++ ++ for (i = iap->n; i > 0; i--, cmd++) ++ { ++ switch (cmd->type) ++ { ++ case RPI_PRED_INTRA: ++ { ++ HEVCRpiLocalContextIntra lci; // Abbreviated local context ++ HEVCRpiLocalContext * const lc = (HEVCRpiLocalContext *)&lci; ++ lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->i_pred.mode; ++ lc->na.cand_bottom_left = (cmd->na >> 4) & 1; ++ lc->na.cand_left = (cmd->na >> 3) & 1; ++ lc->na.cand_up_left = (cmd->na >> 2) & 1; ++ lc->na.cand_up = (cmd->na >> 1) & 1; ++ lc->na.cand_up_right = (cmd->na >> 0) & 1; ++ if (cmd->c_idx == 0) ++ s->hpc.intra_pred[cmd->size - 2](s, lc, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); ++ else ++ s->hpc.intra_pred_c[cmd->size - 2](s, lc, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); ++ break; ++ } ++ ++ case RPI_PRED_ADD_RESIDUAL: ++ s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ break; ++ case RPI_PRED_ADD_DC: ++ s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); ++ break; ++ case RPI_PRED_ADD_RESIDUAL_U: ++ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); ++ break; ++ case RPI_PRED_ADD_RESIDUAL_V: ++ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); ++ break; ++ case RPI_PRED_ADD_RESIDUAL_C: ++ s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ break; ++ case RPI_PRED_ADD_DC_U: ++ case RPI_PRED_ADD_DC_V: ++ s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); ++ break; ++ ++ case RPI_PRED_I_PCM: ++ pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size); ++ break; ++ ++ default: ++ av_log(NULL, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type); ++ abort(); ++ } ++ } ++ ++ // Mark done ++ iap->n = 0; ++} ++ ++ ++// Set initial uniform job values & zero ctu_count ++static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first) ++{ ++ unsigned int i; ++ HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip; ++ HEVCRpiInterPredEnv *const yipe = &jb->luma_ip; ++ const HEVCRpiSPS * const sps = s->ps.sps; ++ ++ const uint16_t pic_width_y 
= sps->width; ++ const uint16_t pic_height_y = sps->height; ++ ++ const uint16_t pic_width_c = sps->width >> ctx_hshift(s, 1); ++ const uint16_t pic_height_c = sps->height >> ctx_vshift(s, 1); ++ ++ // We expect the pointer to change if we use another sps ++ if (sps != jb->sps) ++ { ++ worker_pic_free_one(jb); ++ ++ set_ipe_from_ici(cipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].chroma); ++ set_ipe_from_ici(yipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].luma); ++ ++ { ++ const int coefs_per_luma = HEVC_MAX_CTB_SIZE * HEVC_RPI_MAX_WIDTH; ++ const int coefs_per_chroma = (coefs_per_luma * 2) >> (ctx_vshift(s, 1) + ctx_hshift(s, 1)); ++ worker_pic_alloc_one(jb, coefs_per_luma + coefs_per_chroma); ++ } ++ ++ jb->sps = sps; ++ } ++ ++ jb->waited = 0; ++ jb->ctu_ts_first = ctu_ts_first; ++ jb->ctu_ts_last = -1; ++ ++ rpi_inter_pred_reset(cipe); ++ for (i = 0; i < cipe->n; i++) { ++ HEVCRpiInterPredQ * const cp = cipe->q + i; ++ qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s; ++ ++ u->next_src1.x = 0; ++ u->next_src1.y = 0; ++ u->next_src1.base = 0; ++ u->pic_cw = pic_width_c; ++ u->pic_ch = pic_height_c; ++ u->stride2 = av_rpi_sand_frame_stride2(s->frame); ++ u->stride1 = av_rpi_sand_frame_stride1(s->frame); ++ u->wdenom = s->sh.chroma_log2_weight_denom; ++ cp->last_l0 = &u->next_src1; ++ ++ u->next_fn = 0; ++ u->next_src2.x = 0; ++ u->next_src2.y = 0; ++ u->next_src2.base = 0; ++ cp->last_l1 = &u->next_src2; ++ ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); ++ } ++ ++ rpi_inter_pred_reset(yipe); ++ for (i = 0; i < yipe->n; i++) { ++ HEVCRpiInterPredQ * const yp = yipe->q + i; ++ qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s; ++ ++ y->next_src1.x = 0; ++ y->next_src1.y = 0; ++ y->next_src1.base = 0; ++ y->next_src2.x = 0; ++ y->next_src2.y = 0; ++ y->next_src2.base = 0; ++ y->pic_h = pic_height_y; ++ y->pic_w = pic_width_y; ++ y->stride2 = av_rpi_sand_frame_stride2(s->frame); ++ y->stride1 = av_rpi_sand_frame_stride1(s->frame); ++ y->wdenom = s->sh.luma_log2_weight_denom; ++ y->next_fn = 0; ++ yp->last_l0 = &y->next_src1; ++ yp->last_l1 = &y->next_src2; ++ ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1); ++ } ++ ++ jb->last_y8_p = NULL; ++ jb->last_y8_l1 = NULL; ++ ++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) { ++ jb->progress_req[i] = -1; ++ } ++ ++ worker_pic_reset(&jb->coeffs); ++} ++ ++ ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++static unsigned int mc_terminate_add_qpu(const HEVCRpiContext * const s, ++ const vpu_qpu_job_h vqj, ++ rpi_cache_flush_env_t * const rfe, ++ HEVCRpiInterPredEnv * const ipe) ++{ ++ unsigned int i; ++ uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS]; ++ unsigned int max_block = 0; ++ ++ if (!ipe->used) { ++ return 0; ++ } ++ ++ if (ipe->curr != 0) { ++ rpi_inter_pred_sync(ipe); ++ } ++ ++ // Add final commands to Q ++ for(i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const yp = ipe->q + i; ++ qpu_mc_src_t *const p0 = yp->last_l0; ++ qpu_mc_src_t *const p1 = yp->last_l1; ++ const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base; ++ ++ if (block_size > max_block) ++ max_block = block_size; ++ ++ yp->qpu_mc_curr->data[-1] = yp->code_exit; ++ ++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched ++ p0->x = MC_DUMMY_X; ++ p0->y = MC_DUMMY_Y; ++ p0->base = s->qpu_dummy_frame_qpu; ++ p1->x = MC_DUMMY_X; ++ p1->y = MC_DUMMY_Y; ++ p1->base = s->qpu_dummy_frame_qpu; ++ ++ yp->last_l0 = NULL; ++ yp->last_l1 = NULL; ++ ++ // Add to mailbox list ++ mail[i][0] = ipe->gptr.vc + ((uint8_t 
*)yp->qpu_mc_base - ipe->gptr.arm); ++ mail[i][1] = yp->code_setup; ++ } ++ ++ // We don't need invalidate here as the uniforms aren't changed by the QPU ++ // and leaving them in ARM cache avoids (pointless) pre-reads when writing ++ // new values which seems to give us a small performance advantage ++ // ++ // In most cases we will not have a completely packed set of uniforms and as ++ // we have a 2d invalidate we writeback all uniform Qs to the depth of the ++ // fullest ++ rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK, ++ (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block, ++ ipe->n, ipe->max_fill + ipe->min_gap); ++ vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail); ++ ++ return 1; ++} ++#endif ++ ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++static unsigned int mc_terminate_add_emu(const HEVCRpiContext * const s, ++ const vpu_qpu_job_h vqj, ++ rpi_cache_flush_env_t * const rfe, ++ HEVCRpiInterPredEnv * const ipe) ++{ ++ unsigned int i; ++ if (!ipe->used) { ++ return 0; ++ } ++ ++ if (ipe->curr != 0) { ++ rpi_inter_pred_sync(ipe); ++ } ++ ++ // Add final commands to Q ++ for(i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const yp = ipe->q + i; ++ qpu_mc_src_t *const p0 = yp->last_l0; ++ qpu_mc_src_t *const p1 = yp->last_l1; ++ ++ yp->qpu_mc_curr->data[-1] = yp->code_exit; ++ ++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched ++ p0->x = MC_DUMMY_X; ++ p0->y = MC_DUMMY_Y; ++ p0->base = s->qpu_dummy_frame_emu; ++ p1->x = MC_DUMMY_X; ++ p1->y = MC_DUMMY_Y; ++ p1->base = s->qpu_dummy_frame_emu; ++ ++ yp->last_l0 = NULL; ++ yp->last_l1 = NULL; ++ } ++ ++ return 1; ++} ++#endif ++ ++ ++#if RPI_QPU_EMU_Y ++#define mc_terminate_add_y mc_terminate_add_emu ++#else ++#define mc_terminate_add_y mc_terminate_add_qpu ++#endif ++#if RPI_QPU_EMU_C ++#define mc_terminate_add_c mc_terminate_add_emu ++#else ++#define mc_terminate_add_c mc_terminate_add_qpu ++#endif ++ ++ ++static void flush_frame(HEVCRpiContext *s,AVFrame *frame) ++{ ++ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(); ++ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); ++ rpi_cache_flush_finish(rfe); ++} ++ ++static void job_gen_bounds(const HEVCRpiContext * const s, HEVCRpiJob * const jb) ++{ ++ const unsigned int rs0 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_first]; ++ const unsigned int rs1 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_last]; ++ const unsigned int ctb_width = s->ps.sps->ctb_width; ++ RpiBlk *const bounds = &jb->bounds; ++ av_assert1(jb->ctu_ts_first <= jb->ctu_ts_last); ++ bounds->x = (rs0 % ctb_width) << s->ps.sps->log2_ctb_size; ++ bounds->y = (rs0 / ctb_width) << s->ps.sps->log2_ctb_size; ++ bounds->w = ((rs1 - rs0) % ctb_width + 1) << s->ps.sps->log2_ctb_size; ++ bounds->h = ((rs1 - rs0) / ctb_width + 1) << s->ps.sps->log2_ctb_size; ++} ++ ++#if RPI_PASSES == 2 ++static void worker_core2(HEVCRpiContext * const s, HEVCRpiJob * const jb) ++{ ++ // Perform intra prediction and residual reconstruction ++ rpi_execute_pred_cmds(s, jb); ++ ++ // Perform deblocking for CTBs in this row ++ rpi_execute_dblk_cmds(s, jb); ++} ++#endif ++ ++ ++// Core execution tasks ++static void worker_core(HEVCRpiContext * const s0, HEVCRpiJob * const jb) ++{ ++ const HEVCRpiContext * const s = s0; ++ vpu_qpu_wait_h sync_y; ++ int pred_y, pred_c; ++ const vpu_qpu_job_h vqj = vpu_qpu_job_new(); ++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); ++ ++ { ++ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs; ++ if (cf->s[3].n + 
cf->s[2].n != 0) ++ { ++ const unsigned int csize = sizeof(cf->s[3].buf[0]); ++ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize; ++ vpu_qpu_job_add_vpu(vqj, ++ vpu_get_fn(s->ps.sps->bit_depth), ++ vpu_get_constants(), ++ cf->gptr.vc, ++ cf->s[2].n >> 8, ++ cf->gptr.vc + offset32, ++ cf->s[3].n >> 10, ++ 0); ++ ++ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize); ++ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize); ++ } ++ } ++ ++ pred_c = mc_terminate_add_c(s, vqj, rfe, &jb->chroma_ip); ++ ++// We could take a sync here and try to locally overlap QPU processing with ARM ++// but testing showed a slightly negative benefit with noticable extra complexity ++ ++ pred_y = mc_terminate_add_y(s, vqj, rfe, &jb->luma_ip); ++ ++ vpu_qpu_job_add_sync_this(vqj, &sync_y); ++ ++ rpi_cache_flush_execute(rfe); ++ ++ // Await progress as required ++ // jb->waited will only be clear if we have already tested the progress values ++ // (in worker_submit_job) and found we don't have to wait ++ if (jb->waited) ++ { ++ unsigned int i; ++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) { ++ if (jb->progress_req[i] >= 0) { ++ ff_hevc_rpi_progress_wait_recon(s, jb, s->DPB + i, jb->progress_req[i]); ++ } ++ } ++ } ++ ++ vpu_qpu_job_finish(vqj); ++ ++ // We always work on a rectangular block ++ if (pred_y || pred_c) ++ { ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE, ++ jb->bounds.x, jb->bounds.y, jb->bounds.w, jb->bounds.h, ++ ctx_vshift(s, 1), pred_y, pred_c); ++ } ++ ++ // If we have emulated VPU ops - do it here ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ if (av_rpi_is_sand8_frame(s->frame)) ++ { ++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C ++ ff_hevc_rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip); ++#elif RPI_QPU_EMU_Y ++ ff_hevc_rpi_shader_c8(s, &jb->luma_ip, NULL); ++#else ++ ff_hevc_rpi_shader_c8(s, NULL, &jb->chroma_ip); ++#endif ++ } ++ else ++ { ++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C ++ ff_hevc_rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip); ++#elif RPI_QPU_EMU_Y ++ ff_hevc_rpi_shader_c16(s, &jb->luma_ip, NULL); ++#else ++ ff_hevc_rpi_shader_c16(s, NULL, &jb->chroma_ip); ++#endif ++ } ++#endif ++ ++ // Wait for transform completion ++ // ? 
Could/should be moved to next pass which would let us add more jobs ++ // to the VPU Q on this thread but when I tried that it all went a bit slower ++ vpu_qpu_wait(&sync_y); ++ ++ rpi_cache_flush_finish(rfe); ++} ++ ++ ++static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe) ++{ ++ av_freep(&ipe->q); ++ gpu_free(&ipe->gptr); ++} ++ ++static HEVCRpiJob * job_new(void) ++{ ++ HEVCRpiJob * const jb = av_mallocz(sizeof(HEVCRpiJob)); ++ ++ ff_hevc_rpi_progress_init_wait(&jb->progress_wait); ++ ++ jb->intra.n = 0; ++ jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS); ++ ++ // * Sizeof the union structure might be overkill but at the moment it ++ // is correct (it certainly isn't going to be too small) ++ // *** really should add per ctu sync words to be accurate ++ ++ rpi_inter_pred_alloc(&jb->chroma_ip, ++ QPU_N_MAX, QPU_N_GRP, ++ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t), ++ QPU_C_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_c_t)); ++ rpi_inter_pred_alloc(&jb->luma_ip, ++ QPU_N_MAX, QPU_N_GRP, ++ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t), ++ QPU_Y_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_y_t)); ++ ++ return jb; ++} ++ ++static void job_delete(HEVCRpiJob * const jb) ++{ ++ worker_pic_free_one(jb); ++ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait); ++ av_freep(&jb->intra.cmds); ++ rpi_free_inter_pred(&jb->chroma_ip); ++ rpi_free_inter_pred(&jb->luma_ip); ++} ++ ++static void jbg_delete(HEVCRpiJobGlobal * const jbg) ++{ ++ HEVCRpiJob * jb; ++ ++ if (jbg == NULL) ++ return; ++ ++ jb = jbg->free1; ++ while (jb != NULL) ++ { ++ HEVCRpiJob * const jb2 = jb; ++ jb = jb2->next; ++ job_delete(jb2); ++ } ++ ++ pthread_mutex_destroy(&jbg->lock); ++ av_free(jbg); ++} ++ ++static HEVCRpiJobGlobal * jbg_new(unsigned int job_count) ++{ ++ HEVCRpiJobGlobal * const jbg = av_mallocz(sizeof(HEVCRpiJobGlobal)); ++ if (jbg == NULL) ++ return NULL; ++ ++ pthread_mutex_init(&jbg->lock, NULL); ++ ++ while (job_count-- != 0) ++ { ++ HEVCRpiJob * const jb = job_new(); ++ if (jb == NULL) ++ goto fail; ++ ++ jb->next = jbg->free1; ++ jbg->free1 = jb; ++ } ++ ++ return jbg; ++ ++fail: ++ jbg_delete(jbg); ++ return NULL; ++} ++ ++static void rpi_job_ctl_delete(HEVCRpiJobCtl * const jbc) ++{ ++ HEVCRpiJobGlobal * jbg; ++ ++ if (jbc == NULL) ++ return; ++ ++ jbg = jbc->jbg; ++ ++ if (jbc->jb1 != NULL) ++ job_delete(jbc->jb1); ++ ++ pthread_mutex_destroy(&jbc->in_lock); ++ sem_destroy(&jbc->sem_out); ++ av_free(jbc); ++ ++ // Deref the global job context ++ if (jbg != NULL && atomic_fetch_add(&jbg->ref_count, -1) == 1) ++ jbg_delete(jbg); ++} ++ ++static HEVCRpiJobCtl * rpi_job_ctl_new(HEVCRpiJobGlobal *const jbg) ++{ ++ HEVCRpiJobCtl * const jbc = av_mallocz(sizeof(HEVCRpiJobCtl)); ++ ++ if (jbc == NULL) ++ return NULL; ++ ++ jbc->jbg = jbg; ++ atomic_fetch_add(&jbg->ref_count, 1); ++ ++ sem_init(&jbc->sem_out, 0, RPI_MAX_JOBS); ++ pthread_mutex_init(&jbc->in_lock, NULL); ++ ++ if ((jbc->jb1 = job_new()) == NULL) ++ goto fail; ++ jbc->jb1->jbc_local = jbc; ++ ++ return jbc; ++ ++fail: ++ rpi_job_ctl_delete(jbc); ++ return NULL; ++} ++ ++ ++ ++static av_cold void hevc_init_worker(HEVCRpiContext * const s) ++{ ++#if RPI_PASSES == 2 ++ pass_queue_init(s->passq + 1, s, worker_core2, &s->jbc->sem_out, 1); ++#elif RPI_PASSES == 3 ++ pass_queue_init(s->passq + 2, s, rpi_execute_dblk_cmds, &s->jbc->sem_out, 2); ++ pass_queue_init(s->passq + 1, s, rpi_execute_pred_cmds, &s->passq[2].sem_in, 1); ++#else ++#error Passes confused ++#endif ++ pass_queue_init(s->passq + 0, s, worker_core, &s->passq[1].sem_in, 0); ++ ++ 
pass_queues_start_all(s); ++} ++ ++static av_cold void hevc_exit_worker(HEVCRpiContext *s) ++{ ++ pass_queues_term_all(s); ++ ++ pass_queues_kill_all(s); ++ ++ rpi_job_ctl_delete(s->jbc); ++ s->jbc = NULL; ++} ++ ++ ++static int slice_start(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc) ++{ ++ const int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; ++ const int tiles = s->ps.pps->num_tile_rows * s->ps.pps->num_tile_columns; ++ ++ // Check for obvious disasters ++ if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) { ++ av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (s->sh.dependent_slice_segment_flag) { ++ int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1]; ++ if (s->tab_slice_address[prev_rs] != s->sh.slice_addr) { ++ av_log(s->avctx, AV_LOG_ERROR, "Previous slice segment missing\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ ++ if (!s->ps.pps->entropy_coding_sync_enabled_flag && ++ s->ps.pps->tile_id[ctb_addr_ts] + s->sh.num_entry_point_offsets >= tiles) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Entry points exceed tiles\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ // Tiled stuff must start at start of tile if it has multiple entry points ++ if (!s->ps.pps->entropy_coding_sync_enabled_flag && ++ s->sh.num_entry_point_offsets != 0 && ++ s->sh.slice_ctb_addr_rs != s->ps.pps->tile_pos_rs[s->ps.pps->tile_id[ctb_addr_ts]]) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Multiple tiles in slice; slice start != tile start\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ // Setup any required decode vars ++ if (!s->sh.dependent_slice_segment_flag) ++ lc->qPy_pred = s->sh.slice_qp; ++ ++ lc->qp_y = s->sh.slice_qp; ++ ++ // General setup ++ lc->wpp_init = 0; ++ lc->bt_line_no = 0; ++ lc->ts = ctb_addr_ts; ++ return 0; ++} ++ ++static int gen_entry_points(HEVCRpiContext * const s, const H2645NAL * const nal) ++{ ++ const GetBitContext * const gb = &s->HEVClc->gb; ++ int i, j; ++ ++ const unsigned int length = nal->size; ++ unsigned int offset = ((gb->index) >> 3) + 1; // We have a bit & align still to come = +1 byte ++ unsigned int cmpt; ++ unsigned int startheader; ++ ++ if (s->sh.num_entry_point_offsets == 0) { ++ return 0; ++ } ++ ++ for (j = 0, cmpt = 0, startheader = offset + s->sh.entry_point_offset[0]; j < nal->skipped_bytes; j++) { ++ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) { ++ startheader--; ++ cmpt++; ++ } ++ } ++ ++ for (i = 1; i < s->sh.num_entry_point_offsets; i++) { ++ offset += (s->sh.entry_point_offset[i - 1] - cmpt); ++ for (j = 0, cmpt = 0, startheader = offset ++ + s->sh.entry_point_offset[i]; j < nal->skipped_bytes; j++) { ++ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) { ++ startheader--; ++ cmpt++; ++ } ++ } ++ s->sh.size[i - 1] = s->sh.entry_point_offset[i] - cmpt; ++ s->sh.offset[i - 1] = offset; ++ } ++ if (s->sh.num_entry_point_offsets != 0) { ++ offset += s->sh.entry_point_offset[s->sh.num_entry_point_offsets - 1] - cmpt; ++ if (length < offset) { ++ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ s->sh.size[s->sh.num_entry_point_offsets - 1] = length - offset; ++ s->sh.offset[s->sh.num_entry_point_offsets - 1] = offset; ++ } ++ s->data = nal->data; ++ return 0; ++} ++ ++ ++// Return ++// < 0 Error ++// 0 OK ++// ++// jb->ctu_ts_last < 0 Job still filling ++// jb->ctu_ts_last >= 0 Job ready ++ ++static int 
fill_job(HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, unsigned int max_blocks) ++{ ++ const int ctb_size = (1 << s->ps.sps->log2_ctb_size); ++ HEVCRpiJob * const jb = lc->jb0; ++ int more_data = 1; ++ int ctb_addr_ts = lc->ts; ++ ++ lc->unit_done = 0; ++ while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) ++ { ++ const int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; ++ const int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size; ++ const int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size; ++ int q_full; ++ ++ hls_decode_neighbour(s, lc, x_ctb, y_ctb, ctb_addr_ts); ++ ++ ff_hevc_rpi_cabac_init(s, lc, ctb_addr_ts); ++ ++ hls_sao_param(s, lc, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size); ++ ++ s->deblock[ctb_addr_rs].beta_offset = s->sh.beta_offset; ++ s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset; ++ s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; ++ ++ more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); ++ ++ if (more_data < 0) { ++ s->tab_slice_address[ctb_addr_rs] = -1; ++ return more_data; ++ } ++ ++ // Inc TS to next. ++ // N.B. None of the other position vars have changed ++ ctb_addr_ts++; ++ ff_hevc_rpi_save_states(s, lc, ctb_addr_ts); ++ ++ // Report progress so we can use our MVs in other frames ++ if (s->threads_type == FF_THREAD_FRAME && x_ctb + ctb_size >= s->ps.sps->width) { ++ ff_hevc_rpi_progress_signal_mv(s, y_ctb + ctb_size - 1); ++ } ++ ++ // End of line || End of tile line || End of tile ++ // (EoL covers end of frame for our purposes here) ++ q_full = x_ctb + ctb_size >= s->ps.sps->width || ++ s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts] != ctb_addr_rs + 1 || ++ s->ps.pps->tile_id[ctb_addr_ts - 1] != s->ps.pps->tile_id[ctb_addr_ts]; ++ ++ // Allocate QPU chuncks on fixed size 64 pel boundries rather than ++ // whatever ctb_size is today. ++ // * We might quite like to continue to 64 pel vertical too but that ++ // currently confuses WPP ++ if (((x_ctb + ctb_size) & 63) == 0 || q_full) ++ { ++ int overflow = 0; ++ if (rpi_inter_pred_next_ctu(&jb->luma_ip) != 0) ++ overflow = 1; ++ if (rpi_inter_pred_next_ctu(&jb->chroma_ip) != 0) ++ overflow = 1; ++ if (overflow) ++ { ++ // * This is very annoying (and slow) to cope with in WPP so ++ // we treat it as an error there (no known stream triggers this ++ // with the current buffer sizes). Non-wpp should cope fine. 
++ av_log(s, AV_LOG_WARNING, "%s: Q full before EoL\n", __func__); ++ q_full = 1; ++ } ++ } ++ ++ if (q_full) ++ { ++ // Do job ++ // Prep for submission ++ jb->ctu_ts_last = ctb_addr_ts - 1; // Was pre-inced ++ job_gen_bounds(s, jb); ++ break; ++ } ++ ++ // If max_blocks started as 0 then this will never be true ++ if (--max_blocks == 0) ++ break; ++ } ++ ++ lc->unit_done = (more_data <= 0); ++ lc->ts = ctb_addr_ts; ++ return 0; ++} ++ ++static void bt_lc_init(HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const unsigned int n) ++{ ++ lc->context = s; ++ lc->jb0 = NULL; ++ lc->lc_n = n; ++ lc->bt_terminate = 0; ++ lc->bt_psem_out = NULL; ++ sem_init(&lc->bt_sem_in, 0, 0); ++} ++ ++#define TRACE_WPP 0 ++#if RPI_EXTRA_BIT_THREADS > 0 ++static inline unsigned int line_ts_width(const HEVCRpiContext * const s, unsigned int ts) ++{ ++ unsigned int rs = s->ps.pps->ctb_addr_ts_to_rs[ts]; ++ return s->ps.pps->column_width[s->ps.pps->col_idxX[rs % s->ps.sps->ctb_width]]; ++} ++ ++// Move local context parameters from an aux bit thread back to the main ++// thread at the end of a slice as processing is going to continue there. ++static void movlc(HEVCRpiLocalContext *const dst_lc, HEVCRpiLocalContext *const src_lc, const int is_dep) ++{ ++ if (src_lc == dst_lc) { ++ return; ++ } ++ ++ // Move the job ++ // We will still have an active job if the final line terminates early ++ // Dest should always be null by now ++ av_assert1(dst_lc->jb0 == NULL); ++ dst_lc->jb0 = src_lc->jb0; ++ src_lc->jb0 = NULL; ++ ++ // Always need to store where we are in the bitstream ++ dst_lc->ts = src_lc->ts; ++ dst_lc->gb = src_lc->gb; ++ // Need to store context if we might have a dependent seg ++ if (is_dep) ++ { ++ dst_lc->qPy_pred = src_lc->qPy_pred; ++ memcpy(dst_lc->cabac_state, src_lc->cabac_state, sizeof(src_lc->cabac_state)); ++ memcpy(dst_lc->stat_coeff, src_lc->stat_coeff, sizeof(src_lc->stat_coeff)); ++ } ++} ++ ++static inline int wait_bt_sem_in(HEVCRpiLocalContext * const lc) ++{ ++ rpi_sem_wait(&lc->bt_sem_in); ++ return lc->bt_terminate; ++} ++ ++// Do one WPP line ++// Will not work correctly over horizontal tile boundries - vertical should be OK ++static int rpi_run_one_line(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc, const int is_first) ++{ ++ const int is_tile = lc->bt_is_tile; ++ const unsigned int tile_id = s->ps.pps->tile_id[lc->ts]; ++ const unsigned int line = lc->bt_line_no; ++ const unsigned int line_inc = lc->bt_line_inc; ++ const int is_last = (line >= lc->bt_last_line); ++ ++ const unsigned int ts_eol = lc->ts + (is_tile ? s->ps.pps->tile_size[tile_id] : lc->bt_line_width); ++ const unsigned int ts_next = ++ line + line_inc > (unsigned int)s->sh.num_entry_point_offsets ? ++ INT_MAX : ++ is_tile ? ++ s->ps.pps->ctb_addr_rs_to_ts[s->ps.pps->tile_pos_rs[tile_id + line_inc]] : ++ lc->ts + lc->bt_line_width * line_inc; ++ // Tile wants line, WPP a few CTUs (must be >= 2 for cabac context to work) ++ const unsigned int partial_size = is_tile ? line_ts_width(s, lc->ts) : 2; ++ unsigned int ts_prev; ++ int loop_n = 0; ++ int err = 0; ++ ++ av_assert1(line <= s->sh.num_entry_point_offsets); ++ ++#if TRACE_WPP ++ printf("%s[%d]: Start %s: tile=%d, line=%d/%d/%d, ts=%d/%d/%d, width=%d, jb=%p\n", __func__, ++ lc->lc_n, is_tile ? 
"Tile" : "WPP", tile_id, ++ line, lc->bt_last_line, s->sh.num_entry_point_offsets, ++ lc->ts, ts_eol, ts_next, partial_size, lc->jb0); ++#endif ++ if (line != 0) ++ { ++ const uint8_t * const data = s->data + s->sh.offset[line - 1]; ++ const unsigned int len = s->sh.size[line - 1]; ++ if ((err = init_get_bits8(&lc->gb, data, len)) < 0) ++ return err; ++ ++ ff_init_cabac_decoder(&lc->cc, data, len); ++ ++ lc->wpp_init = 1; // Stop ff_hevc_rpi_cabac_init trying to read non-existant termination bits ++ } ++ ++ // We should never be processing a dependent slice here so reset is good ++ // ?? These probably shouldn't be needed (as they should be set by later ++ // logic) but do seem to be required ++ lc->qPy_pred = s->sh.slice_qp; ++ lc->qp_y = s->sh.slice_qp; ++ ++ do ++ { ++ if (!is_last && loop_n > 1) { ++#if TRACE_WPP ++ printf("%s[%d]: %sPoke %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", lc->bt_psem_out); ++#endif ++ sem_post(lc->bt_psem_out); ++ } ++ if (!is_first && loop_n != 0) ++ { ++#if TRACE_WPP ++ printf("%s[%d]: %sWait %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", &lc->bt_sem_in); ++#endif ++ if (wait_bt_sem_in(lc) != 0) ++ return AVERROR_EXIT; ++ } ++ ++#if TRACE_WPP ++ { ++ int n; ++ sem_getvalue(&lc->bt_sem_in, &n); ++ printf("%s[%d]: ts=%d, sem=%d %p\n", __func__, lc->lc_n, lc->ts, n, &lc->bt_sem_in); ++ } ++#endif ++ ++ ts_prev = lc->ts; ++ ++ // If we have had an error - do no further decode but do continue ++ // moving signals around so the other threads continue to operate ++ // correctly (or at least as correctly as they can with this line missing) ++ // ++ // Errors in WPP/Tile are less fatal than normal as we have a good idea ++ // of how to restart on the next line so there is no need to give up totally ++ if (err != 0) ++ { ++ lc->unit_done = 0; ++ lc->ts += partial_size; ++ } ++ else ++ { ++ worker_pass0_ready(s, lc); ++ ++ if ((err = fill_job(s, lc, partial_size)) < 0 || ++ (lc->ts < ts_eol && !is_last && (lc->ts != ts_prev + partial_size || lc->unit_done))) ++ { ++ if (err == 0) { ++ av_log(s, AV_LOG_ERROR, "Unexpected end of tile/wpp section\n"); ++ err = AVERROR_INVALIDDATA; ++ } ++ worker_free(s, lc); ++ lc->ts = ts_prev + partial_size; // Pretend we did all that ++ lc->unit_done = 0; ++ } ++ else if (is_tile) ++ { ++ worker_submit_job(s, lc); ++ } ++ } ++ ++ ++loop_n; ++ } while (lc->ts < ts_eol && !lc->unit_done); ++ ++ // If we are on the last line & we didn't get a whole line we must wait for ++ // and sink the sem_posts from the line above / tile to the left. 
++ while ((ts_prev += partial_size) < ts_eol) ++ { ++#if TRACE_WPP ++ printf("%s[%d]: EOL Wait: ts=%d %p\n", __func__, lc->lc_n, ts_prev, &lc->bt_sem_in); ++#endif ++ if (wait_bt_sem_in(lc) != 0) ++ return AVERROR_EXIT; ++ } ++ ++ lc->bt_line_no += line_inc; ++ ++ if (!is_tile && err == 0) ++ worker_submit_job(s, lc); ++ ++ if (!is_last) { ++ lc->ts = ts_next; ++ ++#if TRACE_WPP ++ printf("%s[%d]: Poke post submit %p\n", __func__, lc->lc_n, lc->bt_psem_out); ++#endif ++ sem_post(lc->bt_psem_out); ++ if (loop_n > 1) { ++#if TRACE_WPP ++ printf("%s[%d]: Poke post submit2 %p\n", __func__, lc->lc_n, lc->bt_psem_out); ++#endif ++ sem_post(lc->bt_psem_out); ++ } ++ } ++ else ++ { ++ movlc(s->HEVClcList[0], lc, s->ps.pps->dependent_slice_segments_enabled_flag); ++ ++ // When all done poke the thread 0 sem_in one final time ++#if TRACE_WPP ++ printf("%s[%d]: Poke final %p\n", __func__, lc->lc_n, &s->HEVClcList[0]->bt_sem_in); ++#endif ++ sem_post(&s->HEVClcList[0]->bt_sem_in); ++ } ++ ++#if TRACE_WPP ++ printf("%s[%d]: End. dep=%d\n", __func__, lc->lc_n, s->ps.pps->dependent_slice_segments_enabled_flag); ++#endif ++ return err; ++} ++ ++static void wpp_setup_lcs(HEVCRpiContext * const s) ++{ ++ unsigned int ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; ++ const unsigned int line_width = line_ts_width(s, ts); ++ ++ for (int i = 0; i <= s->sh.num_entry_point_offsets && i < RPI_BIT_THREADS; ++i) ++ { ++ HEVCRpiLocalContext * const lc = s->HEVClcList[i]; ++ lc->ts = ts; ++ lc->bt_is_tile = 0; ++ lc->bt_line_no = i; ++ lc->bt_line_width = line_width; ++ lc->bt_last_line = s->sh.num_entry_point_offsets; ++ lc->bt_line_inc = RPI_BIT_THREADS; ++ ts += line_width; ++ } ++} ++ ++ ++// Can only process tile single row at once ++static void tile_one_row_setup_lcs(HEVCRpiContext * const s, unsigned int slice_row) ++{ ++ const HEVCRpiPPS * const pps = s->ps.pps; ++ const unsigned int ts0 = pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; ++ const unsigned int tile0 = pps->tile_id[ts0]; ++ const unsigned int col0 = tile0 % pps->num_tile_columns; ++ ++ const unsigned int col = (slice_row == 0) ? 
col0 : 0; ++ unsigned int line = slice_row * pps->num_tile_columns - col0 + col; ++ const unsigned int last_line = FFMIN( ++ line + pps->num_tile_columns - 1 - col, s->sh.num_entry_point_offsets); ++ ++ const unsigned int par = ++ FFMIN(RPI_BIT_THREADS, last_line + 1 - line); ++#if TRACE_WPP ++ printf("ts0=%d, ents=%d, row=%d, tiles=%dx%d, col=%d, par=%d, line=%d/%d\n", ts0, s->sh.num_entry_point_offsets, slice_row, ++ pps->num_tile_columns, pps->num_tile_rows, col, par, line, last_line); ++#endif ++ for (unsigned int i = 0; i != par; ++i, ++line) ++ { ++ HEVCRpiLocalContext * const lc = s->HEVClcList[i]; ++ const unsigned int tile = tile0 + line; ++ ++ lc->ts = pps->ctb_addr_rs_to_ts[pps->tile_pos_rs[tile]]; ++ lc->bt_line_no = line; ++ lc->bt_is_tile = 1; ++ lc->bt_line_width = line_ts_width(s, lc->ts); ++ lc->bt_last_line = last_line; ++ lc->bt_line_inc = par; ++ } ++} ++ ++ ++static void * bit_thread(void * v) ++{ ++ HEVCRpiLocalContext * const lc = v; ++ HEVCRpiContext *const s = lc->context; ++ ++ while (wait_bt_sem_in(lc) == 0) ++ { ++ int err; ++ ++ if ((err = rpi_run_one_line(s, lc, 0)) < 0) { // Never first tile/wpp ++ if (lc->bt_terminate) { ++ av_log(s, AV_LOG_ERROR, "%s: Unexpected termination\n", __func__); ++ break; ++ } ++ av_log(s, AV_LOG_WARNING, "%s: Decode failure: %d\n", __func__, err); ++ } ++ } ++ ++ return NULL; ++} ++ ++static int bit_threads_start(HEVCRpiContext * const s) ++{ ++ if (s->bt_started) ++ return 0; ++ ++ for (int i = 1; i < RPI_BIT_THREADS; ++i) ++ { ++ // lc[0] belongs to the main thread - this sets up lc[1..RPI_BIT_THREADS] ++ if (s->HEVClcList[i] == NULL) { ++ if ((s->HEVClcList[i] = av_mallocz(sizeof(*s->HEVClcList[0]))) == NULL) ++ return -1; ++ } ++ ++ bt_lc_init(s, s->HEVClcList[i], i); ++ job_lc_init(s->HEVClcList[i]); ++ } ++ ++ // Link the sems in a circle ++ for (int i = 0; i < RPI_BIT_THREADS - 1; ++i) ++ s->HEVClcList[i]->bt_psem_out = &s->HEVClcList[i + 1]->bt_sem_in; ++ s->HEVClcList[RPI_BIT_THREADS - 1]->bt_psem_out = &s->HEVClcList[0]->bt_sem_in; ++ ++ // Init all lc before starting any threads ++ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i) ++ { ++ if (pthread_create(s->bit_threads + i, NULL, bit_thread, s->HEVClcList[i + 1]) < 0) ++ return -1; ++ } ++ ++ s->bt_started = 1; ++ return 0; ++} ++ ++static int bit_threads_kill(HEVCRpiContext * const s) ++{ ++ if (!s->bt_started) ++ return 0; ++ s->bt_started = 0; ++ ++ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i) ++ { ++ HEVCRpiLocalContext *const lc = s->HEVClcList[i + 1]; ++ if (lc == NULL) ++ break; ++ ++ lc->bt_terminate = 1; ++ sem_post(&lc->bt_sem_in); ++ pthread_join(s->bit_threads[i], NULL); ++ ++ sem_destroy(&lc->bt_sem_in); ++ job_lc_kill(lc); ++ } ++ return 0; ++} ++#endif ++ ++ ++static int rpi_decode_entry(AVCodecContext *avctxt, void *isFilterThread) ++{ ++ HEVCRpiContext * const s = avctxt->priv_data; ++ HEVCRpiLocalContext * const lc = s->HEVClc; ++ int err; ++ ++ // Start of slice ++ if ((err = slice_start(s, lc)) != 0) ++ return err; ++ ++#if RPI_EXTRA_BIT_THREADS > 0 ++ ++ if (s->sh.num_entry_point_offsets != 0 && ++ s->ps.pps->num_tile_columns > 1) ++ { ++ unsigned int slice_row = 0; ++ ++#if TRACE_WPP ++ printf("%s: Do Tiles\n", __func__); ++#endif ++ // Generate & start extra bit threads if they aren't already running ++ bit_threads_start(s); ++ ++ do ++ { ++ // Reset lc lines etc. 
++ tile_one_row_setup_lcs(s, slice_row); ++ ++#if TRACE_WPP ++ printf("%s: Row %d: Do 1st: line=%d/%d/%d\n", ++ __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets); ++#endif ++ ++ rpi_run_one_line(s, lc, 1); // Kicks off the other threads ++#if TRACE_WPP ++ printf("%s: Row %d: Done 1st: line=%d/%d/%d\n", ++ __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets); ++#endif ++ ++ while (lc->bt_line_no <= lc->bt_last_line) { ++ rpi_sem_wait(&lc->bt_sem_in); ++ rpi_run_one_line(s, lc, 0); ++ } ++#if TRACE_WPP ++ printf("%s: Done body\n", __func__); ++#endif ++ ++ // Wait for everything else to finish ++ rpi_sem_wait(&lc->bt_sem_in); ++ ++ ++slice_row; ++ } while (lc->bt_last_line < s->sh.num_entry_point_offsets); ++ ++ ++#if TRACE_WPP ++ printf("%s: Done wait: ts=%d\n", __func__, lc->ts); ++#endif ++ } ++ else ++ ++ // * We only cope with WPP in a single column ++ // Probably want to deal with that case as tiles rather than WPP anyway ++ // ?? Not actually sure that the main code deals with WPP + multi-col correctly ++ if (s->ps.pps->entropy_coding_sync_enabled_flag && ++ s->ps.pps->num_tile_columns == 1 && ++ s->sh.num_entry_point_offsets != 0) ++ { ++#if TRACE_WPP ++ printf("%s: Do WPP\n", __func__); ++#endif ++ // Generate & start extra bit threads if they aren't already running ++ bit_threads_start(s); ++ ++ // Reset lc lines etc. ++ wpp_setup_lcs(s); ++ ++ rpi_run_one_line(s, lc, 1); // Kicks off the other threads ++#if TRACE_WPP ++ printf("%s: Done 1st\n", __func__); ++#endif ++ ++ while (lc->bt_line_no <= s->sh.num_entry_point_offsets) { ++ rpi_sem_wait(&lc->bt_sem_in); ++ rpi_run_one_line(s, lc, 0); ++ } ++#if TRACE_WPP ++ printf("%s: Done body\n", __func__); ++#endif ++ ++ // Wait for everything else to finish ++ rpi_sem_wait(&lc->bt_sem_in); ++ ++#if TRACE_WPP ++ printf("%s: Done wait: ts=%d\n", __func__, lc->ts); ++#endif ++ } ++ else ++#endif ++ { ++#if TRACE_WPP ++ printf("%s: Single start: ts=%d\n", __func__, lc->ts); ++#endif ++ // Single bit thread ++ do { ++ // Make sure we have space to prepare the next job ++ worker_pass0_ready(s, lc); ++ ++ if ((err = fill_job(s, lc, 0)) < 0) ++ goto fail; ++ ++ worker_submit_job(s, lc); ++ } while (!lc->unit_done); ++ ++#if TRACE_WPP ++ printf("%s: Single end: ts=%d\n", __func__, lc->ts); ++#endif ++ } ++ ++ // If we have reached the end of the frame then wait for the worker to finish all its jobs ++ if (lc->ts >= s->ps.sps->ctb_size) { ++ worker_wait(s, lc); ++ } ++ ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n", ++ ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0, ++ ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge, ++ ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0, ++ ts->y_pred2_hgt16, ts->y_pred2_hle16); ++ memset(ts, 0, sizeof(*ts)); ++ } ++#endif ++ ++ return lc->ts; ++ ++fail: ++ // Cleanup ++ av_log(s, AV_LOG_ERROR, "%s failed: err=%d\n", __func__, err); ++ // Free our job & wait for temination ++ worker_free(s, lc); ++ worker_wait(s, lc); ++ return err; ++} ++ ++ ++static int hls_slice_data(HEVCRpiContext * const s, const H2645NAL * const nal) ++{ ++ int err; ++ if ((err = gen_entry_points(s, nal)) < 0) ++ return err; ++ ++ return rpi_decode_entry(s->avctx, NULL); ++} ++ ++static int set_side_data(HEVCRpiContext *s) ++{ ++ AVFrame *out = s->ref->frame; ++ 
++ if (s->sei.frame_packing.present && ++ s->sei.frame_packing.arrangement_type >= 3 && ++ s->sei.frame_packing.arrangement_type <= 5 && ++ s->sei.frame_packing.content_interpretation_type > 0 && ++ s->sei.frame_packing.content_interpretation_type < 3) { ++ AVStereo3D *stereo = av_stereo3d_create_side_data(out); ++ if (!stereo) ++ return AVERROR(ENOMEM); ++ ++ switch (s->sei.frame_packing.arrangement_type) { ++ case 3: ++ if (s->sei.frame_packing.quincunx_subsampling) ++ stereo->type = AV_STEREO3D_SIDEBYSIDE_QUINCUNX; ++ else ++ stereo->type = AV_STEREO3D_SIDEBYSIDE; ++ break; ++ case 4: ++ stereo->type = AV_STEREO3D_TOPBOTTOM; ++ break; ++ case 5: ++ stereo->type = AV_STEREO3D_FRAMESEQUENCE; ++ break; ++ } ++ ++ if (s->sei.frame_packing.content_interpretation_type == 2) ++ stereo->flags = AV_STEREO3D_FLAG_INVERT; ++ } ++ ++ if (s->sei.display_orientation.present && ++ (s->sei.display_orientation.anticlockwise_rotation || ++ s->sei.display_orientation.hflip || s->sei.display_orientation.vflip)) { ++ double angle = s->sei.display_orientation.anticlockwise_rotation * 360 / (double) (1 << 16); ++ AVFrameSideData *rotation = av_frame_new_side_data(out, ++ AV_FRAME_DATA_DISPLAYMATRIX, ++ sizeof(int32_t) * 9); ++ if (!rotation) ++ return AVERROR(ENOMEM); ++ ++ av_display_rotation_set((int32_t *)rotation->data, angle); ++ av_display_matrix_flip((int32_t *)rotation->data, ++ s->sei.display_orientation.hflip, ++ s->sei.display_orientation.vflip); ++ } ++ ++ // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1 ++ // so the side data persists for the entire coded video sequence. ++ if (s->sei.mastering_display.present > 0 && ++ IS_IRAP(s) && s->no_rasl_output_flag) { ++ s->sei.mastering_display.present--; ++ } ++ if (s->sei.mastering_display.present) { ++ // HEVC uses a g,b,r ordering, which we convert to a more natural r,g,b ++ const int mapping[3] = {2, 0, 1}; ++ const int chroma_den = 50000; ++ const int luma_den = 10000; ++ int i; ++ AVMasteringDisplayMetadata *metadata = ++ av_mastering_display_metadata_create_side_data(out); ++ if (!metadata) ++ return AVERROR(ENOMEM); ++ ++ for (i = 0; i < 3; i++) { ++ const int j = mapping[i]; ++ metadata->display_primaries[i][0].num = s->sei.mastering_display.display_primaries[j][0]; ++ metadata->display_primaries[i][0].den = chroma_den; ++ metadata->display_primaries[i][1].num = s->sei.mastering_display.display_primaries[j][1]; ++ metadata->display_primaries[i][1].den = chroma_den; ++ } ++ metadata->white_point[0].num = s->sei.mastering_display.white_point[0]; ++ metadata->white_point[0].den = chroma_den; ++ metadata->white_point[1].num = s->sei.mastering_display.white_point[1]; ++ metadata->white_point[1].den = chroma_den; ++ ++ metadata->max_luminance.num = s->sei.mastering_display.max_luminance; ++ metadata->max_luminance.den = luma_den; ++ metadata->min_luminance.num = s->sei.mastering_display.min_luminance; ++ metadata->min_luminance.den = luma_den; ++ metadata->has_luminance = 1; ++ metadata->has_primaries = 1; ++ ++ av_log(s->avctx, AV_LOG_DEBUG, "Mastering Display Metadata:\n"); ++ av_log(s->avctx, AV_LOG_DEBUG, ++ "r(%5.4f,%5.4f) g(%5.4f,%5.4f) b(%5.4f %5.4f) wp(%5.4f, %5.4f)\n", ++ av_q2d(metadata->display_primaries[0][0]), ++ av_q2d(metadata->display_primaries[0][1]), ++ av_q2d(metadata->display_primaries[1][0]), ++ av_q2d(metadata->display_primaries[1][1]), ++ av_q2d(metadata->display_primaries[2][0]), ++ av_q2d(metadata->display_primaries[2][1]), ++ av_q2d(metadata->white_point[0]), 
av_q2d(metadata->white_point[1])); ++ av_log(s->avctx, AV_LOG_DEBUG, ++ "min_luminance=%f, max_luminance=%f\n", ++ av_q2d(metadata->min_luminance), av_q2d(metadata->max_luminance)); ++ } ++ // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1 ++ // so the side data persists for the entire coded video sequence. ++ if (s->sei.content_light.present > 0 && ++ IS_IRAP(s) && s->no_rasl_output_flag) { ++ s->sei.content_light.present--; ++ } ++ if (s->sei.content_light.present) { ++ AVContentLightMetadata *metadata = ++ av_content_light_metadata_create_side_data(out); ++ if (!metadata) ++ return AVERROR(ENOMEM); ++ metadata->MaxCLL = s->sei.content_light.max_content_light_level; ++ metadata->MaxFALL = s->sei.content_light.max_pic_average_light_level; ++ ++ av_log(s->avctx, AV_LOG_DEBUG, "Content Light Level Metadata:\n"); ++ av_log(s->avctx, AV_LOG_DEBUG, "MaxCLL=%d, MaxFALL=%d\n", ++ metadata->MaxCLL, metadata->MaxFALL); ++ } ++ ++ if (s->sei.a53_caption.a53_caption) { ++ AVFrameSideData* sd = av_frame_new_side_data(out, ++ AV_FRAME_DATA_A53_CC, ++ s->sei.a53_caption.a53_caption_size); ++ if (sd) ++ memcpy(sd->data, s->sei.a53_caption.a53_caption, s->sei.a53_caption.a53_caption_size); ++ av_freep(&s->sei.a53_caption.a53_caption); ++ s->sei.a53_caption.a53_caption_size = 0; ++ s->avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS; ++ } ++ ++ if (s->sei.alternative_transfer.present && ++ av_color_transfer_name(s->sei.alternative_transfer.preferred_transfer_characteristics) && ++ s->sei.alternative_transfer.preferred_transfer_characteristics != AVCOL_TRC_UNSPECIFIED) { ++ s->avctx->color_trc = out->color_trc = s->sei.alternative_transfer.preferred_transfer_characteristics; ++ } ++ ++ return 0; ++} ++ ++static int hevc_frame_start(HEVCRpiContext * const s) ++{ ++ int pic_size_in_ctb = ((s->ps.sps->width >> s->ps.sps->log2_min_cb_size) + 1) * ++ ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1); ++ int ret; ++ ++ memset(s->horizontal_bs, 0, s->bs_width * s->bs_height); ++ memset(s->vertical_bs, 0, s->bs_width * s->bs_height); ++ memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height); ++ memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1)); ++ memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address)); ++ ++ s->is_decoded = 0; ++ s->first_nal_type = s->nal_unit_type; ++ ++ s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos); ++ ++ ret = ff_hevc_rpi_set_new_ref(s, &s->frame, s->poc); ++ if (ret < 0) ++ goto fail; ++ ++ ret = ff_hevc_rpi_frame_rps(s); ++ if (ret < 0) { ++ av_log(s->avctx, AV_LOG_ERROR, "Error constructing the frame RPS.\n"); ++ goto fail; ++ } ++ ++ s->ref->frame->key_frame = IS_IRAP(s); ++ ++ ret = set_side_data(s); ++ if (ret < 0) ++ goto fail; ++ ++ s->frame->pict_type = 3 - s->sh.slice_type; ++ ++ if (!IS_IRAP(s)) ++ ff_hevc_rpi_bump_frame(s); ++ ++ av_frame_unref(s->output_frame); ++ ret = ff_hevc_rpi_output_frame(s, s->output_frame, 0); ++ if (ret < 0) ++ goto fail; ++ ++ ff_thread_finish_setup(s->avctx); ++ ++ return 0; ++ ++fail: ++ if (s->ref) ++ ff_hevc_rpi_unref_frame(s, s->ref, ~0); ++ s->ref = NULL; ++ return ret; ++} ++ ++static int decode_nal_unit(HEVCRpiContext *s, const H2645NAL *nal) ++{ ++ GetBitContext * const gb = &s->HEVClc->gb; ++ int ctb_addr_ts, ret; ++ ++ *gb = nal->gb; ++ s->nal_unit_type = nal->type; ++ s->temporal_id = nal->temporal_id; ++ ++ switch (s->nal_unit_type) { ++ case HEVC_NAL_VPS: ++ 
ret = ff_hevc_rpi_decode_nal_vps(gb, s->avctx, &s->ps); ++ if (ret < 0) ++ goto fail; ++ break; ++ case HEVC_NAL_SPS: ++ ret = ff_hevc_rpi_decode_nal_sps(gb, s->avctx, &s->ps, ++ s->apply_defdispwin); ++ if (ret < 0) ++ goto fail; ++ break; ++ case HEVC_NAL_PPS: ++ ret = ff_hevc_rpi_decode_nal_pps(gb, s->avctx, &s->ps); ++ if (ret < 0) ++ goto fail; ++ break; ++ case HEVC_NAL_SEI_PREFIX: ++ case HEVC_NAL_SEI_SUFFIX: ++ ret = ff_hevc_rpi_decode_nal_sei(gb, s->avctx, &s->sei, &s->ps, s->nal_unit_type); ++ if (ret < 0) ++ goto fail; ++ break; ++ case HEVC_NAL_TRAIL_R: ++ case HEVC_NAL_TRAIL_N: ++ case HEVC_NAL_TSA_N: ++ case HEVC_NAL_TSA_R: ++ case HEVC_NAL_STSA_N: ++ case HEVC_NAL_STSA_R: ++ case HEVC_NAL_BLA_W_LP: ++ case HEVC_NAL_BLA_W_RADL: ++ case HEVC_NAL_BLA_N_LP: ++ case HEVC_NAL_IDR_W_RADL: ++ case HEVC_NAL_IDR_N_LP: ++ case HEVC_NAL_CRA_NUT: ++ case HEVC_NAL_RADL_N: ++ case HEVC_NAL_RADL_R: ++ case HEVC_NAL_RASL_N: ++ case HEVC_NAL_RASL_R: ++ ret = hls_slice_header(s); ++ if (ret < 0) ++ return ret; ++ ++ // The definition of _N unit types is "non-reference for other frames ++ // with the same temporal_id" so they may/will be ref frames for pics ++ // with a higher temporal_id. ++ s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 || ++ !(s->nal_unit_type == HEVC_NAL_TRAIL_N || ++ s->nal_unit_type == HEVC_NAL_TSA_N || ++ s->nal_unit_type == HEVC_NAL_STSA_N || ++ s->nal_unit_type == HEVC_NAL_RADL_N || ++ s->nal_unit_type == HEVC_NAL_RASL_N); ++ s->offload_recon = s->used_for_ref; ++// s->offload_recon = 0; ++ ++#if DEBUG_DECODE_N ++ { ++ static int z = 0; ++ if (IS_IDR(s)) { ++ z = 1; ++ } ++ if (z != 0 && z++ > DEBUG_DECODE_N) { ++ s->is_decoded = 0; ++ break; ++ } ++ } ++#endif ++ if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) { ++ s->is_decoded = 0; ++ break; ++ } ++ ++ if (s->sh.first_slice_in_pic_flag) { ++ if (s->max_ra == INT_MAX) { ++ if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) { ++ s->max_ra = s->poc; ++ } else { ++ if (IS_IDR(s)) ++ s->max_ra = INT_MIN; ++ } ++ } ++ ++ if ((s->nal_unit_type == HEVC_NAL_RASL_R || s->nal_unit_type == HEVC_NAL_RASL_N) && ++ s->poc <= s->max_ra) { ++ s->is_decoded = 0; ++ break; ++ } else { ++ if (s->nal_unit_type == HEVC_NAL_RASL_R && s->poc > s->max_ra) ++ s->max_ra = INT_MIN; ++ } ++ ++ ret = hevc_frame_start(s); ++ if (ret < 0) ++ return ret; ++ } else if (!s->ref) { ++ av_log(s->avctx, AV_LOG_ERROR, "First slice in a frame missing.\n"); ++ goto fail; ++ } ++ ++ if (s->nal_unit_type != s->first_nal_type) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Non-matching NAL types of the VCL NALUs: %d %d\n", ++ s->first_nal_type, s->nal_unit_type); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (!s->sh.dependent_slice_segment_flag && ++ s->sh.slice_type != HEVC_SLICE_I) { ++ ret = ff_hevc_rpi_slice_rpl(s); ++ if (ret < 0) { ++ av_log(s->avctx, AV_LOG_WARNING, ++ "Error constructing the reference lists for the current slice.\n"); ++ goto fail; ++ } ++ } ++ ++ ctb_addr_ts = hls_slice_data(s, nal); ++ if (ctb_addr_ts >= (s->ps.sps->ctb_width * s->ps.sps->ctb_height)) { ++ s->is_decoded = 1; ++ } ++ ++ if (ctb_addr_ts < 0) { ++ ret = ctb_addr_ts; ++ goto fail; ++ } ++ break; ++ case HEVC_NAL_EOS_NUT: ++ case HEVC_NAL_EOB_NUT: ++ s->seq_decode = (s->seq_decode + 1) & 0xff; ++ s->max_ra = INT_MAX; ++ break; ++ case HEVC_NAL_AUD: ++ case HEVC_NAL_FD_NUT: ++ break; ++ default: ++ av_log(s->avctx, AV_LOG_INFO, ++ "Skipping NAL unit %d\n", s->nal_unit_type); ++ } ++ ++ return 0; ++fail: ++ if (s->avctx->err_recognition & 
AV_EF_EXPLODE) ++ return ret; ++ return 0; ++} ++ ++static int decode_nal_units(HEVCRpiContext *s, const uint8_t *buf, int length) ++{ ++ int i, ret = 0; ++ int eos_at_start = 1; ++ ++ s->ref = NULL; ++ s->last_eos = s->eos; ++ s->eos = 0; ++ ++ /* split the input packet into NAL units, so we know the upper bound on the ++ * number of slices in the frame */ ++ ret = ff_h2645_packet_split(&s->pkt, buf, length, s->avctx, s->is_nalff, ++ s->nal_length_size, s->avctx->codec_id, 1); ++ if (ret < 0) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Error splitting the input into NAL units.\n"); ++ return ret; ++ } ++ ++ for (i = 0; i < s->pkt.nb_nals; i++) { ++ if (s->pkt.nals[i].type == HEVC_NAL_EOB_NUT || ++ s->pkt.nals[i].type == HEVC_NAL_EOS_NUT) { ++ if (eos_at_start) { ++ s->last_eos = 1; ++ } else { ++ s->eos = 1; ++ } ++ } else { ++ eos_at_start = 0; ++ } ++ } ++ ++ /* decode the NAL units */ ++ for (i = 0; i < s->pkt.nb_nals; i++) { ++ ret = decode_nal_unit(s, &s->pkt.nals[i]); ++ if (ret < 0) { ++ av_log(s->avctx, AV_LOG_WARNING, ++ "Error parsing NAL unit #%d.\n", i); ++ goto fail; ++ } ++ } ++ ++fail: // Also success path ++ if (s->ref != NULL) { ++ if (s->used_for_ref && s->threads_type == FF_THREAD_FRAME) { ++ ff_hevc_rpi_progress_signal_all_done(s); ++ } ++ else { ++ // Flush frame to real memory as we expect to be able to pass ++ // it straight on to mmal ++ flush_frame(s, s->frame); ++ } ++ } ++ return ret; ++} ++ ++static void print_md5(void *log_ctx, int level, uint8_t md5[16]) ++{ ++ int i; ++ for (i = 0; i < 16; i++) ++ av_log(log_ctx, level, "%02"PRIx8, md5[i]); ++} ++ ++static int verify_md5(HEVCRpiContext *s, AVFrame *frame) ++{ ++ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); ++ int pixel_shift; ++ int i, j; ++ ++ if (!desc) ++ return AVERROR(EINVAL); ++ ++ pixel_shift = desc->comp[0].depth > 8; ++ ++ av_log(s->avctx, AV_LOG_DEBUG, "Verifying checksum for frame with POC %d: ", ++ s->poc); ++ ++ /* the checksums are LE, so we have to byteswap for >8bpp formats ++ * on BE arches */ ++#if HAVE_BIGENDIAN ++ if (pixel_shift && !s->checksum_buf) { ++ av_fast_malloc(&s->checksum_buf, &s->checksum_buf_size, ++ FFMAX3(frame->linesize[0], frame->linesize[1], ++ frame->linesize[2])); ++ if (!s->checksum_buf) ++ return AVERROR(ENOMEM); ++ } ++#endif ++ ++ for (i = 0; frame->data[i]; i++) { ++ int width = s->avctx->coded_width; ++ int height = s->avctx->coded_height; ++ int w = (i == 1 || i == 2) ? (width >> desc->log2_chroma_w) : width; ++ int h = (i == 1 || i == 2) ? 
(height >> desc->log2_chroma_h) : height; ++ uint8_t md5[16]; ++ ++ av_md5_init(s->sei.picture_hash.md5_ctx); ++ for (j = 0; j < h; j++) { ++ const uint8_t *src = frame->data[i] + j * frame_stride1(frame, 1); ++#if HAVE_BIGENDIAN ++ if (pixel_shift) { ++ s->bdsp.bswap16_buf((uint16_t *) s->checksum_buf, ++ (const uint16_t *) src, w); ++ src = s->checksum_buf; ++ } ++#endif ++ av_md5_update(s->sei.picture_hash.md5_ctx, src, w << pixel_shift); ++ } ++ av_md5_final(s->sei.picture_hash.md5_ctx, md5); ++ ++ if (!memcmp(md5, s->sei.picture_hash.md5[i], 16)) { ++ av_log (s->avctx, AV_LOG_DEBUG, "plane %d - correct ", i); ++ print_md5(s->avctx, AV_LOG_DEBUG, md5); ++ av_log (s->avctx, AV_LOG_DEBUG, "; "); ++ } else { ++ av_log (s->avctx, AV_LOG_ERROR, "mismatching checksum of plane %d - ", i); ++ print_md5(s->avctx, AV_LOG_ERROR, md5); ++ av_log (s->avctx, AV_LOG_ERROR, " != "); ++ print_md5(s->avctx, AV_LOG_ERROR, s->sei.picture_hash.md5[i]); ++ av_log (s->avctx, AV_LOG_ERROR, "\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ ++ av_log(s->avctx, AV_LOG_DEBUG, "\n"); ++ ++ return 0; ++} ++ ++static int all_sps_supported(const HEVCRpiContext * const s) ++{ ++ for (unsigned int i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) { ++ if (s->ps.sps_list[i] != NULL) ++ { ++ const HEVCRpiSPS * const sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data; ++ if (!is_sps_supported(sps)) ++ return 0; ++ } ++ } ++ return 1; ++} ++ ++static int hevc_rpi_decode_extradata(HEVCRpiContext *s, uint8_t *buf, int length, int first) ++{ ++ int ret, i; ++ ++ ret = ff_hevc_rpi_decode_extradata(buf, length, &s->ps, &s->sei, &s->is_nalff, ++ &s->nal_length_size, s->avctx->err_recognition, ++ s->apply_defdispwin, s->avctx); ++ if (ret < 0) ++ return ret; ++ ++ /* export stream parameters from the first SPS */ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) { ++ if (first && s->ps.sps_list[i]) { ++ const HEVCRpiSPS *sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data; ++ export_stream_params(s->avctx, &s->ps, sps); ++ break; ++ } ++ } ++ ++ return 0; ++} ++ ++static int hevc_rpi_decode_frame(AVCodecContext *avctx, void *data, int *got_output, ++ AVPacket *avpkt) ++{ ++ int ret; ++ int new_extradata_size; ++ uint8_t *new_extradata; ++ HEVCRpiContext *s = avctx->priv_data; ++ ++ if (!avpkt->size) { ++ ret = ff_hevc_rpi_output_frame(s, data, 1); ++ if (ret < 0) ++ return ret; ++ ++ *got_output = ret; ++ return 0; ++ } ++ ++ new_extradata = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA, ++ &new_extradata_size); ++ if (new_extradata && new_extradata_size > 0) { ++ ret = hevc_rpi_decode_extradata(s, new_extradata, new_extradata_size, 0); ++ if (ret < 0) ++ return ret; ++ } ++ ++ s->ref = NULL; ++ ret = decode_nal_units(s, avpkt->data, avpkt->size); ++ if (ret < 0) ++ return ret; ++ ++ /* verify the SEI checksum */ ++ if (avctx->err_recognition & AV_EF_CRCCHECK && s->is_decoded && ++ s->sei.picture_hash.is_md5) { ++ ret = verify_md5(s, s->ref->frame); ++ if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE) { ++ ff_hevc_rpi_unref_frame(s, s->ref, ~0); ++ return ret; ++ } ++ } ++ s->sei.picture_hash.is_md5 = 0; ++ ++ if (s->is_decoded) { ++ av_log(avctx, AV_LOG_DEBUG, "Decoded frame with POC %d.\n", s->poc); ++ s->is_decoded = 0; ++ } ++ ++ if (s->output_frame->buf[0]) { ++ av_frame_move_ref(data, s->output_frame); ++ *got_output = 1; ++ } ++ ++ return avpkt->size; ++} ++ ++static int hevc_ref_frame(HEVCRpiContext *s, HEVCFrame *dst, HEVCFrame *src) ++{ ++ int ret; ++ ++ ret = ff_thread_ref_frame(&dst->tf, &src->tf); ++ 
if (ret < 0) ++ return ret; ++ ++ dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf); ++ if (!dst->tab_mvf_buf) ++ goto fail; ++ dst->tab_mvf = src->tab_mvf; ++ ++ dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf); ++ if (!dst->rpl_tab_buf) ++ goto fail; ++ dst->rpl_tab = src->rpl_tab; ++ ++ dst->rpl_buf = av_buffer_ref(src->rpl_buf); ++ if (!dst->rpl_buf) ++ goto fail; ++ ++ dst->poc = src->poc; ++ dst->ctb_count = src->ctb_count; ++ dst->flags = src->flags; ++ dst->sequence = src->sequence; ++ return 0; ++ ++fail: ++ ff_hevc_rpi_unref_frame(s, dst, ~0); ++ return AVERROR(ENOMEM); ++} ++ ++ ++static av_cold int hevc_decode_free(AVCodecContext *avctx) ++{ ++ HEVCRpiContext * const s = avctx->priv_data; ++ int i; ++ ++ pic_arrays_free(s); ++ ++ av_freep(&s->sei.picture_hash.md5_ctx); ++ ++ av_freep(&s->cabac_state); ++ ++#if RPI_EXTRA_BIT_THREADS ++ bit_threads_kill(s); ++#endif ++ ++ hevc_exit_worker(s); ++ vpu_qpu_term(); ++ for (i = 0; i != 2; ++i) { ++ ff_hevc_rpi_progress_kill_state(s->progress_states + i); ++ } ++ job_lc_kill(s->HEVClc); ++ av_rpi_zc_uninit(avctx); ++ ++ av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0] ++ av_freep(&s->sao_pixel_buffer_v[0]); ++ av_frame_free(&s->output_frame); ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0); ++ av_frame_free(&s->DPB[i].frame); ++ } ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) ++ av_buffer_unref(&s->ps.vps_list[i]); ++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) ++ av_buffer_unref(&s->ps.sps_list[i]); ++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) ++ av_buffer_unref(&s->ps.pps_list[i]); ++ s->ps.sps = NULL; ++ s->ps.pps = NULL; ++ s->ps.vps = NULL; ++ ++ av_freep(&s->sh.entry_point_offset); ++ av_freep(&s->sh.offset); ++ av_freep(&s->sh.size); ++ ++ for (i = 1; i < s->threads_number; i++) { ++ if (s->sList[i] != NULL) { ++ av_freep(&s->sList[i]); ++ } ++ } ++ ++ // Free separately from sLists as used that way by RPI WPP ++ for (i = 0; i < MAX_NB_THREADS && s->HEVClcList[i] != NULL; ++i) { ++ av_freep(s->HEVClcList + i); ++ } ++ s->HEVClc = NULL; // Allocated as part of HEVClcList ++ ++ ff_h2645_packet_uninit(&s->pkt); ++ ++ return 0; ++} ++ ++ ++static av_cold int hevc_init_context(AVCodecContext *avctx) ++{ ++ HEVCRpiContext *s = avctx->priv_data; ++ int i; ++ ++ s->avctx = avctx; ++ ++ s->HEVClc = av_mallocz(sizeof(HEVCRpiLocalContext)); ++ if (!s->HEVClc) ++ goto fail; ++ s->HEVClcList[0] = s->HEVClc; ++ s->sList[0] = s; ++ ++ // Whilst FFmpegs init fn is only called once the close fn is called as ++ // many times as we have threads (init_thread_copy is called for the ++ // threads). 
++static av_cold int hevc_init_context(AVCodecContext *avctx)
++{
++    HEVCRpiContext *s = avctx->priv_data;
++    int i;
++
++    s->avctx = avctx;
++
++    s->HEVClc = av_mallocz(sizeof(HEVCRpiLocalContext));
++    if (!s->HEVClc)
++        goto fail;
++    s->HEVClcList[0] = s->HEVClc;
++    s->sList[0] = s;
++
++    // Whilst FFmpeg's init fn is only called once, the close fn is called
++    // as many times as we have threads (init_thread_copy is called for the
++    // threads). So to match init & term, put the init here where it will be
++    // called by both init & copy.
++    av_rpi_zc_init(avctx);
++
++    if (vpu_qpu_init() != 0)
++        goto fail;
++
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++    {
++        static const uint32_t dframe[1] = {0x80808080};
++        s->qpu_dummy_frame_emu = (const uint8_t *)dframe;
++    }
++#endif
++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
++    s->qpu_dummy_frame_qpu = qpu_fn(mc_start);  // Use our code as a dummy frame
++#endif
++
++    bt_lc_init(s, s->HEVClc, 0);
++    job_lc_init(s->HEVClc);
++
++    for (i = 0; i != 2; ++i) {
++        ff_hevc_rpi_progress_init_state(s->progress_states + i);
++    }
++
++    s->cabac_state = av_malloc(HEVC_CONTEXTS);
++    if (!s->cabac_state)
++        goto fail;
++
++    s->output_frame = av_frame_alloc();
++    if (!s->output_frame)
++        goto fail;
++
++    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++        s->DPB[i].frame = av_frame_alloc();
++        if (!s->DPB[i].frame)
++            goto fail;
++        s->DPB[i].tf.f = s->DPB[i].frame;
++        s->DPB[i].dpb_no = i;
++    }
++
++    s->max_ra = INT_MAX;
++
++    s->sei.picture_hash.md5_ctx = av_md5_alloc();
++    if (!s->sei.picture_hash.md5_ctx)
++        goto fail;
++
++    ff_bswapdsp_init(&s->bdsp);
++
++    s->context_initialized = 1;
++    s->eos = 0;
++
++    ff_hevc_rpi_reset_sei(&s->sei);
++
++    return 0;
++
++fail:
++    av_log(s, AV_LOG_ERROR, "%s: Failed\n", __func__);
++    hevc_decode_free(avctx);
++    return AVERROR(ENOMEM);
++}
++
++static int hevc_update_thread_context(AVCodecContext *dst,
++                                      const AVCodecContext *src)
++{
++    HEVCRpiContext *s  = dst->priv_data;
++    HEVCRpiContext *s0 = src->priv_data;
++    int i, ret;
++
++    if (!s->context_initialized) {
++        ret = hevc_init_context(dst);
++        if (ret < 0)
++            return ret;
++    }
++
++    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++        ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
++        if (s0->DPB[i].frame->buf[0]) {
++            ret = hevc_ref_frame(s, &s->DPB[i], &s0->DPB[i]);
++            if (ret < 0)
++                return ret;
++        }
++    }
++
++    if (s->ps.sps != s0->ps.sps)
++        s->ps.sps = NULL;
++    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) {
++        av_buffer_unref(&s->ps.vps_list[i]);
++        if (s0->ps.vps_list[i]) {
++            s->ps.vps_list[i] = av_buffer_ref(s0->ps.vps_list[i]);
++            if (!s->ps.vps_list[i])
++                return AVERROR(ENOMEM);
++        }
++    }
++
++    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
++        av_buffer_unref(&s->ps.sps_list[i]);
++        if (s0->ps.sps_list[i]) {
++            s->ps.sps_list[i] = av_buffer_ref(s0->ps.sps_list[i]);
++            if (!s->ps.sps_list[i])
++                return AVERROR(ENOMEM);
++        }
++    }
++
++    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) {
++        av_buffer_unref(&s->ps.pps_list[i]);
++        if (s0->ps.pps_list[i]) {
++            s->ps.pps_list[i] = av_buffer_ref(s0->ps.pps_list[i]);
++            if (!s->ps.pps_list[i])
++                return AVERROR(ENOMEM);
++        }
++    }
++
++    if (s->ps.sps != s0->ps.sps)
++        if ((ret = set_sps(s, s0->ps.sps, src->pix_fmt)) < 0)
++            return ret;
++
++    s->seq_decode = s0->seq_decode;
++    s->seq_output = s0->seq_output;
++    s->pocTid0    = s0->pocTid0;
++    s->max_ra     = s0->max_ra;
++    s->eos        = s0->eos;
++    s->no_rasl_output_flag = s0->no_rasl_output_flag;
++
++    s->is_nalff        = s0->is_nalff;
++    s->nal_length_size = s0->nal_length_size;
++
++    s->threads_number = s0->threads_number;
++    s->threads_type   = s0->threads_type;
++
++    if (s0->eos) {
++        s->seq_decode = (s->seq_decode + 1) & 0xff;
++        s->max_ra = INT_MAX;
++    }
++
++    s->sei.frame_packing        = s0->sei.frame_packing;
++    s->sei.display_orientation  = s0->sei.display_orientation;
++    s->sei.mastering_display    = s0->sei.mastering_display;
++    s->sei.content_light        = s0->sei.content_light;
++    s->sei.alternative_transfer = s0->sei.alternative_transfer;
++
++    // * We do this here as it allows us to easily locate our parent's
++    //   global job pool, but there really should be a less nasty way
++    if (s->jbc == NULL)
++    {
++        av_assert0((s->jbc = rpi_job_ctl_new(s0->jbc->jbg)) != NULL);
++        hevc_init_worker(s);
++    }
++
++    return 0;
++}
++
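hevc_update_thread_context() above attaches each frame thread to its parent's shared job pool, and hevc_init_worker() (called there and again in hevc_decode_init() below) starts the pass threads that hand jobs along via sem_in/psem_out pairs. A stripped-down sketch of that semaphore hand-off in plain POSIX, using none of the patch's types — the single "job" and the message strings are invented for the demo:

    /* passq.c - build with: gcc passq.c -pthread */
    #include <pthread.h>
    #include <semaphore.h>
    #include <stdio.h>

    static sem_t sem_in, sem_out;

    static void *worker(void *arg)
    {
        sem_wait(&sem_in);            /* block until a job is queued */
        puts("pass: job processed");
        sem_post(&sem_out);           /* hand the job to the next stage / caller */
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        sem_init(&sem_in, 0, 0);
        sem_init(&sem_out, 0, 0);
        pthread_create(&t, NULL, worker, NULL);

        sem_post(&sem_in);            /* submit one job */
        sem_wait(&sem_out);           /* wait for completion */

        pthread_join(t, NULL);
        sem_destroy(&sem_in);
        sem_destroy(&sem_out);
        return 0;
    }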
++static av_cold int hevc_decode_init(AVCodecContext *avctx)
++{
++    HEVCRpiContext *s = avctx->priv_data;
++    int ret;
++
++    avctx->internal->allocate_progress = 1;
++
++    {
++        HEVCRpiJobGlobal * const jbg = jbg_new(FFMAX(avctx->thread_count * 3, 5));
++        if (jbg == NULL)
++        {
++            av_log(s, AV_LOG_ERROR, "%s: Job global init failed\n", __func__);
++            return -1;
++        }
++
++        if ((s->jbc = rpi_job_ctl_new(jbg)) == NULL)
++        {
++            av_log(s, AV_LOG_ERROR, "%s: Job ctl init failed\n", __func__);
++            return -1;
++        }
++    }
++
++    ret = hevc_init_context(avctx);
++    if (ret < 0)
++        return ret;
++
++    hevc_init_worker(s);
++
++    s->enable_parallel_tiles = 0;
++    s->sei.picture_timing.picture_struct = 0;
++    s->eos = 1;
++
++    atomic_init(&s->wpp_err, 0);
++
++    if (avctx->active_thread_type & FF_THREAD_SLICE)
++        s->threads_number = avctx->thread_count;
++    else
++        s->threads_number = 1;
++
++    if (avctx->extradata_size > 0 && avctx->extradata) {
++        ret = hevc_rpi_decode_extradata(s, avctx->extradata, avctx->extradata_size, 1);
++
++        if (ret == 0 && !all_sps_supported(s))
++            ret = AVERROR_DECODER_NOT_FOUND;
++
++        if (ret < 0)
++        {
++            hevc_decode_free(avctx);
++            return ret;
++        }
++    }
++
++    if ((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1)
++        s->threads_type = FF_THREAD_FRAME;
++    else
++        s->threads_type = FF_THREAD_SLICE;
++
++    return 0;
++}
++
++static av_cold int hevc_init_thread_copy(AVCodecContext *avctx)
++{
++    HEVCRpiContext *s = avctx->priv_data;
++    int ret;
++
++    memset(s, 0, sizeof(*s));
++
++    ret = hevc_init_context(avctx);
++    if (ret < 0)
++        return ret;
++
++    return 0;
++}
++
++static void hevc_decode_flush(AVCodecContext *avctx)
++{
++    HEVCRpiContext *s = avctx->priv_data;
++    ff_hevc_rpi_flush_dpb(s);
++    s->max_ra = INT_MAX;
++    s->eos = 1;
++}
++
++#define OFFSET(x) offsetof(HEVCRpiContext, x)
++#define PAR (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
++
++
++static const AVOption options[] = {
++    { "apply_defdispwin", "Apply default display window from VUI", OFFSET(apply_defdispwin),
++        AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
++    { "strict-displaywin", "strictly apply default display window size", OFFSET(apply_defdispwin),
++        AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
++    { NULL },
++};
++
++static const AVClass hevc_rpi_decoder_class = {
++    .class_name = "HEVC RPI decoder",
++    .item_name  = av_default_item_name,
++    .option     = options,
++    .version    = LIBAVUTIL_VERSION_INT,
++};
++
++static const enum AVPixelFormat hevc_rpi_pix_fmts[] = {
++    AV_PIX_FMT_SAND128,
++    AV_PIX_FMT_SAND64_10,
++    AV_PIX_FMT_NONE
++};
++
++AVCodec ff_hevc_rpi_decoder = {
++    .name                  = "hevc_rpi",
++    .long_name             = NULL_IF_CONFIG_SMALL("HEVC (rpi)"),
++    .type                  = AVMEDIA_TYPE_VIDEO,
++    .id                    = AV_CODEC_ID_HEVC,
++    .priv_data_size        = sizeof(HEVCRpiContext),
++    .priv_class            = &hevc_rpi_decoder_class,
++    .init                  = hevc_decode_init,
++    .close                 = hevc_decode_free,
++    .decode                = hevc_rpi_decode_frame,
++    .flush                 = hevc_decode_flush,
++    .update_thread_context = hevc_update_thread_context,
++    .init_thread_copy      = hevc_init_thread_copy,
++    .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
++#if 0
++    // Debugging is often easier without threads getting in the way
++                             0,
++#warning H265 threading turned off
++#else
++
// We only have decent optimisation for frame - so only admit to that ++ AV_CODEC_CAP_FRAME_THREADS, ++#endif ++ .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE | FF_CODEC_CAP_EXPORTS_CROPPING, ++ .pix_fmts = hevc_rpi_pix_fmts, ++ .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles), ++}; ++ +diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h +new file mode 100644 +index 0000000000..f61b29e669 +--- /dev/null ++++ b/libavcodec/rpi_hevcdec.h +@@ -0,0 +1,1054 @@ ++/* ++ * HEVC video decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_RPI_HEVCDEC_H ++#define AVCODEC_RPI_HEVCDEC_H ++ ++#include "config.h" ++ ++#include ++ ++#include "libavutil/buffer.h" ++ ++#include "avcodec.h" ++#include "bswapdsp.h" ++#include "cabac.h" ++#include "get_bits.h" ++#include "rpi_hevcpred.h" ++#include "h2645_parse.h" ++#include "hevc.h" ++#include "rpi_hevc_ps.h" ++#include "rpi_hevc_sei.h" ++#include "rpi_hevcdsp.h" ++#include "internal.h" ++#include "thread.h" ++#include "videodsp.h" ++ ++#define MAX_NB_THREADS 16 ++#define SHIFT_CTB_WPP 2 ++ ++//TODO: check if this is really the maximum ++#define MAX_TRANSFORM_DEPTH 5 ++ ++#define MAX_TB_SIZE 32 ++#define MAX_QP 51 ++#define DEFAULT_INTRA_TC_OFFSET 2 ++ ++#define HEVC_CONTEXTS 199 ++ ++#define MRG_MAX_NUM_CANDS 5 ++ ++#define HEVC_MAX_CTB_SIZE (1 << HEVC_MAX_LOG2_CTB_SIZE) // 64 ++ ++// Size of DPB array ++#define HEVC_DPB_ELS 32 ++ ++#define L0 0 ++#define L1 1 ++ ++#define EPEL_EXTRA_BEFORE 1 ++#define EPEL_EXTRA_AFTER 2 ++#define EPEL_EXTRA 3 ++#define QPEL_EXTRA_BEFORE 3 ++#define QPEL_EXTRA_AFTER 4 ++#define QPEL_EXTRA 7 ++ ++#define EDGE_EMU_BUFFER_STRIDE 80 ++ ++#include ++#include "rpi_qpu.h" ++ ++// Max jobs per frame thread. Actual usage will be limited by the size ++// of the global job pool ++// ?? 
Limits ++#define RPI_MAX_JOBS 8 ++ ++// This is the number of _extra_ bit threads - we will have ++// RPI_EXTRA_BIT_THREADS+1 threads actually doing the processing ++// ++// 0 is legitimate and will disable our WPP processing ++//#define RPI_EXTRA_BIT_THREADS 0 ++#define RPI_EXTRA_BIT_THREADS 2 ++ ++// Number of separate threads/passes in worker ++// 2 and 3 are the currently valid numbers ++// At the moment 3 seems fractionally faster ++//#define RPI_PASSES 2 ++#define RPI_PASSES 3 ++ ++// Print out various usage stats ++#define RPI_TSTATS 0 ++ ++// Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs ++// (currently slower than deblocking on the ARM) ++// #define RPI_DEBLOCK_VPU ++ ++#define RPI_VPU_DEBLOCK_CACHED 0 ++ ++// Use ARM emulation of QPU pred ++// These are for debug only as the emulation makes only limited ++// effort to be fast ++#define RPI_QPU_EMU_Y 0 ++#define RPI_QPU_EMU_C 0 ++ ++// Max width & height we are prepared to consider ++// Sand frame shape calc becomes confused with large frames ++// Some buffer alloc also depends on this ++#define HEVC_RPI_MAX_WIDTH 2048 ++#define HEVC_RPI_MAX_HEIGHT 1088 ++ ++ ++/** ++ * Value of the luma sample at position (x, y) in the 2D array tab. ++ */ ++#define SAMPLE(tab, x, y) ((tab)[(y) * s->sps->width + (x)]) ++#define SAMPLE_CTB(tab, x, y) ((tab)[(y) * min_cb_width + (x)]) ++ ++#define IS_IDR(s) ((s)->nal_unit_type == HEVC_NAL_IDR_W_RADL || (s)->nal_unit_type == HEVC_NAL_IDR_N_LP) ++#define IS_BLA(s) ((s)->nal_unit_type == HEVC_NAL_BLA_W_RADL || (s)->nal_unit_type == HEVC_NAL_BLA_W_LP || \ ++ (s)->nal_unit_type == HEVC_NAL_BLA_N_LP) ++#define IS_IRAP(s) ((s)->nal_unit_type >= 16 && (s)->nal_unit_type <= 23) ++ ++enum RPSType { ++ ST_CURR_BEF = 0, ++ ST_CURR_AFT, ++ ST_FOLL, ++ LT_CURR, ++ LT_FOLL, ++ NB_RPS_TYPE, ++}; ++ ++enum SyntaxElement { ++ SAO_MERGE_FLAG = 0, ++ SAO_TYPE_IDX, ++ SAO_EO_CLASS, ++ SAO_BAND_POSITION, ++ SAO_OFFSET_ABS, ++ SAO_OFFSET_SIGN, ++ END_OF_SLICE_FLAG, ++ SPLIT_CODING_UNIT_FLAG, ++ CU_TRANSQUANT_BYPASS_FLAG, ++ SKIP_FLAG, ++ CU_QP_DELTA, ++ PRED_MODE_FLAG, ++ PART_MODE, ++ PCM_FLAG, ++ PREV_INTRA_LUMA_PRED_FLAG, ++ MPM_IDX, ++ REM_INTRA_LUMA_PRED_MODE, ++ INTRA_CHROMA_PRED_MODE, ++ MERGE_FLAG, ++ MERGE_IDX, ++ INTER_PRED_IDC, ++ REF_IDX_L0, ++ REF_IDX_L1, ++ ABS_MVD_GREATER0_FLAG, ++ ABS_MVD_GREATER1_FLAG, ++ ABS_MVD_MINUS2, ++ MVD_SIGN_FLAG, ++ MVP_LX_FLAG, ++ NO_RESIDUAL_DATA_FLAG, ++ SPLIT_TRANSFORM_FLAG, ++ CBF_LUMA, ++ CBF_CB_CR, ++ TRANSFORM_SKIP_FLAG, ++ EXPLICIT_RDPCM_FLAG, ++ EXPLICIT_RDPCM_DIR_FLAG, ++ LAST_SIGNIFICANT_COEFF_X_PREFIX, ++ LAST_SIGNIFICANT_COEFF_Y_PREFIX, ++ LAST_SIGNIFICANT_COEFF_X_SUFFIX, ++ LAST_SIGNIFICANT_COEFF_Y_SUFFIX, ++ SIGNIFICANT_COEFF_GROUP_FLAG, ++ SIGNIFICANT_COEFF_FLAG, ++ COEFF_ABS_LEVEL_GREATER1_FLAG, ++ COEFF_ABS_LEVEL_GREATER2_FLAG, ++ COEFF_ABS_LEVEL_REMAINING, ++ COEFF_SIGN_FLAG, ++ LOG2_RES_SCALE_ABS, ++ RES_SCALE_SIGN_FLAG, ++ CU_CHROMA_QP_OFFSET_FLAG, ++ CU_CHROMA_QP_OFFSET_IDX, ++}; ++ ++enum PartMode { ++ PART_2Nx2N = 0, ++ PART_2NxN = 1, ++ PART_Nx2N = 2, ++ PART_NxN = 3, ++ PART_2NxnU = 4, ++ PART_2NxnD = 5, ++ PART_nLx2N = 6, ++ PART_nRx2N = 7, ++}; ++ ++enum PredMode { ++ MODE_INTER = 0, ++ MODE_INTRA, ++ MODE_SKIP, ++}; ++ ++enum InterPredIdc { ++ PRED_L0 = 0, ++ PRED_L1, ++ PRED_BI, ++}; ++ ++enum PredFlag { ++ PF_INTRA = 0, ++ PF_L0, ++ PF_L1, ++ PF_BI, ++}; ++ ++enum IntraPredMode { ++ INTRA_PLANAR = 0, ++ INTRA_DC, ++ INTRA_ANGULAR_2, ++ INTRA_ANGULAR_3, ++ INTRA_ANGULAR_4, ++ INTRA_ANGULAR_5, ++ INTRA_ANGULAR_6, ++ INTRA_ANGULAR_7, ++ 
INTRA_ANGULAR_8, ++ INTRA_ANGULAR_9, ++ INTRA_ANGULAR_10, ++ INTRA_ANGULAR_11, ++ INTRA_ANGULAR_12, ++ INTRA_ANGULAR_13, ++ INTRA_ANGULAR_14, ++ INTRA_ANGULAR_15, ++ INTRA_ANGULAR_16, ++ INTRA_ANGULAR_17, ++ INTRA_ANGULAR_18, ++ INTRA_ANGULAR_19, ++ INTRA_ANGULAR_20, ++ INTRA_ANGULAR_21, ++ INTRA_ANGULAR_22, ++ INTRA_ANGULAR_23, ++ INTRA_ANGULAR_24, ++ INTRA_ANGULAR_25, ++ INTRA_ANGULAR_26, ++ INTRA_ANGULAR_27, ++ INTRA_ANGULAR_28, ++ INTRA_ANGULAR_29, ++ INTRA_ANGULAR_30, ++ INTRA_ANGULAR_31, ++ INTRA_ANGULAR_32, ++ INTRA_ANGULAR_33, ++ INTRA_ANGULAR_34, ++}; ++ ++enum SAOType { ++ SAO_NOT_APPLIED = 0, ++ SAO_BAND, ++ SAO_EDGE, ++ SAO_APPLIED ++}; ++ ++enum SAOEOClass { ++ SAO_EO_HORIZ = 0, ++ SAO_EO_VERT, ++ SAO_EO_135D, ++ SAO_EO_45D, ++}; ++ ++enum ScanType { ++ SCAN_DIAG = 0, ++ SCAN_HORIZ, ++ SCAN_VERT, ++}; ++ ++typedef struct RefPicList { ++ struct HEVCFrame *ref[HEVC_MAX_REFS]; ++ int list[HEVC_MAX_REFS]; ++ int isLongTerm[HEVC_MAX_REFS]; ++ int nb_refs; ++} RefPicList; ++ ++typedef struct RefPicListTab { ++ RefPicList refPicList[2]; ++} RefPicListTab; ++ ++typedef struct CodingUnit { ++ int x; ++ int y; ++ ++ enum PredMode pred_mode; ///< PredMode ++ enum PartMode part_mode; ///< PartMode ++ ++ // Inferred parameters ++ uint8_t intra_split_flag; ///< IntraSplitFlag ++ uint8_t max_trafo_depth; ///< MaxTrafoDepth ++ uint8_t cu_transquant_bypass_flag; ++} CodingUnit; ++ ++typedef struct NeighbourAvailable { ++ int cand_bottom_left; ++ int cand_left; ++ int cand_up; ++ int cand_up_left; ++ int cand_up_right; ++ int cand_up_right_sap; ++} NeighbourAvailable; ++ ++typedef struct PredictionUnit { ++ int mpm_idx; ++ int rem_intra_luma_pred_mode; ++ uint8_t intra_pred_mode[4]; ++ Mv mvd; ++ uint8_t merge_flag; ++ uint8_t intra_pred_mode_c[4]; ++ uint8_t chroma_mode_c[4]; ++} PredictionUnit; ++ ++typedef struct TransformUnit { ++ int cu_qp_delta; ++ ++ int res_scale_val; ++ ++ // Inferred parameters; ++ int intra_pred_mode; ++ int intra_pred_mode_c; ++ int chroma_mode_c; ++ uint8_t is_cu_qp_delta_coded; ++ uint8_t is_cu_chroma_qp_offset_coded; ++ int8_t cu_qp_offset_cb; ++ int8_t cu_qp_offset_cr; ++ uint8_t cross_pf; ++} TransformUnit; ++ ++typedef struct DBParams { ++ int8_t beta_offset; // -12 to +12 ++ int8_t tc_offset; // -12 to +12 ++} DBParams; ++ ++#define HEVC_FRAME_FLAG_OUTPUT (1 << 0) ++#define HEVC_FRAME_FLAG_SHORT_REF (1 << 1) ++#define HEVC_FRAME_FLAG_LONG_REF (1 << 2) ++#define HEVC_FRAME_FLAG_BUMPING (1 << 3) ++ ++struct HEVCRpiJob; ++ ++typedef struct HEVCFrame { ++ AVFrame *frame; ++ ThreadFrame tf; ++ MvField *tab_mvf; ++ RefPicList *refPicList; ++ RefPicListTab **rpl_tab; ++ int ctb_count; ++ int poc; ++ struct HEVCFrame *collocated_ref; ++ ++ AVBufferRef *tab_mvf_buf; ++ AVBufferRef *rpl_tab_buf; ++ AVBufferRef *rpl_buf; ++ ++ /** ++ * A sequence counter, so that old frames are output first ++ * after a POC reset ++ */ ++ uint16_t sequence; ++ ++ /** ++ * A combination of HEVC_FRAME_FLAG_* ++ */ ++ uint8_t flags; ++ ++ // Entry no in DPB - can be used as a small unique ++ // frame identifier (within the current thread) ++ uint8_t dpb_no; ++} HEVCFrame; ++ ++typedef struct HEVCRpiLocalContextIntra { ++ TransformUnit tu; ++ NeighbourAvailable na; ++} HEVCRpiLocalContextIntra; ++ ++typedef struct HEVCRpiLocalContext { ++ TransformUnit tu; // Moved to start to match HEVCRpiLocalContextIntra (yuk!) ++ NeighbourAvailable na; ++ ++ // Vars that allow us to locate everything from just an lc ++ struct HEVCRpiContext * context; // ??? make const ??? 
++ unsigned int lc_n; // lc list el no ++ ++ // Job wait links ++ struct HEVCRpiLocalContext * jw_next; ++ struct HEVCRpiLocalContext * jw_prev; ++ struct HEVCRpiLocalContext * ljw_next; ++ struct HEVCRpiLocalContext * ljw_prev; ++ struct HEVCRpiJob * volatile jw_job; ++ sem_t jw_sem; ++ ++ // ?? Wrap in structure ?? ++ sem_t bt_sem_in; ++ sem_t * bt_psem_out; ++ volatile int bt_terminate; ++ unsigned int ts; ++ unsigned int bt_last_line; // Last line in this bit_thread chunk ++ unsigned int bt_line_no; ++ unsigned int bt_line_width; ++ unsigned int bt_line_inc; ++ ++ struct HEVCRpiJob * jb0; ++ char unit_done; // Set once we have dealt with this slice ++// char max_done; ++ char bt_is_tile; ++ char last_progress_good; ++ ++ char wpp_init; // WPP/Tile bitstream init has happened ++ ++ uint8_t cabac_state[HEVC_CONTEXTS]; ++ ++ uint8_t stat_coeff[4]; ++ ++// uint8_t first_qp_group; ++ ++ GetBitContext gb; ++ CABACContext cc; ++ ++ int8_t qp_y; ++ int8_t curr_qp_y; ++ ++ int qPy_pred; ++ ++ uint8_t ctb_left_flag; ++ uint8_t ctb_up_flag; ++ uint8_t ctb_up_right_flag; ++ uint8_t ctb_up_left_flag; ++ int end_of_tiles_x; ++ int end_of_tiles_y; ++ /* +7 is for subpixel interpolation, *2 for high bit depths */ ++ DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; ++ /* The extended size between the new edge emu buffer is abused by SAO */ ++ DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; ++ DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]); ++ ++ int ct_depth; ++ CodingUnit cu; ++ PredictionUnit pu; ++ ++#define BOUNDARY_LEFT_SLICE (1 << 0) ++#define BOUNDARY_LEFT_TILE (1 << 1) ++#define BOUNDARY_UPPER_SLICE (1 << 2) ++#define BOUNDARY_UPPER_TILE (1 << 3) ++ /* properties of the boundary of the current CTB for the purposes ++ * of the deblocking filter */ ++ int boundary_flags; ++} HEVCRpiLocalContext; ++ ++ ++// Each block can have an intra prediction and an add_residual command ++// noof-cmds(2) * max-ctu height(64) / min-transform(4) * planes(3) * MAX_WIDTH ++ ++// Sand only has 2 planes (Y/C) ++#define RPI_MAX_PRED_CMDS (2*(HEVC_MAX_CTB_SIZE/4)*2*(HEVC_RPI_MAX_WIDTH/4)) ++ ++#ifdef RPI_DEBLOCK_VPU ++// Worst case is 16x16 CTUs ++#define RPI_MAX_DEBLOCK_CMDS (HEVC_RPI_MAX_WIDTH*4/16) ++#endif ++ ++// Command for intra prediction and transform_add of predictions to coefficients ++enum rpi_pred_cmd_e ++{ ++ RPI_PRED_ADD_RESIDUAL, ++ RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_TRANSFORM_ADD + c_idx ++ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx ++ RPI_PRED_ADD_RESIDUAL_C, // Merged U+V ++ RPI_PRED_ADD_DC, ++ RPI_PRED_ADD_DC_U, // Both U & V are effectively C ++ RPI_PRED_ADD_DC_V, ++ RPI_PRED_INTRA, ++ RPI_PRED_I_PCM, ++ RPI_PRED_CMD_MAX ++}; ++ ++typedef struct HEVCPredCmd { ++ uint8_t type; ++ uint8_t size; // log2 "size" used by all variants ++ uint8_t na; // i_pred - but left here as they pack well ++ uint8_t c_idx; // i_pred ++ union { ++ struct { // TRANSFORM_ADD ++ uint8_t * dst; ++ const int16_t * buf; ++ uint16_t stride; // Should be good enough for all pic fmts we use ++ int16_t dc; ++ } ta; ++ struct { ++ uint8_t * dst; ++ uint32_t stride; ++ int dc; ++ } dc; ++ struct { // INTRA ++ uint16_t x; ++ uint16_t y; ++ enum IntraPredMode mode; ++ } i_pred; ++ struct { // I_PCM ++ uint16_t x; ++ uint16_t y; ++ const void * src; ++ uint32_t src_len; ++ } i_pcm; ++ }; ++} HEVCPredCmd; ++ ++union qpu_mc_pred_cmd_s; ++struct qpu_mc_pred_y_p_s; ++struct qpu_mc_src_s; ++ ++typedef struct 
HEVCRpiInterPredQ ++{ ++ union qpu_mc_pred_cmd_u *qpu_mc_base; ++ union qpu_mc_pred_cmd_u *qpu_mc_curr; ++ struct qpu_mc_src_s *last_l0; ++ struct qpu_mc_src_s *last_l1; ++ unsigned int load; ++ uint32_t code_setup; ++ uint32_t code_sync; ++ uint32_t code_exit; ++} HEVCRpiInterPredQ; ++ ++typedef struct HEVCRpiInterPredEnv ++{ ++ HEVCRpiInterPredQ * q; ++ uint8_t n; // Number of Qs ++ uint8_t n_grp; // Number of Q in a group ++ uint8_t curr; // Current Q number (0..n-1) ++ uint8_t used; // 0 if nothing in any Q, 1 otherwise ++ uint8_t used_grp; // 0 if nothing in any Q in the current group ++ unsigned int max_fill; ++ unsigned int min_gap; ++ GPU_MEM_PTR_T gptr; ++} HEVCRpiInterPredEnv; ++ ++typedef struct HEVCRpiIntraPredEnv { ++ unsigned int n; // Number of commands ++ HEVCPredCmd * cmds; ++} HEVCRpiIntraPredEnv; ++ ++typedef struct HEVCRpiCoeffEnv { ++ unsigned int n; ++ int16_t * buf; ++} HEVCRpiCoeffEnv; ++ ++typedef struct HEVCRpiCoeffsEnv { ++ HEVCRpiCoeffEnv s[4]; ++ GPU_MEM_PTR_T gptr; ++ void * mptr; ++} HEVCRpiCoeffsEnv; ++ ++typedef struct HEVCRpiFrameProgressWait { ++ int req; ++ struct HEVCRpiFrameProgressWait * next; ++ sem_t sem; ++} HEVCRpiFrameProgressWait; ++ ++typedef struct HEVCRpiFrameProgressState { ++ struct HEVCRpiFrameProgressWait * first; ++ struct HEVCRpiFrameProgressWait * last; ++ pthread_mutex_t lock; ++} HEVCRpiFrameProgressState; ++ ++typedef struct RpiBlk ++{ ++ unsigned int x; ++ unsigned int y; ++ unsigned int w; ++ unsigned int h; ++} RpiBlk; ++ ++typedef struct HEVCRpiJob { ++ struct HEVCRpiJob * next; // Free chain ++ struct HEVCRpiJobCtl * jbc_local; ++ const HEVCRpiSPS * sps; // sps used to set up this job ++ ++ int waited; ++ int ctu_ts_first; ++ int ctu_ts_last; ++ RpiBlk bounds; // Bounding box of job ++ ++ struct qpu_mc_pred_y_p_s * last_y8_p; ++ struct qpu_mc_src_s * last_y8_l1; ++ ++ HEVCRpiInterPredEnv chroma_ip; ++ HEVCRpiInterPredEnv luma_ip; ++ int16_t progress_req[HEVC_DPB_ELS]; // index by dpb_no ++ HEVCRpiIntraPredEnv intra; ++ HEVCRpiCoeffsEnv coeffs; ++ HEVCRpiFrameProgressWait progress_wait; ++} HEVCRpiJob; ++ ++struct HEVCRpiContext; ++ ++typedef void HEVCRpiWorkerFn(struct HEVCRpiContext * const s, HEVCRpiJob * const jb); ++ ++typedef struct HEVCRpiPassQueue ++{ ++// int pending; ++ volatile int terminate; ++ sem_t sem_in; ++ sem_t * psem_out; ++ unsigned int job_n; ++ struct HEVCRpiContext * context; // Context pointer as we get to pass a single "void * this" to the thread ++ HEVCRpiWorkerFn * worker; ++ pthread_t thread; ++ uint8_t pass_n; // Pass number - debug ++ uint8_t started; ++} HEVCRpiPassQueue; ++ ++ ++struct HEVCRpiJobGlobal; ++ ++typedef struct HEVCRpiJobCtl ++{ ++ sem_t sem_out; ++ ++ HEVCRpiJob * volatile jb1; // The job associated with this frame if unallocated - NULL if allocated ++ struct HEVCRpiJobGlobal * jbg; ++ ++ HEVCRpiLocalContext * lcw_head; ++ HEVCRpiLocalContext * lcw_tail; ++ ++ pthread_mutex_t in_lock; ++ int offload_in; ++ ++ HEVCRpiJob *offloadq[RPI_MAX_JOBS]; ++} HEVCRpiJobCtl; ++ ++ ++typedef struct HEVCRpiJobGlobal ++{ ++ intptr_t ref_count; ++ pthread_mutex_t lock; ++ HEVCRpiJob * free1; // Singly linked list of free jobs ++ HEVCRpiLocalContext * wait_head; // Double linked list of lcs waiting for a job ++ HEVCRpiLocalContext * wait_good; // Last good tail ++ HEVCRpiLocalContext * wait_tail; ++ ++} HEVCRpiJobGlobal; ++ ++#define RPI_BIT_THREADS (RPI_EXTRA_BIT_THREADS + 1) ++ ++#if RPI_TSTATS ++typedef struct HEVCRpiStats { ++ int y_pred1_y8_merge; ++ int y_pred1_xy; ++ int y_pred1_x0; ++ int 
y_pred1_y0; ++ int y_pred1_x0y0; ++ int y_pred1_wle8; ++ int y_pred1_wgt8; ++ int y_pred1_hle16; ++ int y_pred1_hgt16; ++ int y_pred2_xy; ++ int y_pred2_x0; ++ int y_pred2_y0; ++ int y_pred2_x0y0; ++ int y_pred2_hle16; ++ int y_pred2_hgt16; ++} HEVCRpiStats; ++#endif ++ ++ ++typedef struct HEVCRpiContext { ++ const AVClass *c; // needed by private avoptions ++ AVCodecContext *avctx; ++ ++ struct HEVCRpiContext *sList[MAX_NB_THREADS]; ++ ++ HEVCRpiLocalContext *HEVClcList[MAX_NB_THREADS]; ++ HEVCRpiLocalContext *HEVClc; ++ ++ uint8_t threads_type; ++ uint8_t threads_number; ++ ++ int width; ++ int height; ++ ++ char used_for_ref; // rpi ++ char offload_recon; ++ ++ HEVCRpiJobCtl * jbc; ++ ++ // Function pointers ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ const uint8_t * qpu_dummy_frame_emu; ++#endif ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++ uint32_t qpu_dummy_frame_qpu; // Not a frame - just a bit of memory ++#endif ++ HEVCRpiQpu qpu; ++ ++ HEVCRpiFrameProgressState progress_states[2]; ++ ++#ifdef RPI_DEBLOCK_VPU ++// With the new scheme of rpi_execute_dblk_cmds ++// it looks like ff_hevc_rpi_hls_filter is no longer called in raster order. ++// This causes trouble if RPI_DEBLOCK_VPU_Q_COUNT > 1 because we prepare setup ++// data for more than one row at a time before triggering the deblocker for one row. ++// This means that the deblock of the final row can use the wrong setup buffer. ++// ++// Also concerned that the thread progress and waiting for job completion is ++// not done correctly with RPI_DEBLOCK_VPU at the end of the frame, or for small CTU sizes. ++#define RPI_DEBLOCK_VPU_Q_COUNT 1 ++ ++ int enable_rpi_deblock; ++ ++ int uv_setup_width; ++ int uv_setup_height; ++ int setup_width; // Number of 16x16 blocks across the image ++ int setup_height; // Number of 16x16 blocks down the image ++ ++ struct dblk_vpu_q_s ++ { ++ GPU_MEM_PTR_T deblock_vpu_gmem; ++ ++ uint8_t (*y_setup_arm)[2][2][2][4]; ++ uint8_t (*y_setup_vc)[2][2][2][4]; ++ ++ uint8_t (*uv_setup_arm)[2][2][2][4]; ++ uint8_t (*uv_setup_vc)[2][2][2][4]; ++ ++ int (*vpu_cmds_arm)[6]; // r0-r5 for each command ++ int vpu_cmds_vc; ++ ++ vpu_qpu_wait_h cmd_id; ++ } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT]; ++ ++ struct dblk_vpu_q_s * dvq; ++ unsigned int dvq_n; ++#endif ++ ++ uint8_t *cabac_state; ++ ++ /** 1 if the independent slice segment header was successfully parsed */ ++ uint8_t slice_initialized; ++ ++ AVFrame *frame; ++ AVFrame *output_frame; ++ uint8_t *sao_pixel_buffer_h[3]; ++ uint8_t *sao_pixel_buffer_v[3]; ++ ++ HEVCRpiParamSets ps; ++ ++ AVBufferPool *tab_mvf_pool; ++ AVBufferPool *rpl_tab_pool; ++ ++ ///< candidate references for the current frame ++ RefPicList rps[5]; ++ ++ SliceHeader sh; ++ SAOParams *sao; ++ DBParams *deblock; ++ enum HEVCNALUnitType nal_unit_type; ++ int temporal_id; ///< temporal_id_plus1 - 1 ++ HEVCFrame *ref; ++ HEVCFrame DPB[HEVC_DPB_ELS]; ++ int poc; ++ int pocTid0; ++ int slice_idx; ///< number of the slice being currently decoded ++ int eos; ///< current packet contains an EOS/EOB NAL ++ int last_eos; ///< last packet contains an EOS/EOB NAL ++ int max_ra; ++ int bs_width; ++ int bs_height; ++ ++ int is_decoded; ++ int no_rasl_output_flag; ++ ++ HEVCPredContext hpc; ++ HEVCDSPContext hevcdsp; ++ VideoDSPContext vdsp; ++ BswapDSPContext bdsp; ++ int8_t *qp_y_tab; ++ uint8_t *horizontal_bs; ++ uint8_t *vertical_bs; ++ ++ int32_t *tab_slice_address; ++ ++ // CU ++ uint8_t *skip_flag; ++ uint8_t *tab_ct_depth; ++ // PU ++ uint8_t *tab_ipm; ++ ++ uint8_t *cbf_luma; // cbf_luma of colocated TU ++ 
uint8_t *is_pcm; ++ ++ // CTB-level flags affecting loop filter operation ++ uint8_t *filter_slice_edges; ++ ++ /** used on BE to byteswap the lines for checksumming */ ++ uint8_t *checksum_buf; ++ int checksum_buf_size; ++ ++ /** ++ * Sequence counters for decoded and output frames, so that old ++ * frames are output first after a POC reset ++ */ ++ uint16_t seq_decode; ++ uint16_t seq_output; ++ ++ int enable_parallel_tiles; ++ atomic_int wpp_err; ++ ++ const uint8_t *data; ++ ++ H2645Packet pkt; ++ // type of the first VCL NAL of the current frame ++ enum HEVCNALUnitType first_nal_type; ++ ++ uint8_t context_initialized; ++ int is_nalff; ///< this flag is != 0 if bitstream is encapsulated ++ ///< as a format defined in 14496-15 ++ int apply_defdispwin; ++ ++ int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4) ++ int nuh_layer_id; ++ ++ HEVCSEIContext sei; ++ ++ // Put structures that allocate non-trivial storage at the end ++ // These are mostly used indirectly so position in the structure doesn't matter ++ HEVCRpiLocalContextIntra HEVClcIntra; ++ HEVCRpiPassQueue passq[RPI_PASSES]; ++#if RPI_EXTRA_BIT_THREADS > 0 ++ int bt_started; ++ // This simply contains thread descriptors - task setup is held elsewhere ++ pthread_t bit_threads[RPI_EXTRA_BIT_THREADS]; ++#endif ++#if RPI_TSTATS ++ HEVCRpiStats tstats; ++#endif ++} HEVCRpiContext; ++ ++/** ++ * Mark all frames in DPB as unused for reference. ++ */ ++void ff_hevc_rpi_clear_refs(HEVCRpiContext *s); ++ ++/** ++ * Drop all frames currently in DPB. ++ */ ++void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s); ++ ++const RefPicList *ff_hevc_rpi_get_ref_list(const HEVCRpiContext * const s, const HEVCFrame * const ref, ++ int x0, int y0); ++ ++/** ++ * Construct the reference picture sets for the current frame. ++ */ ++int ff_hevc_rpi_frame_rps(HEVCRpiContext *s); ++ ++/** ++ * Construct the reference picture list(s) for the current slice. 
++ */ ++int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s); ++ ++void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc, int ctb_addr_ts); ++int ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, int ctb_addr_ts); ++int ff_hevc_rpi_sao_merge_flag_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_end_of_slice_flag_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_cu_transquant_bypass_flag_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_skip_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const int x0, const int y0, const int x_cb, const int y_cb); ++int ff_hevc_rpi_pred_mode_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_split_coding_unit_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int ct_depth, ++ const int x0, const int y0); ++int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size); ++int ff_hevc_rpi_pcm_flag_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_prev_intra_luma_pred_flag_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_merge_flag_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH); ++int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx); ++int ff_hevc_rpi_mvp_lx_flag_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_no_residual_syntax_flag_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_split_transform_flag_decode(HEVCRpiLocalContext * const lc, const int log2_trafo_size); ++int ff_hevc_rpi_cbf_cb_cr_decode(HEVCRpiLocalContext * const lc, const int trafo_depth); ++int ff_hevc_rpi_cbf_luma_decode(HEVCRpiLocalContext * const lc, const int trafo_depth); ++int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx); ++int ff_hevc_rpi_res_scale_sign_flag(HEVCRpiLocalContext *const lc, const int idx); ++ ++/** ++ * Get the number of candidate references for the current frame. ++ */ ++int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s); ++ ++int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc); ++ ++/** ++ * Find next frame in output order and put a reference to it in frame. 
++ * @return 1 if a frame was output, 0 otherwise ++ */ ++int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *frame, int flush); ++ ++void ff_hevc_rpi_bump_frame(HEVCRpiContext *s); ++ ++void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCFrame *frame, int flags); ++ ++void ff_hevc_rpi_set_neighbour_available(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, ++ const int nPbW, const int nPbH); ++void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW, ++ int nPbH, int log2_cb_size, int part_idx, ++ int merge_idx, MvField * const mv); ++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext *lc, int x0, int y0, int nPbW, ++ int nPbH, int log2_cb_size, int part_idx, ++ int merge_idx, MvField * const mv, ++ int mvp_lx_flag, int LX); ++void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase, int log2_cb_size); ++void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, ++ int log2_trafo_size); ++int ff_hevc_rpi_cu_qp_delta_sign_flag(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_cu_qp_delta_abs(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_cu_chroma_qp_offset_flag(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc); ++void ff_hevc_rpi_hls_filter(HEVCRpiContext * const s, const int x, const int y, const int ctb_size); ++void ff_hevc_rpi_hls_filters(HEVCRpiContext *s, int x_ctb, int y_ctb, int ctb_size); ++void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const int x0, const int y0, ++ const int log2_trafo_size, const enum ScanType scan_idx, ++ const int c_idx); ++ ++void ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc); ++ ++ ++extern const uint8_t ff_hevc_rpi_qpel_extra_before[4]; ++extern const uint8_t ff_hevc_rpi_qpel_extra_after[4]; ++extern const uint8_t ff_hevc_rpi_qpel_extra[4]; ++ ++int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n); ++ ++// arm/hevc_misc_neon.S ++// Neon coeff zap fn ++#if HAVE_NEON ++extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2); ++#endif ++ ++void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int val, const int field); ++ ++void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field); ++ ++// All of these expect that s->threads_type == FF_THREAD_FRAME ++ ++static inline void ff_hevc_rpi_progress_wait_mv(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int y) ++{ ++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1); ++} ++ ++static inline void ff_hevc_rpi_progress_signal_mv(HEVCRpiContext * const s, const int y) ++{ ++ if (s->used_for_ref) ++ ff_hevc_rpi_progress_signal_field(s, y, 1); ++} ++ ++static inline void ff_hevc_rpi_progress_wait_recon(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int y) ++{ ++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0); ++} ++ ++static inline void ff_hevc_rpi_progress_signal_recon(HEVCRpiContext * const s, const int y) ++{ ++ if (s->used_for_ref) ++ { ++ ff_hevc_rpi_progress_signal_field(s, y, 0); ++ } ++} ++ ++static inline void 
ff_hevc_rpi_progress_signal_all_done(HEVCRpiContext * const s) ++{ ++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0); ++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1); ++} ++ ++ ++// Set all done - signal nothing (used in missing refs) ++// Works for both rpi & non-rpi ++static inline void ff_hevc_rpi_progress_set_all_done(HEVCFrame * const ref) ++{ ++ if (ref->tf.progress != NULL) ++ { ++ int * const p = (int *)&ref->tf.progress->data; ++ p[0] = INT_MAX; ++ p[1] = INT_MAX; ++ } ++} ++ ++#define HEVC_RPI_420_ONLY 1 ++#define HEVC_RPI_SAND128_ONLY 1 ++ ++static inline unsigned int ctx_hshift(const HEVCRpiContext * const s, const int cidx) ++{ ++#if HEVC_RPI_420_ONLY ++ return cidx == 0 ? 0 : 1; ++#else ++ return s->ps.sps->hshift[cidx]; ++#endif ++} ++ ++static inline unsigned int ctx_vshift(const HEVCRpiContext * const s, const int cidx) ++{ ++#if HEVC_RPI_420_ONLY ++ return cidx == 0 ? 0 : 1; ++#else ++ return s->ps.sps->vshift[cidx]; ++#endif ++} ++ ++static inline int ctx_cfmt(const HEVCRpiContext * const s) ++{ ++#if HEVC_RPI_420_ONLY ++ return 1; ++#else ++ return s->ps.sps->chroma_format_idc; ++#endif ++} ++ ++static inline int frame_stride1(const AVFrame * const frame, const int c_idx) ++{ ++#if HEVC_RPI_SAND128_ONLY ++ return 128; ++#else ++ return frame->linesize[c_idx]; ++#endif ++} ++ ++#if HEVC_RPI_SAND128_ONLY ++// Propagate this decision to later zc includes ++#define RPI_ZC_SAND128_ONLY 1 ++#endif ++ ++#endif /* AVCODEC_RPI_HEVCDEC_H */ +diff --git a/libavcodec/rpi_hevcdsp.c b/libavcodec/rpi_hevcdsp.c +new file mode 100644 +index 0000000000..3e4cfe8d46 +--- /dev/null ++++ b/libavcodec/rpi_hevcdsp.c +@@ -0,0 +1,415 @@ ++/* ++ * HEVC video decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere ++ * ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "rpi_hevcdsp.h" ++ ++static const int8_t transform[32][32] = { ++ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, ++ { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, ++ -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 }, ++ { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90, ++ -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 }, ++ { 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, ++ 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 }, ++ { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89, ++ 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 }, ++ { 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, ++ -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 }, ++ { 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87, ++ -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 }, ++ { 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, ++ 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 }, ++ { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, ++ 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 }, ++ { 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, ++ -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 }, ++ { 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80, ++ -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 }, ++ { 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, ++ 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 }, ++ { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75, ++ 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 }, ++ { 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, ++ -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 }, ++ { 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70, ++ -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 }, ++ { 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, ++ 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 }, ++ { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, ++ 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 }, ++ { 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, ++ -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 }, ++ { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57, ++ -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 }, ++ { 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, ++ 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 }, ++ { 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50, ++ 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 }, ++ { 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 
78, ++ -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 }, ++ { 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43, ++ -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43 }, ++ { 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, ++ 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38 }, ++ { 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, ++ 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 }, ++ { 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, ++ -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 }, ++ { 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25, ++ -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 }, ++ { 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, ++ 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 }, ++ { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18, ++ 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 }, ++ { 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90, ++ -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 }, ++ { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9, ++ -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 }, ++ { 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, ++ 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 }, ++}; ++ ++DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_epel_filters[7][4]) = { ++ { -2, 58, 10, -2}, ++ { -4, 54, 16, -2}, ++ { -6, 46, 28, -4}, ++ { -4, 36, 36, -4}, ++ { -4, 28, 46, -6}, ++ { -2, 16, 54, -4}, ++ { -2, 10, 58, -2}, ++}; ++ ++DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_qpel_filters[3][16]) = { ++ { -1, 4,-10, 58, 17, -5, 1, 0, -1, 4,-10, 58, 17, -5, 1, 0}, ++ { -1, 4,-11, 40, 40,-11, 4, -1, -1, 4,-11, 40, 40,-11, 4, -1}, ++ { 0, 1, -5, 17, 58,-10, 4, -1, 0, 1, -5, 17, 58,-10, 4, -1} ++}; ++ ++#define BIT_DEPTH 8 ++#include "rpi_hevcdsp_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 9 ++#include "rpi_hevcdsp_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 10 ++#include "rpi_hevcdsp_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 12 ++#include "rpi_hevcdsp_template.c" ++#undef BIT_DEPTH ++ ++static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc, ++ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, ++ const MvField *curr, const MvField *neigh, uint8_t *bs) ++{ ++ for (; pus > 0; pus--) { ++ int strength, out; ++ int curr_refL0 = curr_rpl0[curr->ref_idx[0]]; ++ int curr_refL1 = curr_rpl1[curr->ref_idx[1]]; ++ int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]]; ++ int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]]; ++ ++#if 1 // This more directly matches the original implementation ++ if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) { ++ // same L0 and L1 ++ if (curr_refL0 == neigh_refL0 && ++ curr_refL0 == curr_refL1 && ++ neigh_refL0 == neigh_refL1) { ++ if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 || ++ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) && ++ (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 || ++ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y 
- curr->mv[1].y) >= 4)) ++ strength = 1; ++ else ++ strength = 0; ++ } else if (neigh_refL0 == curr_refL0 && ++ neigh_refL1 == curr_refL1) { ++ if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 || ++ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) ++ strength = 1; ++ else ++ strength = 0; ++ } else if (neigh_refL1 == curr_refL0 && ++ neigh_refL0 == curr_refL1) { ++ if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 || ++ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4) ++ strength = 1; ++ else ++ strength = 0; ++ } else { ++ strength = 1; ++ } ++ } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV ++ Mv curr_mv0, neigh_mv0; ++ ++ if (curr->pred_flag & 1) { ++ curr_mv0 = curr->mv[0]; ++ } else { ++ curr_mv0 = curr->mv[1]; ++ curr_refL0 = curr_refL1; ++ } ++ ++ if (neigh->pred_flag & 1) { ++ neigh_mv0 = neigh->mv[0]; ++ } else { ++ neigh_mv0 = neigh->mv[1]; ++ neigh_refL0 = neigh_refL1; ++ } ++ ++ if (curr_refL0 == neigh_refL0) { ++ if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4) ++ strength = 1; ++ else ++ strength = 0; ++ } else ++ strength = 1; ++ } else ++ strength = 1; ++#else // This has exactly the same effect, but is more suitable for vectorisation ++ Mv curr_mv[2]; ++ Mv neigh_mv[2]; ++ memcpy(curr_mv, curr->mv, sizeof curr_mv); ++ memcpy(neigh_mv, neigh->mv, sizeof neigh_mv); ++ ++ if (!(curr->pred_flag & 2)) { ++ curr_mv[1] = curr_mv[0]; ++ curr_refL1 = curr_refL0; ++ } ++ if (!(neigh->pred_flag & 2)) { ++ neigh_mv[1] = neigh_mv[0]; ++ neigh_refL1 = neigh_refL0; ++ } ++ if (!(curr->pred_flag & 1)) { ++ curr_mv[0] = curr_mv[1]; ++ curr_refL0 = curr_refL1; ++ } ++ if (!(neigh->pred_flag & 1)) { ++ neigh_mv[0] = neigh_mv[1]; ++ neigh_refL0 = neigh_refL1; ++ } ++ ++ strength = 1; ++ ++ strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) | ++ (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) | ++ (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4); ++ ++ strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) | ++ (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) | ++ (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4); ++ ++ strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2); ++#endif ++ ++ curr += in_inc / sizeof (MvField); ++ neigh += in_inc / sizeof (MvField); ++ ++ for (out = dup; out > 0; out--) ++ { ++ *bs = strength; ++ bs += out_inc; ++ } ++ } ++} ++ ++void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) ++{ ++#undef FUNC ++#define FUNC(a, depth) a ## _ ## depth ++ ++#undef PEL_FUNC ++#define PEL_FUNC(dst1, idx1, idx2, a, depth) \ ++ for(i = 0 ; i < 10 ; i++) \ ++{ \ ++ hevcdsp->dst1[i][idx1][idx2] = a ## _ ## depth; \ ++} ++ ++#undef EPEL_FUNCS ++#define EPEL_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_epel, 0, 0, put_hevc_pel_pixels, depth); \ ++ PEL_FUNC(put_hevc_epel, 0, 1, put_hevc_epel_h, depth); \ ++ PEL_FUNC(put_hevc_epel, 1, 0, put_hevc_epel_v, depth); \ ++ PEL_FUNC(put_hevc_epel, 1, 1, put_hevc_epel_hv, depth) ++ ++#undef EPEL_UNI_FUNCS ++#define EPEL_UNI_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_epel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \ ++ PEL_FUNC(put_hevc_epel_uni, 0, 1, put_hevc_epel_uni_h, depth); \ ++ PEL_FUNC(put_hevc_epel_uni, 1, 0, 
put_hevc_epel_uni_v, depth); \ ++ PEL_FUNC(put_hevc_epel_uni, 1, 1, put_hevc_epel_uni_hv, depth); \ ++ PEL_FUNC(put_hevc_epel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \ ++ PEL_FUNC(put_hevc_epel_uni_w, 0, 1, put_hevc_epel_uni_w_h, depth); \ ++ PEL_FUNC(put_hevc_epel_uni_w, 1, 0, put_hevc_epel_uni_w_v, depth); \ ++ PEL_FUNC(put_hevc_epel_uni_w, 1, 1, put_hevc_epel_uni_w_hv, depth) ++ ++#undef EPEL_BI_FUNCS ++#define EPEL_BI_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_epel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \ ++ PEL_FUNC(put_hevc_epel_bi, 0, 1, put_hevc_epel_bi_h, depth); \ ++ PEL_FUNC(put_hevc_epel_bi, 1, 0, put_hevc_epel_bi_v, depth); \ ++ PEL_FUNC(put_hevc_epel_bi, 1, 1, put_hevc_epel_bi_hv, depth); \ ++ PEL_FUNC(put_hevc_epel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \ ++ PEL_FUNC(put_hevc_epel_bi_w, 0, 1, put_hevc_epel_bi_w_h, depth); \ ++ PEL_FUNC(put_hevc_epel_bi_w, 1, 0, put_hevc_epel_bi_w_v, depth); \ ++ PEL_FUNC(put_hevc_epel_bi_w, 1, 1, put_hevc_epel_bi_w_hv, depth) ++ ++#undef QPEL_FUNCS ++#define QPEL_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_qpel, 0, 0, put_hevc_pel_pixels, depth); \ ++ PEL_FUNC(put_hevc_qpel, 0, 1, put_hevc_qpel_h, depth); \ ++ PEL_FUNC(put_hevc_qpel, 1, 0, put_hevc_qpel_v, depth); \ ++ PEL_FUNC(put_hevc_qpel, 1, 1, put_hevc_qpel_hv, depth) ++ ++#undef QPEL_UNI_FUNCS ++#define QPEL_UNI_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_qpel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni, 0, 1, put_hevc_qpel_uni_h, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni, 1, 0, put_hevc_qpel_uni_v, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni, 1, 1, put_hevc_qpel_uni_hv, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni_w, 0, 1, put_hevc_qpel_uni_w_h, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni_w, 1, 0, put_hevc_qpel_uni_w_v, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni_w, 1, 1, put_hevc_qpel_uni_w_hv, depth) ++ ++#undef QPEL_BI_FUNCS ++#define QPEL_BI_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_qpel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi, 0, 1, put_hevc_qpel_bi_h, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi, 1, 0, put_hevc_qpel_bi_v, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi, 1, 1, put_hevc_qpel_bi_hv, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi_w, 0, 1, put_hevc_qpel_bi_w_h, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth) ++ ++#define SLICED_ADD_RESIDUAL(depth)\ ++ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \ ++ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \ ++ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \ ++ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \ ++ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \ ++ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \ ++ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \ ++ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \ ++ hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \ ++ hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \ ++ hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \ ++ hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \ ++ hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[2] 
= FUNC(add_residual16x16_dc_c, depth); \
++    hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \
++    hevcdsp->put_pcm_c            = FUNC(put_pcm_c, depth)
++#define SLICED_LOOP_FILTERS(depth)\
++    hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \
++    hevcdsp->hevc_h_loop_filter_uv    = FUNC(hevc_h_loop_filter_uv, depth); \
++    hevcdsp->hevc_v_loop_filter_uv2   = FUNC(hevc_v_loop_filter_uv2, depth)
++#define SLICED_SAO(depth)\
++    for (i = 0; i != SAO_FILTER_N; ++i) { \
++        hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \
++        hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth); \
++    } \
++    hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \
++    hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth)
++
++#define HEVC_DSP(depth) \
++    hevcdsp->put_pcm            = FUNC(put_pcm, depth); \
++    hevcdsp->add_residual[0]    = FUNC(add_residual4x4, depth); \
++    hevcdsp->add_residual[1]    = FUNC(add_residual8x8, depth); \
++    hevcdsp->add_residual[2]    = FUNC(add_residual16x16, depth); \
++    hevcdsp->add_residual[3]    = FUNC(add_residual32x32, depth); \
++    hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \
++    hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \
++    hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \
++    hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \
++    SLICED_ADD_RESIDUAL(depth); \
++    hevcdsp->dequant            = FUNC(dequant, depth); \
++    hevcdsp->transform_rdpcm    = FUNC(transform_rdpcm, depth); \
++    hevcdsp->transform_4x4_luma = FUNC(transform_4x4_luma, depth); \
++    hevcdsp->idct[0]            = FUNC(idct_4x4, depth); \
++    hevcdsp->idct[1]            = FUNC(idct_8x8, depth); \
++    hevcdsp->idct[2]            = FUNC(idct_16x16, depth); \
++    hevcdsp->idct[3]            = FUNC(idct_32x32, depth); \
++    \
++    hevcdsp->idct_dc[0]         = FUNC(idct_4x4_dc, depth); \
++    hevcdsp->idct_dc[1]         = FUNC(idct_8x8_dc, depth); \
++    hevcdsp->idct_dc[2]         = FUNC(idct_16x16_dc, depth); \
++    hevcdsp->idct_dc[3]         = FUNC(idct_32x32_dc, depth); \
++    \
++    for (i = 0; i != SAO_FILTER_N; ++i) { \
++        hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \
++        hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \
++    } \
++    hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \
++    hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \
++    SLICED_SAO(depth); \
++    \
++    QPEL_FUNCS(depth); \
++    QPEL_UNI_FUNCS(depth); \
++    QPEL_BI_FUNCS(depth); \
++    EPEL_FUNCS(depth); \
++    EPEL_UNI_FUNCS(depth); \
++    EPEL_BI_FUNCS(depth); \
++    \
++    SLICED_LOOP_FILTERS(depth); \
++    hevcdsp->hevc_h_loop_filter_luma     = FUNC(hevc_h_loop_filter_luma, depth); \
++    hevcdsp->hevc_v_loop_filter_luma     = FUNC(hevc_v_loop_filter_luma, depth); \
++    hevcdsp->hevc_h_loop_filter_chroma   = FUNC(hevc_h_loop_filter_chroma, depth); \
++    hevcdsp->hevc_v_loop_filter_chroma   = FUNC(hevc_v_loop_filter_chroma, depth); \
++    hevcdsp->hevc_h_loop_filter_luma_c   = FUNC(hevc_h_loop_filter_luma, depth); \
++    hevcdsp->hevc_v_loop_filter_luma_c   = FUNC(hevc_v_loop_filter_luma, depth); \
++    hevcdsp->hevc_h_loop_filter_chroma_c = FUNC(hevc_h_loop_filter_chroma, depth); \
++    hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth)
++int i = 0;
++
++    switch (bit_depth) {
++    case 9:
++        HEVC_DSP(9);
++        break;
++    case 10:
++        HEVC_DSP(10);
++        break;
++    case 12:
++        HEVC_DSP(12);
++        break;
++    default:
++        HEVC_DSP(8);
++        break;
++    }
++
++    hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
++
++    if (ARCH_PPC)
++        ff_hevc_rpi_dsp_init_ppc(hevcdsp, bit_depth);
++    if (ARCH_X86)
++        ff_hevc_rpi_dsp_init_x86(hevcdsp, bit_depth);
++    if (ARCH_ARM)
++        ff_hevcdsp_rpi_init_arm(hevcdsp, bit_depth);
++    if (ARCH_MIPS)
++        ff_hevc_rpi_dsp_init_mips(hevcdsp, bit_depth);
++}
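ff_hevc_rpi_dsp_init() above fills the function-pointer tables by expanding HEVC_DSP() once per supported bit depth, while the per-depth function bodies come from re-including rpi_hevcdsp_template.c under different BIT_DEPTH values (the #define BIT_DEPTH 8/9/10/12 blocks earlier in the file), with FUNC() pasting the depth onto each name. A self-contained sketch of the same name-pasting trick — clip_pixel_<depth> is a made-up name, and the "template" is a macro here only so one file compiles (the real code uses a separate included file):

    /* depth_template.c - the FUNC(name, depth) pasting trick in miniature */
    #include <stdio.h>

    #define FUNC2(a, b) a ## _ ## b
    #define FUNC(a, b)  FUNC2(a, b)

    /* "Template": emits clip_pixel_<DEPTH>() for whatever DEPTH is given. */
    #define DEFINE_CLIP(DEPTH)                        \
        static int FUNC(clip_pixel, DEPTH)(int v)     \
        {                                             \
            const int max = (1 << DEPTH) - 1;         \
            return v < 0 ? 0 : v > max ? max : v;     \
        }

    DEFINE_CLIP(8)   /* emits clip_pixel_8()  */
    DEFINE_CLIP(10)  /* emits clip_pixel_10() */

    /* Runtime dispatch, like the switch (bit_depth) above. */
    int main(void)
    {
        int (*clip)(int) = clip_pixel_8;
        printf("8-bit:  %d\n", clip(300));    /* 255  */
        clip = clip_pixel_10;
        printf("10-bit: %d\n", clip(3000));   /* 1023 */
        return 0;
    }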
+diff --git a/libavcodec/rpi_hevcdsp.h b/libavcodec/rpi_hevcdsp.h
+new file mode 100644
+index 0000000000..c974baa820
+--- /dev/null
++++ b/libavcodec/rpi_hevcdsp.h
+@@ -0,0 +1,182 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
++ *
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVCDSP_H
++#define AVCODEC_RPI_HEVCDSP_H
++
++#include "hevc.h"
++#include "get_bits.h"
++
++#define MAX_PB_SIZE 64
++
++typedef struct SAOParams {
++//    int offset_abs[3][4];    ///< sao_offset_abs
++//    int offset_sign[3][4];   ///< sao_offset_sign
++
++    uint8_t band_position[3];  ///< sao_band_position
++    uint8_t eo_class[3];       ///< sao_eo_class
++    uint8_t type_idx[3];       ///< sao_type_idx
++
++    int16_t offset_val[3][5];  ///< SaoOffsetVal
++    const int dc_v = dc >> 16;
++    const int dc_u = (dc << 16) >> 16;
++
++    stride /= sizeof(pixel);
++
++    for (y = 0; y < size; y++) {
++        for (x = 0; x < size * 2; x += 2) {
++            dst[x]     = av_clip_pixel(dst[x]     + dc_u);
++            dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
++        }
++        dst += stride;
++    }
++}
++
++
++static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res,
++                                  ptrdiff_t stride)
++{
++    FUNC(add_residual)(_dst, res, stride, 4);
++}
++
++static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res,
++                                  ptrdiff_t stride)
++{
++    FUNC(add_residual)(_dst, res, stride, 8);
++}
++
++static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res,
++                                    ptrdiff_t stride)
++{
++    FUNC(add_residual)(_dst, res, stride, 16);
++}
++
++static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
++                                    ptrdiff_t stride)
++{
++    FUNC(add_residual)(_dst, res, stride, 32);
++}
++
++static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++    FUNC(add_residual_dc)(_dst, stride, dc, 4);
++}
++
++static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++    FUNC(add_residual_dc)(_dst, stride, dc, 8);
++}
++
++static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++    FUNC(add_residual_dc)(_dst, stride, dc, 16);
++}
++
++static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++    FUNC(add_residual_dc)(_dst, stride, dc, 32);
++}
++
++// -- U -- (plaited)
++
++static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res,
++                                    ptrdiff_t stride, int dc_u)
++{
++    FUNC(add_residual_u)(_dst, res, stride, dc_u, 4);
++}
++
++static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res,
++                                    ptrdiff_t stride, int dc_u)
++{
++    FUNC(add_residual_u)(_dst, res, stride, dc_u, 8);
++}
++
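These _u/_v/_c helpers operate on the sand formats' "plaited" chroma: a single plane where U lives at even byte offsets and V at odd ones, as the dc_u/dc_v interleaving above shows. A standalone sketch of that addressing for a toy 4x4 block, adding a residual to the U samples only and ignoring the DC term the patch's helpers also carry:

    /* plaited_uv.c - interleaved (plaited) UV addressing in miniature */
    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    static void add_residual_u(uint8_t *dst, const int16_t *res,
                               ptrdiff_t stride, unsigned int size)
    {
        for (unsigned int y = 0; y < size; y++, dst += stride)
            for (unsigned int x = 0; x < size; x++) {
                int v = dst[2 * x] + *res++;          /* U at even offsets only */
                dst[2 * x] = v < 0 ? 0 : v > 255 ? 255 : v;
            }
    }

    int main(void)
    {
        uint8_t plane[4 * 8] = {0};       /* 4 rows, 4 UV byte pairs per row */
        int16_t res[16];
        for (int i = 0; i < 16; i++)
            res[i] = 100;

        add_residual_u(plane, res, 8, 4); /* stride 8 bytes = 4 UV pairs */
        printf("U=%d V=%d\n", plane[0], plane[1]);   /* U=100 V=0: V untouched */
        return 0;
    }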
++static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_u) ++{ ++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 16); ++} ++ ++static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_u) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++// -- V -- (plaited) ++ ++static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 4); ++} ++ ++static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 8); ++} ++ ++static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 16); ++} ++ ++static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++// -- C -- (plaited - both U & V) ++ ++static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_c)(_dst, res, stride, 4); ++} ++ ++static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_c)(_dst, res, stride, 8); ++} ++ ++static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_c)(_dst, res, stride, 16); ++} ++ ++static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 4); ++} ++ ++static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 8); ++} ++ ++static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 16); ++} ++ ++static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++ ++static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode) ++{ ++ int16_t *coeffs = (int16_t *) _coeffs; ++ int x, y; ++ int size = 1 << log2_size; ++ ++ if (mode) { ++ coeffs += size; ++ for (y = 0; y < size - 1; y++) { ++ for (x = 0; x < size; x++) ++ coeffs[x] += coeffs[x - size]; ++ coeffs += size; ++ } ++ } else { ++ for (y = 0; y < size; y++) { ++ for (x = 1; x < size; x++) ++ coeffs[x] += coeffs[x - 1]; ++ coeffs += size; ++ } ++ } ++} ++ ++static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size) ++{ ++ int shift = 15 - BIT_DEPTH - log2_size; ++ int x, y; ++ int size = 1 << log2_size; ++ ++ if (shift > 0) { ++ int offset = 1 << (shift - 1); ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size; x++) { ++ *coeffs = (*coeffs + offset) >> shift; ++ coeffs++; ++ } ++ } ++ } else { ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size; x++) { ++ *coeffs = *coeffs << -shift; ++ coeffs++; ++ } ++ } ++ } ++} ++ ++#define SET(dst, x) (dst) = (x) ++#define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift) ++ ++#define TR_4x4_LUMA(dst, src, step, assign) \ ++ do { \ ++ 
int c0 = src[0 * step] + src[2 * step]; \ ++ int c1 = src[2 * step] + src[3 * step]; \ ++ int c2 = src[0 * step] - src[3 * step]; \ ++ int c3 = 74 * src[1 * step]; \ ++ \ ++ assign(dst[2 * step], 74 * (src[0 * step] - \ ++ src[2 * step] + \ ++ src[3 * step])); \ ++ assign(dst[0 * step], 29 * c0 + 55 * c1 + c3); \ ++ assign(dst[1 * step], 55 * c2 - 29 * c1 + c3); \ ++ assign(dst[3 * step], 55 * c0 + 29 * c2 - c3); \ ++ } while (0) ++ ++static void FUNC(transform_4x4_luma)(int16_t *coeffs) ++{ ++ int i; ++ int shift = 7; ++ int add = 1 << (shift - 1); ++ int16_t *src = coeffs; ++ ++ for (i = 0; i < 4; i++) { ++ TR_4x4_LUMA(src, src, 4, SCALE); ++ src++; ++ } ++ ++ shift = 20 - BIT_DEPTH; ++ add = 1 << (shift - 1); ++ for (i = 0; i < 4; i++) { ++ TR_4x4_LUMA(coeffs, coeffs, 1, SCALE); ++ coeffs += 4; ++ } ++} ++ ++#undef TR_4x4_LUMA ++ ++#define TR_4(dst, src, dstep, sstep, assign, end) \ ++ do { \ ++ const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep]; \ ++ const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep]; \ ++ const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep]; \ ++ const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep]; \ ++ \ ++ assign(dst[0 * dstep], e0 + o0); \ ++ assign(dst[1 * dstep], e1 + o1); \ ++ assign(dst[2 * dstep], e1 - o1); \ ++ assign(dst[3 * dstep], e0 - o0); \ ++ } while (0) ++ ++#define TR_8(dst, src, dstep, sstep, assign, end) \ ++ do { \ ++ int i, j; \ ++ int e_8[4]; \ ++ int o_8[4] = { 0 }; \ ++ for (i = 0; i < 4; i++) \ ++ for (j = 1; j < end; j += 2) \ ++ o_8[i] += transform[4 * j][i] * src[j * sstep]; \ ++ TR_4(e_8, src, 1, 2 * sstep, SET, 4); \ ++ \ ++ for (i = 0; i < 4; i++) { \ ++ assign(dst[i * dstep], e_8[i] + o_8[i]); \ ++ assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]); \ ++ } \ ++ } while (0) ++ ++#define TR_16(dst, src, dstep, sstep, assign, end) \ ++ do { \ ++ int i, j; \ ++ int e_16[8]; \ ++ int o_16[8] = { 0 }; \ ++ for (i = 0; i < 8; i++) \ ++ for (j = 1; j < end; j += 2) \ ++ o_16[i] += transform[2 * j][i] * src[j * sstep]; \ ++ TR_8(e_16, src, 1, 2 * sstep, SET, 8); \ ++ \ ++ for (i = 0; i < 8; i++) { \ ++ assign(dst[i * dstep], e_16[i] + o_16[i]); \ ++ assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]); \ ++ } \ ++ } while (0) ++ ++#define TR_32(dst, src, dstep, sstep, assign, end) \ ++ do { \ ++ int i, j; \ ++ int e_32[16]; \ ++ int o_32[16] = { 0 }; \ ++ for (i = 0; i < 16; i++) \ ++ for (j = 1; j < end; j += 2) \ ++ o_32[i] += transform[j][i] * src[j * sstep]; \ ++ TR_16(e_32, src, 1, 2 * sstep, SET, end / 2); \ ++ \ ++ for (i = 0; i < 16; i++) { \ ++ assign(dst[i * dstep], e_32[i] + o_32[i]); \ ++ assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]); \ ++ } \ ++ } while (0) ++ ++#define IDCT_VAR4(H) \ ++ int limit2 = FFMIN(col_limit + 4, H) ++#define IDCT_VAR8(H) \ ++ int limit = FFMIN(col_limit, H); \ ++ int limit2 = FFMIN(col_limit + 4, H) ++#define IDCT_VAR16(H) IDCT_VAR8(H) ++#define IDCT_VAR32(H) IDCT_VAR8(H) ++ ++#define IDCT(H) \ ++static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs, \ ++ int col_limit) \ ++{ \ ++ int i; \ ++ int shift = 7; \ ++ int add = 1 << (shift - 1); \ ++ int16_t *src = coeffs; \ ++ IDCT_VAR ## H(H); \ ++ \ ++ for (i = 0; i < H; i++) { \ ++ TR_ ## H(src, src, H, H, SCALE, limit2); \ ++ if (limit2 < H && i%4 == 0 && !!i) \ ++ limit2 -= 4; \ ++ src++; \ ++ } \ ++ \ ++ shift = 20 - BIT_DEPTH; \ ++ add = 1 << (shift - 1); \ ++ for (i = 0; i < H; i++) { \ ++ TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit); \ ++ coeffs += H; \ ++ } \ ++} ++ ++#define IDCT_DC(H) \ ++static void FUNC(idct_ ## H ## x ## H ## 
_dc)(int16_t *coeffs) \ ++{ \ ++ int i, j; \ ++ int shift = 14 - BIT_DEPTH; \ ++ int add = 1 << (shift - 1); \ ++ int coeff = (((coeffs[0] + 1) >> 1) + add) >> shift; \ ++ \ ++ for (j = 0; j < H; j++) { \ ++ for (i = 0; i < H; i++) { \ ++ coeffs[i + j * H] = coeff; \ ++ } \ ++ } \ ++} ++ ++IDCT( 4) ++IDCT( 8) ++IDCT(16) ++IDCT(32) ++ ++IDCT_DC( 4) ++IDCT_DC( 8) ++IDCT_DC(16) ++IDCT_DC(32) ++ ++#undef TR_4 ++#undef TR_8 ++#undef TR_16 ++#undef TR_32 ++ ++#undef SET ++#undef SCALE ++ ++static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, ++ int width, int height) ++{ ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int offset_table[32] = { 0 }; ++ int k, y, x; ++ int shift = BIT_DEPTH - 5; ++ ++ stride_dst /= sizeof(pixel); ++ stride_src /= sizeof(pixel); ++ ++ for (k = 0; k < 4; k++) ++ offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1]; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]); ++ dst += stride_dst; ++ src += stride_src; ++ } ++} ++ ++#define CMP(a, b) (((a) > (b)) - ((a) < (b))) ++ ++static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, ++ int eo, int width, int height) { ++ ++ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; ++ static const int8_t pos[4][2][2] = { ++ { { -1, 0 }, { 1, 0 } }, // horizontal ++ { { 0, -1 }, { 0, 1 } }, // vertical ++ { { -1, -1 }, { 1, 1 } }, // 45 degree ++ { { 1, -1 }, { -1, 1 } }, // 135 degree ++ }; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int a_stride, b_stride; ++ int x, y; ++ ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); ++ stride_dst /= sizeof(pixel); ++ ++ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src; ++ b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) { ++ int diff0 = CMP(src[x], src[x + a_stride]); ++ int diff1 = CMP(src[x], src[x + b_stride]); ++ int offset_val = edge_idx[2 + diff0 + diff1]; ++ dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]); ++ } ++ src += stride_src; ++ dst += stride_dst; ++ } ++} ++ ++ ++#if BIT_DEPTH == 10 ++// We need a 32 bit variation for the _c restores so hijack bit depth 10 ++#undef pixel ++#undef BIT_DEPTH ++#define pixel uint32_t ++#define BIT_DEPTH 32 ++// All 16 bit variations are the same ++#define sao_edge_restore_0_10 sao_edge_restore_0_9 ++#define sao_edge_restore_1_10 sao_edge_restore_1_9 ++#define sao_edge_restore_0_11 sao_edge_restore_0_9 ++#define sao_edge_restore_1_11 sao_edge_restore_1_9 ++#define sao_edge_restore_0_12 sao_edge_restore_0_9 ++#define sao_edge_restore_1_12 sao_edge_restore_1_9 ++#define sao_edge_restore_0_13 sao_edge_restore_0_9 ++#define sao_edge_restore_1_13 sao_edge_restore_1_9 ++#define sao_edge_restore_0_14 sao_edge_restore_0_9 ++#define sao_edge_restore_1_14 sao_edge_restore_1_9 ++#define sao_edge_restore_0_15 sao_edge_restore_0_9 ++#define sao_edge_restore_1_15 sao_edge_restore_1_9 ++#define sao_edge_restore_0_16 sao_edge_restore_0_9 ++#define sao_edge_restore_1_16 sao_edge_restore_1_9 +#endif -diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h -new file mode 100644 -index 0000000000..82bf380eb4 ---- /dev/null -+++ b/libavcodec/rpi_shader.h -@@ -0,0 +1,63 @@ -+#ifndef rpi_shader_H -+#define rpi_shader_H -+ -+extern unsigned int 
rpi_shader[]; -+ -+#define mc_setup_c_q0 (rpi_shader + 0) -+#define mc_start (rpi_shader + 0) -+#define mc_setup_c_qn (rpi_shader + 2) -+#define mc_filter_c_p (rpi_shader + 142) -+#define mc_filter_c_p_l1 (rpi_shader + 272) -+#define mc_filter_c_b (rpi_shader + 402) -+#define mc_sync_q0 (rpi_shader + 590) -+#define mc_sync_q1 (rpi_shader + 608) -+#define mc_sync_q2 (rpi_shader + 620) -+#define mc_sync_q3 (rpi_shader + 632) -+#define mc_sync_q4 (rpi_shader + 644) -+#define mc_sync_q5 (rpi_shader + 662) -+#define mc_sync_q6 (rpi_shader + 674) -+#define mc_sync_q7 (rpi_shader + 686) -+#define mc_sync_q8 (rpi_shader + 698) -+#define mc_sync_q9 (rpi_shader + 716) -+#define mc_sync_q10 (rpi_shader + 728) -+#define mc_sync_q11 (rpi_shader + 740) -+#define mc_exit_c_qn (rpi_shader + 752) -+#define mc_exit_y_qn (rpi_shader + 752) -+#define mc_exit_c_q0 (rpi_shader + 770) -+#define mc_exit_y_q0 (rpi_shader + 770) -+#define mc_setup_y_q0 (rpi_shader + 790) -+#define mc_setup_y_qn (rpi_shader + 792) -+#define mc_filter_y_pxx (rpi_shader + 1032) -+#define mc_filter_y_bxx (rpi_shader + 1162) -+#define mc_filter_y_p00 (rpi_shader + 1292) -+#define mc_filter_y_b00 (rpi_shader + 1382) -+#define mc_setup_c10_q0 (rpi_shader + 1462) -+#define mc_setup_c10_qn (rpi_shader + 1464) -+#define mc_filter_c10_p (rpi_shader + 1600) -+#define mc_filter_c10_p_l1 (rpi_shader + 1728) -+#define mc_filter_c10_b (rpi_shader + 1856) -+#define mc_sync10_q0 (rpi_shader + 2042) -+#define mc_sync10_q1 (rpi_shader + 2060) -+#define mc_sync10_q2 (rpi_shader + 2072) -+#define mc_sync10_q3 (rpi_shader + 2084) -+#define mc_sync10_q4 (rpi_shader + 2096) -+#define mc_sync10_q5 (rpi_shader + 2114) -+#define mc_sync10_q6 (rpi_shader + 2126) -+#define mc_sync10_q7 (rpi_shader + 2138) -+#define mc_sync10_q8 (rpi_shader + 2150) -+#define mc_sync10_q9 (rpi_shader + 2168) -+#define mc_sync10_q10 (rpi_shader + 2180) -+#define mc_sync10_q11 (rpi_shader + 2192) -+#define mc_exit_c10_q0 (rpi_shader + 2204) -+#define mc_exit_y10_q0 (rpi_shader + 2204) -+#define mc_exit_c10_qn (rpi_shader + 2224) -+#define mc_exit_y10_qn (rpi_shader + 2224) -+#define mc_setup_y10_q0 (rpi_shader + 2242) -+#define mc_setup_y10_qn (rpi_shader + 2244) -+#define mc_filter_y10_pxx (rpi_shader + 2494) -+#define mc_filter_y10_p00 (rpi_shader + 2624) -+#define mc_filter_y10_bxx (rpi_shader + 2716) -+#define mc_filter_y10_b00 (rpi_shader + 2846) -+#define mc_end (rpi_shader + 2926) ++#if BIT_DEPTH <= 9 || BIT_DEPTH == 32 ++static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, ++ int *borders, int _width, int _height, ++ int c_idx, uint8_t *vert_edge, ++ uint8_t *horiz_edge, uint8_t *diag_edge) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int sao_eo_class = sao->eo_class[c_idx]; ++ int init_x = 0, width = _width, height = _height; ++ ++ stride_dst /= sizeof(pixel); ++ stride_src /= sizeof(pixel); ++ ++ if (sao_eo_class != SAO_EO_VERT) { ++ if (borders[0]) { ++ for (y = 0; y < height; y++) { ++ dst[y * stride_dst] = src[y * stride_src]; ++ } ++ init_x = 1; ++ } ++ if (borders[2]) { ++ int offset = width - 1; ++ for (x = 0; x < height; x++) { ++ dst[x * stride_dst + offset] = src[x * stride_src + offset]; ++ } ++ width--; ++ } ++ } ++ if (sao_eo_class != SAO_EO_HORIZ) { ++ if (borders[1]) { ++ for (x = init_x; x < width; x++) ++ dst[x] = src[x]; ++ } ++ if (borders[3]) { ++ ptrdiff_t y_stride_dst = stride_dst * (height - 1); ++ ptrdiff_t y_stride_src = stride_src * 
(height - 1); ++ for (x = init_x; x < width; x++) ++ dst[x + y_stride_dst] = src[x + y_stride_src]; ++ height--; ++ } ++ } ++} ++ ++static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, ++ int *borders, int _width, int _height, ++ int c_idx, uint8_t *vert_edge, ++ uint8_t *horiz_edge, uint8_t *diag_edge) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int sao_eo_class = sao->eo_class[c_idx]; ++ int init_x = 0, init_y = 0, width = _width, height = _height; ++ ++ stride_dst /= sizeof(pixel); ++ stride_src /= sizeof(pixel); ++ ++ if (sao_eo_class != SAO_EO_VERT) { ++ if (borders[0]) { ++ for (y = 0; y < height; y++) { ++ dst[y * stride_dst] = src[y * stride_src]; ++ } ++ init_x = 1; ++ } ++ if (borders[2]) { ++ int offset = width - 1; ++ for (x = 0; x < height; x++) { ++ dst[x * stride_dst + offset] = src[x * stride_src + offset]; ++ } ++ width--; ++ } ++ } ++ if (sao_eo_class != SAO_EO_HORIZ) { ++ if (borders[1]) { ++ for (x = init_x; x < width; x++) ++ dst[x] = src[x]; ++ init_y = 1; ++ } ++ if (borders[3]) { ++ ptrdiff_t y_stride_dst = stride_dst * (height - 1); ++ ptrdiff_t y_stride_src = stride_src * (height - 1); ++ for (x = init_x; x < width; x++) ++ dst[x + y_stride_dst] = src[x + y_stride_src]; ++ height--; ++ } ++ } ++ ++ { ++ int save_upper_left = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1]; ++ int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D && !borders[1] && !borders[2]; ++ int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3]; ++ int save_lower_left = !diag_edge[3] && sao_eo_class == SAO_EO_45D && !borders[0] && !borders[3]; ++ ++ // Restore pixels that can't be modified ++ if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) { ++ for(y = init_y+save_upper_left; y< height-save_lower_left; y++) ++ dst[y*stride_dst] = src[y*stride_src]; ++ } ++ if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) { ++ for(y = init_y+save_upper_right; y< height-save_lower_right; y++) ++ dst[y*stride_dst+width-1] = src[y*stride_src+width-1]; ++ } ++ ++ if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) { ++ for(x = init_x+save_upper_left; x < width-save_upper_right; x++) ++ dst[x] = src[x]; ++ } ++ if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) { ++ for(x = init_x+save_lower_left; x < width-save_lower_right; x++) ++ dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x]; ++ } ++ if(diag_edge[0] && sao_eo_class == SAO_EO_135D) ++ dst[0] = src[0]; ++ if(diag_edge[1] && sao_eo_class == SAO_EO_45D) ++ dst[width-1] = src[width-1]; ++ if(diag_edge[2] && sao_eo_class == SAO_EO_135D) ++ dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1]; ++ if(diag_edge[3] && sao_eo_class == SAO_EO_45D) ++ dst[stride_dst*(height-1)] = src[stride_src*(height-1)]; ++ ++ } ++} ++#endif ++#if BIT_DEPTH == 32 ++#undef BIT_DEPTH ++#undef pixel ++#define BIT_DEPTH 10 ++#define pixel uint16_t ++#endif ++ ++// --- Plaited chroma versions ++ ++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) ++{ ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int offset_table_u[32] = { 0 }; ++ int offset_table_v[32] = { 0 }; ++ int k, y, x; ++ int shift = BIT_DEPTH - 5; ++ ++ stride_dst /= sizeof(pixel); ++ 
stride_src /= sizeof(pixel); ++ width *= 2; ++ ++ for (k = 0; k < 4; k++) ++ { ++ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; ++ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; ++ } ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x += 2) ++ { ++// printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift); ++// printf("offsets=%x,%x\n", src[x + 0], src[x + 1]); ++ // *** & 31 shouldn't be wanted but just now we generate broken input that ++ // crashes us in 10-bit world ++ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]); ++ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]); ++ } ++ dst += stride_dst; ++ src += stride_src; ++ } ++} ++ ++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, ++ int eo, int width, int height) { + -+#endif -diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm -new file mode 100644 -index 0000000000..ba6cc13a95 ---- /dev/null -+++ b/libavcodec/rpi_shader.qasm -@@ -0,0 +1,1741 @@ ++ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; ++ static const int8_t pos[4][2][2] = { ++ { { -1, 0 }, { 1, 0 } }, // horizontal ++ { { 0, -1 }, { 0, 1 } }, // vertical ++ { { -1, -1 }, { 1, 1 } }, // 45 degree ++ { { 1, -1 }, { -1, 1 } }, // 135 degree ++ }; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int a_stride, b_stride; ++ int x, y; ++ ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); + -+# The @ "mul_used", 0 annotations that occur by various mul blocks suppress -+# the warning that we are using rotation & ra/rb registers. r0..3 can be -+# rotated through all 16 elems ra regs can only be rotated through their -+# local 4. As it happens this is what is wanted here as we do not want the -+# constants from the other half of the calc. ++ stride_dst /= sizeof(pixel); ++ width *= 2; + -+# PREREAD is the number of requests that we have sitting in the TMU request -+# queue. -+# -+# There are 8 slots availible in the TMU request Q for tm0s requests, but -+# only 4 output FIFO entries and overflow is bad (corruption or crash) -+# (If threaded then only 2 out FIFO entries, but we aren't.) -+# In s/w we are effectively limited to the min vertical read which is >= 4 -+# so output FIFO is the limit. -+# -+# However in the current world there seems to be no benefit (and a small -+# overhead) in setting this bigger than 2. 
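Stepping back to the SAO code: the band filter above (sao_band_filter and its plaited sao_band_filter_c) classifies each sample by its five most-significant bits (shift = BIT_DEPTH - 5, giving 32 bands) and applies one of four signed offsets anchored at sao_left_class, wrapping modulo 32. A worked sketch of just that lookup for 8-bit samples, outside the FUNC()/pixel template machinery, with illustrative offset values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* 8-bit case: shift = 8 - 5 = 3, so each of the 32 bands spans
     * 8 sample values. Only four consecutive bands (starting at
     * sao_left_class, wrapping at 31) carry a non-zero offset. */
    int offset_table[32] = { 0 };
    const int16_t sao_offset_val[5] = { 0, 4, -2, 1, 3 }; /* illustrative */
    const int sao_left_class = 30;  /* bands 30, 31, 0, 1 get offsets */

    for (int k = 0; k < 4; k++)
        offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];

    const uint8_t sample = 250;                  /* 250 >> 3 = band 31 */
    printf("%d\n", sample + offset_table[sample >> 3]);  /* prints 248 */
    return 0;
}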
++ av_assert0(width <= 64); + -+.set PREREAD, 4 ++ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; ++ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x += 2) { ++ int diff0u = CMP(src[x], src[x + a_stride]); ++ int diff1u = CMP(src[x], src[x + b_stride]); ++ int offset_valu = edge_idx[2 + diff0u + diff1u]; ++ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); ++ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); ++ int offset_valv = edge_idx[2 + diff0v + diff1v]; ++ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]); ++ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]); ++ } ++ src += stride_src; ++ dst += stride_dst; ++ } ++} + -+# Block heights - 8 & 16 are the only numbers we currently support ++// Do once ++#if BIT_DEPTH == 8 ++// Any old 2 byte 'normal' restore will work for these ++#define sao_edge_restore_c_0_8 sao_edge_restore_0_16 ++#define sao_edge_restore_c_1_8 sao_edge_restore_1_16 ++// We need 32 bit for 9 bit+ ++#define sao_edge_restore_c_0_9 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_9 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_10 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_10 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_11 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_11 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_12 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_12 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_13 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_13 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_14 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_14 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_15 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_15 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_16 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_16 sao_edge_restore_1_32 ++#endif + -+.set C_BLK_HEIGHT_8, 16 -+.set C_BLK_HEIGHT_16, 8 -+.set Y_BLK_HEIGHT_8, 16 -+.set Y_BLK_HEIGHT_16, 8 ++#undef CMP + -+# QPU counts - depend on block size -+# If we have a 2-byte format & block_size > 8 then can only afford -+# 8 QPUs -+# These numbers must match the numbers in rpi_shader_cmd.h ++//////////////////////////////////////////////////////////////////////////////// ++// ++//////////////////////////////////////////////////////////////////////////////// ++static void FUNC(put_hevc_pel_pixels)(int16_t *dst, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); + -+.set N_QPU_8, 12 -+.set N_QPU_16, 12 ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = src[x] << (14 - BIT_DEPTH); ++ src += srcstride; ++ dst += MAX_PB_SIZE; ++ } ++} + -+# register allocation -+# ++static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); + -+# ra0-3 -+# Used as temp and may be loop filter coeffs (split into .8s) -+# or temp in loop. Check usage on an individual basis. 
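For the SAO edge filters above (sao_edge_filter and the plaited sao_edge_filter_c), the offset category comes from two three-way comparisons against the neighbours selected by eo: CMP yields -1, 0 or +1, so 2 + diff0 + diff1 indexes edge_idx[] = { 1, 2, 0, 3, 4 } and picks one of the five offset slots, with category 0 meaning "no edge, no offset". A small sketch of just that decision, using an 8-bit sample and its two horizontal neighbours:

#include <stdio.h>

#define CMP(a, b) (((a) > (b)) - ((a) < (b)))

int main(void)
{
    static const unsigned char edge_idx[] = { 1, 2, 0, 3, 4 };

    /* Local minimum against both horizontal neighbours: both CMPs
     * return -1, so we land on edge_idx[0] == 1, i.e. offset slot 1
     * (the "full valley" category). */
    const int left = 9, cur = 5, right = 8;
    const int diff0 = CMP(cur, left);   /* -1 */
    const int diff1 = CMP(cur, right);  /* -1 */
    printf("offset slot %d\n", edge_idx[2 + diff0 + diff1]);  /* 1 */
    return 0;
}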
++ for (y = 0; y < height; y++) { ++ memcpy(dst, src, width * sizeof(pixel)); ++ src += srcstride; ++ dst += dststride; ++ } ++} + -+# ra4-7 -+# C: L0 H filter out FIFO -+# otherwise -- free -- ++static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif + -+# ra8-11 -+# temp in some places - check usage -+# Y: (with rb8-11) horiz out FIFO ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} + -+# ra12-15 -+# -- free -- ++static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif + -+# uniform: width:height -+.set ra_width_height, ra16 -+.set ra_width, ra16.16b -+.set ra_height, ra16.16a ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox); ++ src += srcstride; ++ dst += dststride; ++ } ++} + -+# y:y2 same layout as y_y2_next so we can update both together -+.set ra_y_y2, ra17 -+.set ra_y2, ra17.16a -+.set ra_y, ra17.16b ++static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); + -+# uniform: L1 weight (U on left, V on right) -+# Only used in Y B -+.set ra_wt_off_mul_l1, ra18 -+.set ra_wt_off_l1, ra18.16b -+.set ra_wt_mul_l1, ra18.16a ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; + -+# y_next:y2_next same layout as y_y2 so we can update both together -+.set ra_y_y2_next, ra19 -+.set ra_y_next, ra19.16b -+.set ra_y2_next, ra19.16a ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) { ++ dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1)); ++ } ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} + -+# Setup: consts - subdivide a single register -+.set ra_kff100100, ra20 -+.set ra_k256, ra20.16a -+.set ra_k0, ra20.8a -+.set ra_k1, ra20.8b -+.set ra_k16, ra20.8c -+.set ra_k255, ra20.8d ++//////////////////////////////////////////////////////////////////////////////// ++// 
++//////////////////////////////////////////////////////////////////////////////// ++#define QPEL_FILTER(src, stride) \ ++ (filter[0] * src[x - 3 * stride] + \ ++ filter[1] * src[x - 2 * stride] + \ ++ filter[2] * src[x - stride] + \ ++ filter[3] * src[x ] + \ ++ filter[4] * src[x + stride] + \ ++ filter[5] * src[x + 2 * stride] + \ ++ filter[6] * src[x + 3 * stride] + \ ++ filter[7] * src[x + 4 * stride]) ++ ++static void FUNC(put_hevc_qpel_h)(int16_t *dst, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ dst += MAX_PB_SIZE; ++ } ++} + -+# Loop: xshifts -+.set ra_xshift, ra21.16a -+.set ra_xshift_next, ra21.16b ++static void FUNC(put_hevc_qpel_v)(int16_t *dst, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ dst += MAX_PB_SIZE; ++ } ++} + -+# Loop var: L0 weight (U on left, V on right) -+# _off_ is not used in loop as we want to modify it before use -+.set ra_wt_off_mul_l0, ra22 -+.set ra_wt_mul_l0, ra22.16a -+.set ra_wt_off_l0, ra22.16b ++static void FUNC(put_hevc_qpel_hv)(int16_t *dst, ++ uint8_t *_src, ++ ptrdiff_t _srcstride, ++ int height, intptr_t mx, ++ intptr_t my, int width) ++{ ++ int x, y; ++ const int8_t *filter; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ ++ src -= QPEL_EXTRA_BEFORE * srcstride; ++ filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 0; y < height + QPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } + -+# Max pel value (for 8 bit we can get away with sat ops but not 9+) -+# * Could merge with rb_pmask. 
For 10 bit Logically pmask needs 0xff in the -+# 2nd byte but as the source should never be > 3 there 0x3ff should do -+.set ra_blk_height_pmax, ra23 -+.set ra_pmax, ra23.16a -+.set ra_blk_height, ra23.8c -+# -- free -- ra23.8d ++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6; ++ tmp += MAX_PB_SIZE; ++ dst += MAX_PB_SIZE; ++ } ++} + -+# Loop: src frame base (L0) -+.set ra_base, ra24 ++static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ int shift = 14 - BIT_DEPTH; ++ ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif + -+# Loop: src frame base (L1) -+.set ra_base2, ra25 ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ } ++} + -+# Loop: next src frame base (L0) -+.set ra_base_next, ra26 ++static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); + -+# -- free -- ra27 -+# -- free -- ra28 -+# -- free -- ra29 ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; + -+# Use an even numbered register as a link register to avoid corrupting flags -+.set ra_link, ra30 ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif + -+# -- free -- ra31 ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} + -+.set rb_xshift2, rb0 -+.set rb_xshift2_next, rb1 ++static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ int shift = 14 - BIT_DEPTH; ++ ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif + -+# C: (elem & 1) == 0 ? 
elem * 2 : (elem + 4) * 2 -+.set rb_elem_x, rb2 ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ } ++} + -+# El Flags -+# After adding to self we to have el even/odd on nc/c and lo/hi on nn/n -+.set rb_ef, rb3 + -+# rb4-7 -+# C-B: L1 H filter out FIFO -+# Y: (with ra2.8x) Y vertical filter coeffs ++static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); + -+# rb8-11 -+# C: Vertical filter coeffs -+# Y: (with ra8-11) horiz out FIFO ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; + -+# Loop var: offset to add before shift (round + weighting offsets) -+# Exact value varies by loop -+.set rb_wt_off, rb12 ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif + -+# Setup: denom + 6 + 9 -+.set rb_wt_den_p15, rb13 ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ const int8_t *filter; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 - BIT_DEPTH; ++ ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif + -+# -- free -- rb14 -+# -- free -- rb15 ++ src -= QPEL_EXTRA_BEFORE * srcstride; ++ filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 0; y < height + QPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } + -+# Line pitch (128 for sand128) -+.set rb_pitch, rb16 ++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_qpel_filters[my - 1]; + -+# Loop count - 2 (set up TMU for next xfer) -+.set rb_i_tmu, rb17 ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ } ++} + -+# Loop count for min(height, 16) -+# Y will reset & loop again if height > 16 -+.set rb_lcount, rb18 ++static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ const int8_t *filter; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset 
= 0; ++#endif + -+# frame_base2_next -+.set rb_base2_next, rb19 ++ src -= QPEL_EXTRA_BEFORE * srcstride; ++ filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 0; y < height + QPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } + -+# Setup: Height of Y+C in sand, (x&mask)*xpitch will give -+# offset to the slice -+.set rb_xpitch, rb20 ++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_qpel_filters[my - 1]; + -+# -- free -- rb21 ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} + -+# Setup: 0xff (8-bit) / 0xffff (9+ bit) -+.set rb_pmask, rb22 ++static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, ++ intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif + -+# Loop: destination address -+.set rb_dest, rb23 ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); ++ src += srcstride; ++ dst += dststride; ++ } ++} + -+# vdw_setup_1(dst_pitch) -+.set rb_dma1_base, rb24 ++static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); + -+# Setup: pic width - 1 -+# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc. 
-+.set rb_max_x, rb25 ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; + -+# Loop: height<<23 + width<<16 + vdw_setup_0 -+.set rb_dma0, rb26 ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; + -+# vdw_setup_0 (depends on QPU number) -+.set rb_dma0_base, rb27 ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + ++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} + -+# Setup: vw_setup value to reset VPM write pointer -+.set rb_vpm_init, rb28 ++static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, ++ intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif + -+# Loop: vdw_setup_1(dst_pitch-width) = stride -+.set rb_dma1, rb29 ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); ++ src += srcstride; ++ dst += dststride; ++ } ++} + -+# Setup: pic_height - 1 -+.set rb_max_y, rb30 ++static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); + -+# -- free -- rb31 ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; + ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; + ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + ++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} + ++static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, ++ intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ const int8_t *filter; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif + -+# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc. 
-+.set i_shift16, -16 -+.set i_shift21, -11 -+.set i_shift23, -9 -+.set i_shift30, -2 ++ src -= QPEL_EXTRA_BEFORE * srcstride; ++ filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 0; y < height + QPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } + -+# Much of the setup code is common between Y & C -+# Macros that express this - obviously these can't be overlapped -+# so are probably unsuitable for loop code ++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_qpel_filters[my - 1]; + -+.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma -+ mov r2, qpu_num -+.if v_bit_depth <= 8 -+ # 8 bit version -+ asr r1, r2, 2 -+ shl r1, r1, 6 -+ and r0, r2, 3 -+ or r0, r0, r1 ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ } ++} + -+ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit -+ add r_vpm, r0, r1 # VPM 8bit storage ++static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ const int8_t *filter; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ src -= QPEL_EXTRA_BEFORE * srcstride; ++ filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 0; y < height + QPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } + -+ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later -+ shl r0, r0, 5 ++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_qpel_filters[my - 1]; + -+.else -+ # 16 bit version -+ # Limited to 8 QPUs if blk height > 8 -+ asr r1, r2, 1 -+.if v_blk_height <= 8 -+ shl r1, r1, 4 -+.else -+ shl r1, r1, 5 -+.endif -+ and r0, r2, 1 -+ or r0, r0, r1 ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 + ++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} + -+ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR -+ add r_vpm, r0, r1 ++//////////////////////////////////////////////////////////////////////////////// ++// ++//////////////////////////////////////////////////////////////////////////////// ++#define EPEL_FILTER(src, stride) \ ++ (filter[0] * src[x - stride] + \ ++ filter[1] * src[x] + \ ++ filter[2] * src[x + stride] + \ ++ filter[3] * src[x + 2 * stride]) + -+ # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into -+ # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg) -+ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later -+ shl r0, r0, 6 -+.endif -+ add r_dma, r0, r1 # DMA out 
-+.endm ++static void FUNC(put_hevc_epel_h)(int16_t *dst, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ dst += MAX_PB_SIZE; ++ } ++} + ++static void FUNC(put_hevc_epel_v)(int16_t *dst, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; + -+.macro m_setup_q0 -+ srel -, 12 -+.endm ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ dst += MAX_PB_SIZE; ++ } ++} + -+# Code start label -+::mc_start ++static void FUNC(put_hevc_epel_hv)(int16_t *dst, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; + -+################################################################################ -+# mc_setup_uv(next_kernel, x, y, ref_c_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id) ++ src -= EPEL_EXTRA_BEFORE * srcstride; + -+.macro m_setup_c, v_bit_depth ++ for (y = 0; y < height + EPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } + -+# Cannot use mul24 on x as x might be -ve, so must use shift -+.if v_bit_depth <= 8 -+.set v_x_shift, 1 -+.set v_pmask, 0xff -+.set v_blk_height, C_BLK_HEIGHT_8 -+.else -+.set v_x_shift, 2 -+.set v_pmask, 0xffff -+.set v_blk_height, C_BLK_HEIGHT_16 -+.endif ++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_epel_filters[my - 1]; + -+ mov tmurs, 1 # No swap TMUs ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6; ++ tmp += MAX_PB_SIZE; ++ dst += MAX_PB_SIZE; ++ } ++} + -+# Load first request location -+ mov ra0, unif # next_x_y ++static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int shift = 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif + -+ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+ shl rb_ef, r0, i_shift30 ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ } ++} + -+ mov ra_base, unif # Store frame c base ++static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int 
width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif + -+# Read image dimensions -+ sub r0, unif, 1 # pic c width -+ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes -+ sub rb_max_y, unif, 1 # pic c height ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) { ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); ++ } ++ dst += dststride; ++ src += srcstride; ++ src2 += MAX_PB_SIZE; ++ } ++} + -+# load constants -+ mov ra_kff100100, 0xff100100 -+ mov rb_pmask, v_pmask -+ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; ++ int shift = 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif + -+# get source pitch -+ mov rb_xpitch, unif # stride2 -+ mov rb_pitch, unif # stride1 -+ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly -+ add rb_dma1_base, r1, rb_pitch # vdw_setup_1 ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ } ++} + -+ and r0, 1, elem_num -+ nop ; mul24 r0, r0, 5 -+.if v_bit_depth <= 8 -+ add rb_elem_x, r0, elem_num -+.else -+ add r0, r0, elem_num -+ add rb_elem_x, r0, r0 -+.endif ++static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif + -+# Compute base address for first and second access -+# ra_base ends up with t0s base -+# ra_base2 ends up with t1s base ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); ++ dst += dststride; ++ src += srcstride; ++ src2 += MAX_PB_SIZE; ++ } ++} + -+ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay] -+ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice -+ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y -+ min r0, r0, rb_max_x ++static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = 
ff_hevc_rpi_epel_filters[mx - 1]; ++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif + -+# Get shift -+# Shift will always calculate as 0 for 9+ bit -+# Ideally we can optimize the shift out of the code in these cases but for now -+# it is tidier to leave it in -+.if v_bit_depth <= 8 -+ shl ra_xshift_next, r0, 3 -+.else -+ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 -+.endif ++ src -= EPEL_EXTRA_BEFORE * srcstride; + -+# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to ++ for (y = 0; y < height + EPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } + -+.if v_bit_depth <= 8 -+ and r0, r0, -4 -+.endif -+ sub r1, ra_k0, rb_pitch -+ and r1, r0, r1 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 -+ add ra_base, ra_base, r0 ++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_epel_filters[my - 1]; + -+ add rb_wt_den_p15, 23 - v_bit_depth, unif # denominator ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ } ++} + -+# Compute part of VPM to use for DMA output -+# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop? -+ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base ++static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif + -+# And again for L1, but only worrying about frame2 stuff ++ src -= EPEL_EXTRA_BEFORE * srcstride; + -+# Load first request location -+ mov ra0, unif # next_x_y ++ for (y = 0; y < height + EPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } + -+ mov ra_base2, unif # [ra0 delay] Store frame c base ++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_epel_filters[my - 1]; + -+# Compute base address for first and second access -+# ra_base ends up with t0s base -+# ra_base2 ends up with t1s base ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} + -+ shl r0, ra0.16b, v_x_shift -+ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset -+ max r0, r0, 0 -+ min r0, r0, rb_max_x ++static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ 
pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif + -+# Get shift (already zero if 9+ bit so ignore) -+.if v_bit_depth <= 8 -+ shl rb_xshift2_next, r0, 3 -+.endif ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) { ++ dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); ++ } ++ dst += dststride; ++ src += srcstride; ++ } ++} + -+# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs ++static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + ++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} + -+.if v_bit_depth <= 8 -+ and r0, r0, -4 -+.endif -+ sub r1, ra_k0, rb_pitch -+ and r1, r0, r1 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov r2, ra_y2 -+ add ra_base2, ra_base2, r0 ++static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif + -+# Do preloads -+# r0 = ra_y, r2 = ra_y2 -+ mov r3, PREREAD ; mov r0, ra_y ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) { ++ dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); ++ } ++ dst += dststride; ++ src += srcstride; ++ } ++} + -+:1 -+ sub.setf r3, r3, 1 -+ max r1, r0, 0 -+ min r1, r1, rb_max_y -+ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+ add t0s, ra_base, r1 ; mov ra_y, r0 ++static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; 
y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + ++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} + -+ max r1, r2, 0 -+ brr.anynz -, r:1b -+ min r1, r1, rb_max_y -+ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+ add t1s, ra_base2, r1 ; mov ra_y2, r2 -+# >>> .anynz 1b ++static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif + -+ mov ra_link, unif # link -+# touch registers to keep simulator happy -+ # ra/b4..7: B0 -> B stash registers -+ mov ra4, 0 ; mov rb4, 0 -+ bra -, ra_link -+ mov ra5, 0 ; mov rb5, 0 -+ mov ra6, 0 ; mov rb6, 0 -+ mov ra7, 0 ; mov rb7, 0 -+# >>> ra_link -+.endm ++ src -= EPEL_EXTRA_BEFORE * srcstride; + -+::mc_setup_c_q0 -+ m_setup_q0 -+::mc_setup_c_qn -+ m_setup_c 8 ++ for (y = 0; y < height + EPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } + -+################################################################################ ++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_epel_filters[my - 1]; + -+# mc_filter_uv(next_kernel, x, y, frame_c_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst) ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ } ++} + -+# At this point we have already issued two pairs of texture requests for the current block -+# ra_x, ra_x16_base point to the current coordinates for this block ++static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ src -= EPEL_EXTRA_BEFORE * srcstride; ++ ++ for (y = 0; y < height + EPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } + -+.macro m_filter_c_p, v_tmu, v_bit_depth ++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_epel_filters[my - 1]; + -+.if v_bit_depth <= 8 -+.set v_x_shift, 1 -+.set v_x_mul, 2 -+.set v_v_shift, 8 -+# Shifts to get width & height in the right place in 
rb_dma0 -+.set v_dma_h_shift, 7 -+.set v_dma_wh_shift, i_shift16 -+.else -+.set v_x_shift, 2 -+.set v_x_mul, 4 -+.set v_v_shift, i_shift16 -+# Shifts to get width & height in the right place in rb_dma0 -+.set v_dma_h_shift, 8 -+.set v_dma_wh_shift, 15 -+.endif ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 + ++ ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} + -+.if v_tmu == 0 -+.set vrx_xshift, rb_xshift2 # b side more convenient -+.set vrx_xshift_next, ra_xshift_next -+.set vra_y_next, ra_y_next -+.set vrx_base_next, ra_base_next -+.set vra_y, ra_y -+.set vra_base, ra_base -+.set vr_txs, t0s -+.else -+.set vrx_xshift, ra_xshift # a side more convenient -+.set vrx_xshift_next, rb_xshift2_next -+.set vra_y_next, ra_y2_next -+.set vrx_base_next, rb_base2_next -+.set vra_y, ra_y2 -+.set vra_base, ra_base2 -+.set vr_txs, t1s -+.endif ++ ++// line zero ++#define P3 pix[-4 * xstride] ++#define P2 pix[-3 * xstride] ++#define P1 pix[-2 * xstride] ++#define P0 pix[-1 * xstride] ++#define Q0 pix[0 * xstride] ++#define Q1 pix[1 * xstride] ++#define Q2 pix[2 * xstride] ++#define Q3 pix[3 * xstride] + ++// line three. used only for deblocking decision ++#define TP3 pix[-4 * xstride + 3 * ystride] ++#define TP2 pix[-3 * xstride + 3 * ystride] ++#define TP1 pix[-2 * xstride + 3 * ystride] ++#define TP0 pix[-1 * xstride + 3 * ystride] ++#define TQ0 pix[0 * xstride + 3 * ystride] ++#define TQ1 pix[1 * xstride + 3 * ystride] ++#define TQ2 pix[2 * xstride + 3 * ystride] ++#define TQ3 pix[3 * xstride + 3 * ystride] ++ ++static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix, ++ ptrdiff_t _xstride, ptrdiff_t _ystride, ++ int beta, int *_tc, ++ uint8_t *_no_p, uint8_t *_no_q) ++{ ++ int d, j; ++ pixel *pix = (pixel *)_pix; ++ ptrdiff_t xstride = _xstride / sizeof(pixel); ++ ptrdiff_t ystride = _ystride / sizeof(pixel); + -+# per-channel shifts were calculated on the *previous* invocation -+# get base addresses and per-channel shifts for *next* invocation -+ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y ++ beta <<= BIT_DEPTH - 8; + -+ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base ++ for (j = 0; j < 2; j++) { ++ const int dp0 = abs(P2 - 2 * P1 + P0); ++ const int dq0 = abs(Q2 - 2 * Q1 + Q0); ++ const int dp3 = abs(TP2 - 2 * TP1 + TP0); ++ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); ++ const int d0 = dp0 + dq0; ++ const int d3 = dp3 + dq3; ++ const int tc = _tc[j] << (BIT_DEPTH - 8); ++ const int no_p = _no_p[j]; ++ const int no_q = _no_q[j]; + -+ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0 -+ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height -+ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs -+ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next -+ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a + -+.if v_bit_depth <= 8 -+ shl vrx_xshift_next, r0, 3 -+ and r0, r0, -4 -+.endif -+ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced!
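-+# (A rough C sketch of the and/xor/mul24 stripe-offset idiom used below and
-+# elsewhere in these kernels - a sketch only, assuming rb_pitch is a power of two:
-+#   stripe = x & -pitch;            /* stripe-aligned part of the byte offset */
-+#   within = x ^ stripe;            /* offset within the stripe */
-+#   offset = within + stripe * xpitch;
-+# where xpitch is the stride2 uniform loaded at setup.)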
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs -+ add vrx_base_next, r3, r0 ; mov r1, ra_height ++ if (d0 + d3 >= beta) { ++ pix += 4 * ystride; ++ continue; ++ } else { ++ const int beta_3 = beta >> 3; ++ const int beta_2 = beta >> 2; ++ const int tc25 = ((tc * 5 + 1) >> 1); + -+# set up VPM write -+ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight -+ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height -+ add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight ++ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && ++ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && ++ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { ++ // strong filtering ++ const int tc2 = tc << 1; ++ for (d = 0; d < 4; d++) { ++ const int p3 = P3; ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ const int q3 = Q3; ++ if (!no_p) { ++ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); ++ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); ++ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); ++ } ++ if (!no_q) { ++ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2); ++ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); ++ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); ++ } ++ pix += ystride; ++ } ++ } else { // normal filtering ++ int nd_p = 1; ++ int nd_q = 1; ++ const int tc_2 = tc >> 1; ++ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) ++ nd_p = 2; ++ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) ++ nd_q = 2; + -+# ; unpack filter coefficients ++ for (d = 0; d < 4; d++) { ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; ++ if (abs(delta0) < 10 * tc) { ++ delta0 = av_clip(delta0, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ if (!no_p && nd_p > 1) { ++ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); ++ P1 = av_clip_pixel(p1 + deltap1); ++ } ++ if (!no_q && nd_q > 1) { ++ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); ++ Q1 = av_clip_pixel(q1 + deltaq1); ++ } ++ } ++ pix += ystride; ++ } ++ } ++ } ++ } ++} + -+ shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a -+ add r0, r0, r2 ; mov rb9, ra3.8b # Combine width and height of destination area (r0=h<<8, r2=w*2) -+ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register -+ add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight + -+ mov rb_dest, unif ; mov ra9, rb_max_y # dst_addr ; alias rb_max_y + -+ shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d ++static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride, ++ ptrdiff_t _ystride, int *_tc, ++ uint8_t *_no_p, uint8_t *_no_q) ++{ ++ int d, j, no_p, no_q; ++ pixel *pix = (pixel *)_pix; ++ ptrdiff_t xstride = _xstride / sizeof(pixel); ++ ptrdiff_t ystride = _ystride / sizeof(pixel); + -+ asr rb_wt_off, r1, 2 ; mov ra_link, unif # ; Link -+ sub ra3, rb_wt_den_p15, ra_k1 + -+# r5 = 0 (loop counter) -+# ra9 = alias for rb_max_y -+# ra_wt_mul_l0 = weight L0 -+# ra3 = weight denom + 22 - bit_depth [= rb_wt_den_p15 - 1, max 19] -+# rb_wt_off = (offset * 2 + 1) << (ra3 - 1) + -+# We want (r0r1) -+# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2V5 : ... -+# We fetch (after shift) -+# C0 : C3 : C1 : C4 : C2 : C5 : ... ++ for (j = 0; j < 2; j++) { ++ const int tc = _tc[j] << (BIT_DEPTH - 8); ++ if (tc <= 0) { ++ pix += 4 * ystride; ++ continue; ++ } ++ no_p = _no_p[j]; ++ no_q = _no_q[j]; + -+:1 -+# retrieve texture results and pick out bytes -+# then submit two more texture requests ++ for (d = 0; d < 4;
d++) { ++ int delta0; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ pix += ystride; ++ } ++ } ++} + -+.if v_tmu == 0 -+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment -+ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next -+ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next -+.else -+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 # loop counter increment -+ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next -+ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y -+ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next -+.endif ++static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, ++ int32_t *tc, uint8_t *no_p, ++ uint8_t *no_q) ++{ ++ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q); ++} + -+ add vra_y, r3, ra_k1 ; mov r0, r1 << 15 -+ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 -+ min r3, r3, ra9 ; mov.ifnc r0, r2 ++static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, ++ int32_t *tc, uint8_t *no_p, ++ uint8_t *no_q) ++{ ++ FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q); ++} + -+ mov ra4, ra5 ; mul24 r2, r3, rb_pitch -+ add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte ++static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, ++ int beta, int32_t *tc, uint8_t *no_p, ++ uint8_t *no_q) ++{ ++ FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel), ++ beta, tc, no_p, no_q); ++} + -+# apply horizontal filter -+# The filter coeffs for the two halves of this are the same (unlike in the -+# Y case) so it doesn't matter which ra0 we get them from -+# Also as the two halves are locked together we don't need to separate the 1st -+# r0 mul or the last r1 mul as they are valid for all QPUs ++static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, ++ int beta, int32_t *tc, uint8_t *no_p, ++ uint8_t *no_q) ++{ ++ FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride, ++ beta, tc, no_p, no_q); ++} + ++#undef P3 ++#undef P2 ++#undef P1 ++#undef P0 ++#undef Q0 ++#undef Q1 ++#undef Q2 ++#undef Q3 + ++#undef TP3 ++#undef TP2 ++#undef TP1 ++#undef TP0 ++#undef TQ0 ++#undef TQ1 ++#undef TQ2 ++#undef TQ3 + ++// line zero ++#define P3 pix_l[0 * xstride] ++#define P2 pix_l[1 * xstride] ++#define P1 pix_l[2 * xstride] ++#define P0 pix_l[3 * xstride] ++#define Q0 pix_r[0 * xstride] ++#define Q1 pix_r[1 * xstride] ++#define Q2 pix_r[2 * xstride] ++#define Q3 pix_r[3 * xstride] + ++// line three.
used only for deblocking decision ++#define TP3 pix_l[0 * xstride + 3 * ystride] ++#define TP2 pix_l[1 * xstride + 3 * ystride] ++#define TP1 pix_l[2 * xstride + 3 * ystride] ++#define TP0 pix_l[3 * xstride + 3 * ystride] ++#define TQ0 pix_r[0 * xstride + 3 * ystride] ++#define TQ1 pix_r[1 * xstride + 3 * ystride] ++#define TQ2 pix_r[2 * xstride + 3 * ystride] ++#define TQ3 pix_r[3 * xstride + 3 * ystride] + -+ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 -+ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 -+ sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 ++// This is identical to hevc_loop_filter_luma except that the P/Q ++// components are on separate pointers ++static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, const int32_t _tc[2], ++ const uint8_t _no_p[2], const uint8_t _no_q[2], ++ uint8_t * _pix_l) ++{ ++ int d, j; ++ pixel *pix_l = (pixel *)_pix_l; ++ pixel *pix_r = (pixel *)_pix_r; ++ const ptrdiff_t xstride = 1; ++ const ptrdiff_t ystride = _stride / sizeof(pixel); + -+# V filter = -ra4 * rb8 + ra5 * rb9 + ra6 * rb10 - ra7 * rb11 (post FIFO shift) -+# Have to dup block as we need to move the brr - code is more common than it -+# looks at first glance -+.if v_bit_depth <= 8 -+ brr.anyn -, r:1b -+ add r2, r2, r3 ; mov ra5, ra6 -+ mov ra6, ra7 ; mul24 r1, ra7, rb10 -+ sub ra7, r2, r0 ; mul24 r0, ra4, rb8 -+.else -+ add r2, r2, r3 ; mov ra5, ra6 -+ brr.anyn -, r:1b -+ mov ra6, ra7 ; mul24 r1, ra7, rb10 -+ sub r2, r2, r0 ; mul24 r0, ra4, rb8 -+ asr ra7, r2, v_bit_depth - 8 -+.endif -+# >>> .anyn 1b ++ beta <<= BIT_DEPTH - 8; + -+ sub r1, r1, r0 ; mul24 r0, ra5, rb9 # [ra7 delay] -+ add r1, r1, r0 ; mul24 r0, ra7, rb11 -+ sub r1, r1, r0 -+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 -+ asr r1, r1, 14 -+ nop ; mul24 r1, r1, ra_wt_mul_l0 -+ shl r1, r1, 8 ; mov r3, ra_blk_height -+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 -+ brr.anyn -, r:1b -+ asr r1, r1, ra3 -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+# >>> .anyn 1b ++ for (j = 0; j < 2; j++) { ++ const int dp0 = abs(P2 - 2 * P1 + P0); ++ const int dq0 = abs(Q2 - 2 * Q1 + Q0); ++ const int dp3 = abs(TP2 - 2 * TP1 + TP0); ++ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); ++ const int d0 = dp0 + dq0; ++ const int d3 = dp3 + dq3; ++ const int tc = _tc[j] << (BIT_DEPTH - 8); ++ const int no_p = _no_p[j]; ++ const int no_q = _no_q[j]; + -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height (currently always 16) ++ if (d0 + d3 >= beta) { ++ pix_l += 4 * ystride; ++ pix_r += 4 * ystride; ++ continue; ++ } else { ++ const int beta_3 = beta >> 3; ++ const int beta_2 = beta >> 2; ++ const int tc25 = ((tc * 5 + 1) >> 1); + -+# If looping again then we consumed 16 height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc rb_dma0, rb_lcount based on new segment height ++ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && ++ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && ++ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { ++ // strong filtering ++ const int tc2 = tc << 1; ++ for (d = 0; d < 4; d++) { ++ const int p3 = P3; ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int
q2 = Q2; ++ const int q3 = Q3; ++ if (!no_p) { ++ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); ++ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); ++ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); ++ } ++ if (!no_q) { ++ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2); ++ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); ++ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); ++ } ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } else { // normal filtering ++ int nd_p = 1; ++ int nd_q = 1; ++ const int tc_2 = tc >> 1; ++ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) ++ nd_p = 2; ++ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) ++ nd_q = 2; ++ ++ for (d = 0; d < 4; d++) { ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; ++ if (abs(delta0) < 10 * tc) { ++ delta0 = av_clip(delta0, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ if (!no_p && nd_p > 1) { ++ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); ++ P1 = av_clip_pixel(p1 + deltap1); ++ } ++ if (!no_q && nd_q > 1) { ++ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); ++ Q1 = av_clip_pixel(q1 + deltaq1); ++ } ++ } ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } ++ } ++ } ++} + -+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++#undef TP3 ++#undef TP2 ++#undef TP1 ++#undef TP0 ++#undef TQ0 ++#undef TQ1 ++#undef TQ2 ++#undef TQ3 + -+# DMA out -+ bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride -+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW -+ shl r1, r1, i_shift23 -+# >>> .anyz ra_link ++#undef P3 ++#undef P2 ++#undef P1 ++#undef P0 ++#undef Q0 ++#undef Q1 ++#undef Q2 ++#undef Q3 + -+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve -+# We add to dma0 to reduce the number of output lines in the final block -+ add rb_lcount, rb_lcount, r0 -+ brr -, r:1b -+ add rb_dma0, rb_dma0, r1 -+ add rb_dest, rb_dest, r2 -+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer -+# >>> 1b -+.endm ++#define P1 pix_l[0 * xstride] ++#define P0 pix_l[1 * xstride] ++#define Q0 pix_r[0 * xstride] ++#define Q1 pix_r[1 * xstride] + -+# At 10 bits -+# Worst case +ve after 1st filter = 74 * 0x3ff >> 2 = 18925 0x49ed (15 bits) -+# Worst case -ve after 1st filter = -10 * 0x3ff >> 2 = -10230 -+# after 2nd (really we can't get this) = 74 * 18925 + 10 * 10230 >> 6 = 23480 = 0x5bb8 (15 bits) -+# (P) -+# * weight (255) = 5987400 = 0x5b5c48 (23 bits) -+# + 0x3ff << (13 - bit_depth + 7) = 0x6b5848 (23 bits) -+# ... should be OK -+# -+# (B) -+# *2 (L0+L1) = 5963920 = 0x5b0090 (23 bits) -+# + (offset * 2 + 1) << (15 - bit_depth + 7) = 5963920 + (0x3ff << 12) = 5963920 + 4190208 = 10154128 = 0x9af090 (24 bits) -+# So signed overflow if we sign extend here :-( -+# -+# In practice this doesn't happen (we need a maximal offset and a very unlucky -+# filter). -+# -+# This could be fixed by offsetting the filters s.t. 
they are unsigned until -+# weight mul and then removing the offset with the weighting offset (I think -+# this should work) or splitting the rounding & offsetting ++static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride, ++ ptrdiff_t _ystride, const int32_t *_tc, ++ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r) ++{ ++ int d, j, no_p, no_q; ++ pixel *pix_l = (pixel *)_pix_l; ++ pixel *pix_r = (pixel *)_pix_r; ++ ptrdiff_t xstride = _xstride / sizeof(pixel); ++ ptrdiff_t ystride = _ystride / sizeof(pixel); + -+::mc_filter_c_p -+ m_filter_c_p 0, 8 ++ for (j = 0; j < 2; j++) { ++ const int tc = _tc[j] << (BIT_DEPTH - 8); ++ if (tc <= 0) { ++ pix_l += 4 * ystride; ++ pix_r += 4 * ystride; ++ continue; ++ } ++ no_p = _no_p[j]; ++ no_q = _no_q[j]; + -+::mc_filter_c_p_l1 -+ m_filter_c_p 1, 8 ++ for (d = 0; d < 4; d++) { ++ int delta0; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } ++} + -+################################################################################ ++static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4, ++ unsigned int no_f) ++{ ++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; ++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; ++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; ++ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q); ++ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q); ++} + -+# mc_filter_c_b ++static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f) ++{ ++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; ++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; ++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; ++ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r); ++ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel)); ++} + -+# At this point we have already issued two pairs of texture requests for the current block -+# ra_x, ra_x16_base point to the current coordinates for this block ++#undef P1 ++#undef P0 ++#undef Q0 ++#undef Q1 + -+.macro m_filter_c_b, v_bit_depth +diff --git a/libavcodec/rpi_hevcpred.c b/libavcodec/rpi_hevcpred.c +new file mode 100644 +index 0000000000..f6db76482d +--- /dev/null ++++ b/libavcodec/rpi_hevcpred.c +@@ -0,0 +1,122 @@ ++/* ++ * HEVC video Decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + -+.if v_bit_depth <= 8 -+.set v_x_shift, 1 -+.set v_v_shift, 8 -+# Shifts to get width & height in the right place in rb_dma0 -+.set v_dma_h_shift, 7 -+.set v_dma_wh_shift, i_shift16 -+.else -+.set v_x_shift, 2 -+.set v_v_shift, i_shift16 -+# Shifts to get width & height in the right place in rb_dma0 -+.set v_dma_h_shift, 8 -+.set v_dma_wh_shift, 15 -+.endif -+.set v_x_mul, (1 << v_x_shift) ++#include "rpi_hevcdec.h" + -+# per-channel shifts were calculated on the *previous* invocation ++#include "rpi_hevcpred.h" + -+# get base addresses and per-channel shifts for *next* invocation -+ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y ++#define PRED_C 0 ++#define BIT_DEPTH 8 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH + -+ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base ++#define BIT_DEPTH 9 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH + -+ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0 -+ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a -+ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height -+ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next -+ min r0, r0, rb_max_x ; mov ra0, unif # L0 H filter coeffs ++#define BIT_DEPTH 10 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH + -+.if v_bit_depth <= 8 -+ shl ra_xshift_next, r0, 3 -+.endif ++#define BIT_DEPTH 12 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++#undef PRED_C + -+ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs -+ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs) -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height -+ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B ++#define PRED_C 1 ++#define BIT_DEPTH 8 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH + -+# set up VPM write ++#define BIT_DEPTH 9 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH + -+ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight -+ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height -+ add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif # ; V weight ++#define BIT_DEPTH 10 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH + -+ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2 -+ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base -+ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register -+ add rb_dma0, r0, rb_dma0_base ; mov ra1, unif # ; H filter coeffs ++#define BIT_DEPTH 12 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++#undef PRED_C + -+# L1 - uniform layout could possibly be optimized ++void ff_hevc_rpi_pred_init(HEVCPredContext *hpc, int bit_depth) ++{ ++#undef FUNC ++#define FUNC(a, depth) a ## _ ## depth + -+ shl r0, ra3.16b, v_x_shift # r0=x*2 -+ add r0, r0, rb_elem_x ; mov ra3, unif # ; V filter coeffs -+ sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif # [ra3 delay] r1=pitch2 mask ; U offset/weight -+ max r0, r0, r5 ; mov rb8, ra3.8a # ; start unpacking filter coeffs -+ min r0, r0, rb_max_x ; mov rb9, ra3.8b ++#undef FUNCC ++#define FUNCC(a, depth) a ## _ ## depth ## _c + -+.if v_bit_depth <= 8 -+ shl rb_xshift2_next, r0, 3 -+.endif ++#define 
HEVC_PRED_Y(depth) \ ++ hpc->intra_pred[0] = FUNC(intra_pred_2, depth); \ ++ hpc->intra_pred[1] = FUNC(intra_pred_3, depth); \ ++ hpc->intra_pred[2] = FUNC(intra_pred_4, depth); \ ++ hpc->intra_pred[3] = FUNC(intra_pred_5, depth); \ ++ hpc->pred_planar[0] = FUNC(pred_planar_0, depth); \ ++ hpc->pred_planar[1] = FUNC(pred_planar_1, depth); \ ++ hpc->pred_planar[2] = FUNC(pred_planar_2, depth); \ ++ hpc->pred_planar[3] = FUNC(pred_planar_3, depth); \ ++ hpc->pred_dc = FUNC(pred_dc, depth); \ ++ hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \ ++ hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \ ++ hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \ ++ hpc->pred_angular[3] = FUNC(pred_angular_3, depth); + -+ and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif # ; V offset/weight -+ and r1, r0, r1 ; mov rb10, ra3.8c -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov rb_dest, unif # Add stripe offsets ; dst_addr -+ add rb_base2_next, r3, r0 ++#define HEVC_PRED_C(depth) \ ++ hpc->intra_pred_c[0] = FUNCC(intra_pred_2, depth); \ ++ hpc->intra_pred_c[1] = FUNCC(intra_pred_3, depth); \ ++ hpc->intra_pred_c[2] = FUNCC(intra_pred_4, depth); \ ++ hpc->intra_pred_c[3] = FUNCC(intra_pred_5, depth); \ ++ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \ ++ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \ ++ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \ ++ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \ ++ hpc->pred_dc_c = FUNCC(pred_dc, depth); \ ++ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \ ++ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \ ++ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \ ++ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); + -+ mov ra9, rb_max_y ; mov rb11, ra3.8d -+ shl r1, ra_wt_off_l1, rb_wt_den_p15 -+ asr rb_wt_off, r1, 9 ; mov ra_link, unif # link ++#define HEVC_PRED(depth) \ ++ HEVC_PRED_Y(depth); \ ++ HEVC_PRED_C(depth); + -+# r5 loop counter -+# ra0 H coeffs L0 -+# ra1 H coeffs L1 -+# ra2 V coeffs L0 -+# ra3 temp -+# ra4-7 L0 H FIFO -+# rb4-7 L1 H FIFO -+# rb8-rb11 V coeffs L1 -+# ra9 rb_max_y alias ++ switch (bit_depth) { ++ case 9: ++ HEVC_PRED(9); ++ break; ++ case 10: ++ HEVC_PRED(10); ++ break; ++ case 12: ++ HEVC_PRED(12); ++ break; ++ default: ++ HEVC_PRED(8); ++ break; ++ } + -+:1 -+# retrieve texture results and pick out bytes -+# then submit two more texture requests -+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment -+ shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next -+ shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next -+ add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next -+ add ra_y, 1, ra_y ; mov r3, ra_y ++ if (ARCH_MIPS) ++ ff_hevc_rpi_pred_init_mips(hpc, bit_depth); ++} +diff --git a/libavcodec/rpi_hevcpred.h b/libavcodec/rpi_hevcpred.h +new file mode 100644 +index 0000000000..03c6eb3295 +--- /dev/null ++++ b/libavcodec/rpi_hevcpred.h +@@ -0,0 +1,57 @@ ++/* ++ * HEVC video Decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. 
++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + ++#ifndef AVCODEC_RPI_HEVCPRED_H ++#define AVCODEC_RPI_HEVCPRED_H + ++#include <stddef.h> ++#include <stdint.h> ++#include "config.h" + ++struct HEVCRpiContext; ++struct HEVCRpiLocalContext; + ++typedef struct HEVCPredContext { ++ void (*intra_pred[4])(const struct HEVCRpiContext * const s, struct HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx); + ++ void (*pred_planar[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride); ++ void (*pred_dc)(uint8_t *src, const uint8_t *top, const uint8_t *left, ++ ptrdiff_t stride, int log2_size, int c_idx); ++ void (*pred_angular[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int c_idx, int mode); ++ void (*intra_pred_c[4])(const struct HEVCRpiContext * const s, struct HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx); + ++ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride); ++ void (*pred_dc_c)(uint8_t *src, const uint8_t *top, const uint8_t *left, ++ ptrdiff_t stride, int log2_size, int c_idx); ++ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int c_idx, int mode); ++} HEVCPredContext; + ++void ff_hevc_rpi_pred_init(HEVCPredContext *hpc, int bit_depth); ++void ff_hevc_rpi_pred_init_mips(HEVCPredContext *hpc, int bit_depth); + ++#endif /* AVCODEC_RPI_HEVCPRED_H */ +diff --git a/libavcodec/rpi_hevcpred_template.c b/libavcodec/rpi_hevcpred_template.c +new file mode 100644 +index 0000000000..4ee776f955 +--- /dev/null ++++ b/libavcodec/rpi_hevcpred_template.c +@@ -0,0 +1,850 @@ ++/* ++ * HEVC video decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * ++ * This file is part of FFmpeg.
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + -+ and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0 -+ nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+ sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1 -+# V filters - start in branch delay slots of H -+# Final asr not needed for 8-bit but we can#t (currently) save a whole instruction -+ add r2, r2, r3 ; mul24 r1, rb5, ra2.8b -+ brr.anyn -, r:1b -+ mov ra6, ra7 ; mul24 r3, ra7, rb10 -+ sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a -+ asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3 -+# >>> .anyn 1b ++#include "config.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/rpi_sand_fns.h" ++#include "bit_depth_template.c" + -+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c # [rb7 delay] -+ add r1, r1, r0 ; mul24 r0, rb7, ra2.8d -+ sub r2, r1, r0 ; mul24 r0, ra4, rb8 -+ sub r1, r3, r0 ; mul24 r0, ra5, rb9 -+ add r1, r1, r0 ; mul24 r0, ra7, rb11 -+ sub r1, r1, r0 ; mul24 r2, r2, ra_k256 ++#include "rpi_hevcdec.h" ++#include "rpi_hevcpred.h" + -+ asr r2, r2, 14 ; mul24 r1, r1, ra_k256 -+ asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0 + -+ add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 # rb_wt_off = (offsetL0 + offsetL1 + 1) << (rb_wt_den_p15 - 9) -+ add r1, r1, r2 ; mov r3, ra_blk_height ++#define DUMP_PRED 0 + -+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend ++#define POS(x, y) src[(x) + stride * (y)] + -+ brr.anyn -, r:1b -+ asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3 -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+# >>> .anyn 1b ++// INCLUDED_ONCE defined at EOF ++#ifndef INCLUDED_ONCE ++typedef uint8_t (* c8_dst_ptr_t)[2]; ++typedef const uint8_t (* c8_src_ptr_t)[2]; ++typedef uint16_t (* c16_dst_ptr_t)[2]; ++typedef const uint16_t (* c16_src_ptr_t)[2]; + -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height (currently always 16) ++// *** On ARM make these NEON registers ++typedef struct pixel4_16 { ++ uint16_t x[4]; ++} pixel4_16; ++typedef struct pixel4_32 { ++ uint32_t x[4]; ++} pixel4_32; ++static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x) ++{ ++ pixel4_16 t = {{x, x, x, x}}; ++ return t; ++} ++static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x) ++{ ++ pixel4_32 t = {{x, x, x, x}}; ++ return t; ++} ++#endif + -+# If looping again then we consumed 16 height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc rb_dma0, rb_lcount based on new segment height ++#if PRED_C ++// For chroma we double pixel size so we copy pairs ++#undef pixel ++#undef pixel2 ++#undef pixel4 ++#undef dctcoef 
++#undef INIT_CLIP ++#undef no_rnd_avg_pixel4 ++#undef rnd_avg_pixel4 ++#undef AV_RN2P ++#undef AV_RN4P ++#undef AV_RN4PA ++#undef AV_WN2P ++#undef AV_WN4P ++#undef AV_WN4PA ++#undef CLIP ++#undef FUNC ++#undef FUNCC ++#undef av_clip_pixel ++#undef PIXEL_SPLAT_X4 + ++#if BIT_DEPTH == 8 ++#define pixel uint16_t ++#define pixel4 pixel4_16 ++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16 ++#define cpel uint8_t ++#define c_src_ptr_t c8_src_ptr_t ++#define c_dst_ptr_t c8_dst_ptr_t ++#else ++#define pixel uint32_t ++#define pixel4 pixel4_32 ++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32 ++#define cpel uint16_t ++#define c_src_ptr_t c16_src_ptr_t ++#define c_dst_ptr_t c16_dst_ptr_t ++#endif ++#define AV_RN4P(p) (*(pixel4*)(p)) ++#define AV_WN4P(p,x) (*(pixel4*)(p) = (x)) ++#define FUNC(a) FUNC2(a, BIT_DEPTH, _c) ++#endif + + ++// Get PW prior to horrid PRED_C trickery ++#if BIT_DEPTH == 8 ++#define PW 1 ++#else ++#define PW 2 ++#endif + + ++#if DUMP_PRED && !defined(INCLUDED_ONCE) ++static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) ++{ ++ for (unsigned int y = 0; y != size; y++, data += stride * 2) { ++ for (unsigned int x = 0; x != size; x++) { ++ printf("%4d", data[x * 2]); ++ } ++ printf("\n"); ++ } ++ printf("\n"); ++} ++#endif + ++static av_always_inline void FUNC(intra_pred)(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, ++ int log2_size, int c_idx_arg) ++{ ++#define PU(x) \ ++ ((x) >> s->ps.sps->log2_min_pu_size) ++#define MVF(x, y) \ ++ (s->ref->tab_mvf[(x) + (y) * min_pu_width]) ++#define MVF_PU(x, y) \ ++ MVF(PU(x0 + ((x) * (1 << hshift))), PU(y0 + ((y) * (1 << vshift)))) ++#define IS_INTRA(x, y) \ ++ (MVF_PU(x, y).pred_flag == PF_INTRA) ++#define MIN_TB_ADDR_ZS(x, y) \ ++ s->ps.pps->min_tb_addr_zs[(y) * (s->ps.sps->tb_mask+2) + (x)] ++#define EXTEND(ptr, val, len) \ ++do { \ ++ pixel4 pix = PIXEL_SPLAT_X4(val); \ ++ for (i = 0; i < (len); i += 4) \ ++ AV_WN4P(ptr + i, pix); \ ++} while (0) ++ ++#define EXTEND_RIGHT_CIP(ptr, start, length) \ ++ for (i = start; i < (start) + (length); i += 4) \ ++ if (!IS_INTRA(i, -1)) \ ++ AV_WN4P(&ptr[i], a); \ ++ else \ ++ a = PIXEL_SPLAT_X4(ptr[i+3]) ++#define EXTEND_LEFT_CIP(ptr, start, length) \ ++ for (i = start; i > (start) - (length); i--) \ ++ if (!IS_INTRA(i - 1, -1)) \ ++ ptr[i - 1] = ptr[i] ++#define EXTEND_UP_CIP(ptr, start, length) \ ++ for (i = (start); i >
(start) - (length); i -= 4) \ ++ if (!IS_INTRA(-1, i - 3)) \ ++ AV_WN4P(&ptr[i - 3], a); \ ++ else \ ++ a = PIXEL_SPLAT_X4(ptr[i - 3]) ++#define EXTEND_DOWN_CIP(ptr, start, length) \ ++ for (i = start; i < (start) + (length); i += 4) \ ++ if (!IS_INTRA(-1, i)) \ ++ AV_WN4P(&ptr[i], a); \ ++ else \ ++ a = PIXEL_SPLAT_X4(ptr[i + 3]) ++ // c_idx will always be 1 for _c versions and 0 for y ++ const unsigned int c_idx = PRED_C; ++ int i; ++ const unsigned int hshift = ctx_hshift(s, c_idx); ++ const unsigned int vshift = ctx_vshift(s, c_idx); ++ int size = (1 << log2_size); ++ int size_in_luma_h = size << hshift; ++ int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size; ++ int size_in_luma_v = size << vshift; ++ int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size; ++ const int x = x0 >> hshift; ++ const int y = y0 >> vshift; ++ int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; ++ int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; + ++ int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb); + ++ const ptrdiff_t stride = frame_stride1(s->frame, c_idx) / sizeof(pixel); ++ pixel *const src = c_idx == 0 ? ++ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) : ++ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y); + ++ int min_pu_width = s->ps.sps->min_pu_width; + ++ const enum IntraPredMode mode = c_idx ?
lc->tu.intra_pred_mode_c : ++ lc->tu.intra_pred_mode; ++ pixel4 a; ++ pixel left_array[2 * MAX_TB_SIZE + 1]; ++#if !PRED_C ++ pixel filtered_left_array[2 * MAX_TB_SIZE + 1]; ++#endif ++ pixel top_array[2 * MAX_TB_SIZE + 1]; ++#if !PRED_C ++ pixel filtered_top_array[2 * MAX_TB_SIZE + 1]; ++#endif + -+ sacq -, n_sem_sync -+ sacq -, n_sem_sync -+ sacq -, n_sem_sync -+ bra -, ra_link -+ sacq -, n_sem_quad_in -+ srel -, n_sem_out -+ srel -, n_sem_quad_out ++ pixel *left = left_array + 1; ++ pixel *top = top_array + 1; ++#if !PRED_C ++ pixel *filtered_left = filtered_left_array + 1; ++ pixel *filtered_top = filtered_top_array + 1; ++#endif ++ int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask); ++ int cand_left = lc->na.cand_left; ++ int cand_up_left = lc->na.cand_up_left; ++ int cand_up = lc->na.cand_up; ++ int cand_up_right = lc->na.cand_up_right && cur_tb_addr > MIN_TB_ADDR_ZS((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask, y_tb - 1); + -+.else -+ bra -, ra_link -+ srel -, n_sem_sync -+ sacq -, n_sem_in -+.if n_sem_out % 4 != 0 -+ srel -, n_sem_out -+.else -+ nop -+.endif -+.endif -+.endif -+.endm ++ int bottom_left_size = (FFMIN(y0 + 2 * size_in_luma_v, s->ps.sps->height) - ++ (y0 + size_in_luma_v)) >> vshift; ++ int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) - ++ (x0 + size_in_luma_h)) >> hshift; + -+.set v_quads8, N_QPU_8 / 4 ++ pixel * src_l = src - 1; ++ pixel * src_u = src - stride; ++ pixel * src_ur = src_u + size; + -+::mc_sync_q0 -+ m_sync_q 0, v_quads8 -+::mc_sync_q1 -+ m_sync_q 1, v_quads8 -+::mc_sync_q2 -+ m_sync_q 2, v_quads8 -+::mc_sync_q3 -+ m_sync_q 3, v_quads8 -+::mc_sync_q4 -+ m_sync_q 4, v_quads8 -+::mc_sync_q5 -+ m_sync_q 5, v_quads8 -+::mc_sync_q6 -+ m_sync_q 6, v_quads8 -+::mc_sync_q7 -+ m_sync_q 7, v_quads8 -+::mc_sync_q8 -+ m_sync_q 8, v_quads8 -+::mc_sync_q9 -+ m_sync_q 9, v_quads8 -+::mc_sync_q10 -+ m_sync_q 10, v_quads8 -+::mc_sync_q11 -+ m_sync_q 11, v_quads8 ++ { ++ // N.B. 
stride is in pixels (not bytes) or in the case of chroma pixel-pairs ++ const AVFrame * const frame = s->frame; ++ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2 ++ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride; ++ if ((x & mask) == 0) ++ src_l -= stripe_adj; ++ if (((x + size) & mask) == 0) ++ src_ur += stripe_adj; ++ } + -+# mc_exit() -+# Chroma & Luma the same now ++ if (s->ps.pps->constrained_intra_pred_flag == 1) { ++ int size_in_luma_pu_v = PU(size_in_luma_v); ++ int size_in_luma_pu_h = PU(size_in_luma_h); ++ int on_pu_edge_x = !av_mod_uintp2(x0, s->ps.sps->log2_min_pu_size); ++ int on_pu_edge_y = !av_mod_uintp2(y0, s->ps.sps->log2_min_pu_size); ++ if (!size_in_luma_pu_h) ++ size_in_luma_pu_h++; ++ if (cand_bottom_left == 1 && on_pu_edge_x) { ++ int x_left_pu = PU(x0 - 1); ++ int y_bottom_pu = PU(y0 + size_in_luma_v); ++ int max = FFMIN(size_in_luma_pu_v, s->ps.sps->min_pu_height - y_bottom_pu); ++ cand_bottom_left = 0; ++ for (i = 0; i < max; i += 2) ++ cand_bottom_left |= (MVF(x_left_pu, y_bottom_pu + i).pred_flag == PF_INTRA); ++ } ++ if (cand_left == 1 && on_pu_edge_x) { ++ int x_left_pu = PU(x0 - 1); ++ int y_left_pu = PU(y0); ++ int max = FFMIN(size_in_luma_pu_v, s->ps.sps->min_pu_height - y_left_pu); ++ cand_left = 0; ++ for (i = 0; i < max; i += 2) ++ cand_left |= (MVF(x_left_pu, y_left_pu + i).pred_flag == PF_INTRA); ++ } ++ if (cand_up_left == 1) { ++ int x_left_pu = PU(x0 - 1); ++ int y_top_pu = PU(y0 - 1); ++ cand_up_left = MVF(x_left_pu, y_top_pu).pred_flag == PF_INTRA; ++ } ++ if (cand_up == 1 && on_pu_edge_y) { ++ int x_top_pu = PU(x0); ++ int y_top_pu = PU(y0 - 1); ++ int max = FFMIN(size_in_luma_pu_h, s->ps.sps->min_pu_width - x_top_pu); ++ cand_up = 0; ++ for (i = 0; i < max; i += 2) ++ cand_up |= (MVF(x_top_pu + i, y_top_pu).pred_flag == PF_INTRA); ++ } ++ if (cand_up_right == 1 && on_pu_edge_y) { ++ int y_top_pu = PU(y0 - 1); ++ int x_right_pu = PU(x0 + size_in_luma_h); ++ int max = FFMIN(size_in_luma_pu_h, s->ps.sps->min_pu_width - x_right_pu); ++ cand_up_right = 0; ++ for (i = 0; i < max; i += 2) ++ cand_up_right |= (MVF(x_right_pu + i, y_top_pu).pred_flag == PF_INTRA); ++ } ++ memset(left, 128, 2 * MAX_TB_SIZE*sizeof(pixel)); ++ memset(top , 128, 2 * MAX_TB_SIZE*sizeof(pixel)); ++ top[-1] = 128; ++ } ++ if (cand_up_left) { ++ left[-1] = src_l[-stride]; ++ top[-1] = left[-1]; ++ } ++ if (cand_up) ++ // Always good - even with sand ++ memcpy(top, src_u, size * sizeof(pixel)); ++ if (cand_up_right) { ++ memcpy(top + size, src_ur, top_right_size * sizeof(pixel)); ++ EXTEND(top + size + top_right_size, top[size + top_right_size - 1], ++ size - top_right_size); ++ } ++ if (cand_left) ++ for (i = 0; i < size; i++) ++ left[i] = src_l[stride * i]; ++ if (cand_bottom_left) { ++ for (i = size; i < size + bottom_left_size; i++) ++ left[i] = src_l[stride * i]; ++ EXTEND(left + size + bottom_left_size, left[size + bottom_left_size - 1], ++ size - bottom_left_size); ++ } + -+.macro m_exit_qn -+ m_exit_drain -+ nop ; nop ; thrend -+ nop -+ nop -+# >>> thrend <<< -+.endm ++ if (s->ps.pps->constrained_intra_pred_flag == 1) { ++ if (cand_bottom_left || cand_left || cand_up_left || cand_up || cand_up_right) { ++ int size_max_x = x0 + ((2 * size) << hshift) < s->ps.sps->width ? ++ 2 * size : (s->ps.sps->width - x0) >> hshift; ++ int size_max_y = y0 + ((2 * size) << vshift) < s->ps.sps->height ? ++ 2 * size : (s->ps.sps->height - y0) >> vshift; ++ int j = size + (cand_bottom_left? 
bottom_left_size: 0) -1; ++ if (!cand_up_right) { ++ size_max_x = x0 + ((size) << hshift) < s->ps.sps->width ? ++ size : (s->ps.sps->width - x0) >> hshift; ++ } ++ if (!cand_bottom_left) { ++ size_max_y = y0 + (( size) << vshift) < s->ps.sps->height ? ++ size : (s->ps.sps->height - y0) >> vshift; ++ } ++ if (cand_bottom_left || cand_left || cand_up_left) { ++ while (j > -1 && !IS_INTRA(-1, j)) ++ j--; ++ if (!IS_INTRA(-1, j)) { ++ j = 0; ++ while (j < size_max_x && !IS_INTRA(j, -1)) ++ j++; ++ EXTEND_LEFT_CIP(top, j, j + 1); ++ left[-1] = top[-1]; ++ } ++ } else { ++ j = 0; ++ while (j < size_max_x && !IS_INTRA(j, -1)) ++ j++; ++ if (j > 0) ++ if (x0 > 0) { ++ EXTEND_LEFT_CIP(top, j, j + 1); ++ } else { ++ EXTEND_LEFT_CIP(top, j, j); ++ top[-1] = top[0]; ++ } ++ left[-1] = top[-1]; ++ } ++ left[-1] = top[-1]; ++ if (cand_bottom_left || cand_left) { ++ a = PIXEL_SPLAT_X4(left[-1]); ++ EXTEND_DOWN_CIP(left, 0, size_max_y); ++ } ++ if (!cand_left) ++ EXTEND(left, left[-1], size); ++ if (!cand_bottom_left) ++ EXTEND(left + size, left[size - 1], size); ++ if (x0 != 0 && y0 != 0) { ++ a = PIXEL_SPLAT_X4(left[size_max_y - 1]); ++ EXTEND_UP_CIP(left, size_max_y - 1, size_max_y); ++ if (!IS_INTRA(-1, - 1)) ++ left[-1] = left[0]; ++ } else if (x0 == 0) { ++ EXTEND(left, 0, size_max_y); ++ } else { ++ a = PIXEL_SPLAT_X4(left[size_max_y - 1]); ++ EXTEND_UP_CIP(left, size_max_y - 1, size_max_y); ++ } ++ top[-1] = left[-1]; ++ if (y0 != 0) { ++ a = PIXEL_SPLAT_X4(left[-1]); ++ EXTEND_RIGHT_CIP(top, 0, size_max_x); ++ } ++ } ++ } ++ // Infer the unavailable samples ++ if (!cand_bottom_left) { ++ if (cand_left) { ++ EXTEND(left + size, left[size - 1], size); ++ } else if (cand_up_left) { ++ EXTEND(left, left[-1], 2 * size); ++ cand_left = 1; ++ } else if (cand_up) { ++ left[-1] = top[0]; ++ EXTEND(left, left[-1], 2 * size); ++ cand_up_left = 1; ++ cand_left = 1; ++ } else if (cand_up_right) { ++ EXTEND(top, top[size], size); ++ left[-1] = top[size]; ++ EXTEND(left, left[-1], 2 * size); ++ cand_up = 1; ++ cand_up_left = 1; ++ cand_left = 1; ++ } else { // No samples available ++#if PRED_C ++ left[-1] = (1 << (BIT_DEPTH - 1)) | (1 << (BIT_DEPTH - 1 + PW * 8)); ++#else ++ left[-1] = (1 << (BIT_DEPTH - 1)); ++#endif ++ EXTEND(top, left[-1], 2 * size); ++ EXTEND(left, left[-1], 2 * size); ++ } ++ } + -+::mc_exit_c_qn -+::mc_exit_y_qn -+ m_exit_qn ++ if (!cand_left) ++ EXTEND(left, left[size], size); ++ if (!cand_up_left) { ++ left[-1] = left[0]; ++ } ++ if (!cand_up) ++ EXTEND(top, left[-1], size); ++ if (!cand_up_right) ++ EXTEND(top + size, top[size - 1], size); + ++ top[-1] = left[-1]; + ++ // Filtering process ++ // Sand can only apply to chroma_format_idc == 1 so we don't need to ++ // worry about chroma smoothing for that case ++#if !PRED_C ++ if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0 || ctx_cfmt(s) == 3)) { ++ if (mode != INTRA_DC && size != 4){ ++ int intra_hor_ver_dist_thresh[] = { 7, 1, 0 }; ++ int min_dist_vert_hor = FFMIN(FFABS((int)(mode - 26U)), ++ FFABS((int)(mode - 10U))); ++ if (min_dist_vert_hor > intra_hor_ver_dist_thresh[log2_size - 3]) { ++ int threshold = 1 << (BIT_DEPTH - 5); ++ if (s->ps.sps->sps_strong_intra_smoothing_enable_flag && c_idx == 0 && ++ log2_size == 5 && ++ FFABS(top[-1] + top[63] - 2 * top[31]) < threshold && ++ FFABS(left[-1] + left[63] - 2 * left[31]) < threshold) { ++ // We can't just overwrite values in top because it could be ++ // a pointer into src ++ filtered_top[-1] = top[-1]; ++ filtered_top[63] = top[63]; ++ for (i = 0; i < 63; i++) ++ 
filtered_top[i] = ((64 - (i + 1)) * top[-1] + ++ (i + 1) * top[63] + 32) >> 6; ++ for (i = 0; i < 63; i++) ++ left[i] = ((64 - (i + 1)) * left[-1] + ++ (i + 1) * left[63] + 32) >> 6; ++ top = filtered_top; ++ } else { ++ filtered_left[2 * size - 1] = left[2 * size - 1]; ++ filtered_top[2 * size - 1] = top[2 * size - 1]; ++ for (i = 2 * size - 2; i >= 0; i--) ++ filtered_left[i] = (left[i + 1] + 2 * left[i] + ++ left[i - 1] + 2) >> 2; ++ filtered_top[-1] = ++ filtered_left[-1] = (left[0] + 2 * left[-1] + top[0] + 2) >> 2; ++ for (i = 2 * size - 2; i >= 0; i--) ++ filtered_top[i] = (top[i + 1] + 2 * top[i] + ++ top[i - 1] + 2) >> 2; ++ left = filtered_left; ++ top = filtered_top; ++ } ++ } ++ } ++ } + -+# mc_interrupt_exit12() ++ switch (mode) { ++ case INTRA_PLANAR: ++ s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride); ++ break; ++ case INTRA_DC: ++ s->hpc.pred_dc((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride, log2_size, c_idx); ++ break; ++ default: ++ s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride, c_idx, ++ mode); ++ break; ++ } ++#else ++ switch (mode) { ++ case INTRA_PLANAR: ++ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride); ++ break; ++ case INTRA_DC: ++ s->hpc.pred_dc_c((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride, log2_size, c_idx); ++ break; ++ default: ++ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride, c_idx, ++ mode); ++ break; ++ } + -+.macro m_exit_q0 -+ m_exit_drain -+ sacq -, 12 -+ nop ; nop ; thrend -+ mov interrupt, 1 -+ nop -+# >>> thrend <<< -+.endm ++#if DUMP_PRED ++ printf("U pred @ %d, %d: mode=%d\n", x, y, mode); ++ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size); ++ printf("V pred @ %d, %d: mode=%d\n", x, y, mode); ++ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size); ++#endif ++#endif ++} + -+::mc_exit_c_q0 -+::mc_exit_y_q0 -+ m_exit_q0 ++#define INTRA_PRED(size) \ ++static void FUNC(intra_pred_ ## size)(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int c_idx) \ ++{ \ ++ FUNC(intra_pred)(s, lc, x0, y0, size, c_idx); \ ++} + -+# LUMA CODE ++INTRA_PRED(2) ++INTRA_PRED(3) ++INTRA_PRED(4) ++INTRA_PRED(5) + -+# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1. 
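-+# (Per output pixel this is the same weighting as the put_hevc_epel_bi_w_*
-+# C templates earlier in this patch - roughly:
-+#   dst = clip((p0 * wx0 + p1 * wx1 + ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1))
-+# with half the lanes working on ref0 and the other half on ref1.)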
-+# For P frames we make the second x,y coordinates offset by +8 ++#undef INTRA_PRED + ++#if !PRED_C ++static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top, ++ const uint8_t *_left, ptrdiff_t stride, ++ int trafo_size) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ const pixel *top = (const pixel *)_top; ++ const pixel *left = (const pixel *)_left; ++ int size = 1 << trafo_size; ++ for (y = 0; y < size; y++) ++ for (x = 0; x < size; x++) ++ POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + ++ (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); ++} ++#else ++static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top, ++ const uint8_t * _left, ptrdiff_t stride, ++ int trafo_size) ++{ ++ int x, y; ++ int size = 1 << trafo_size; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const c_src_ptr_t top = (c_src_ptr_t)_top; ++ const c_src_ptr_t left = (c_src_ptr_t)_left; + -+################################################################################ -+# mc_setup -+# -+# typedef struct qpu_mc_pred_y_s_s { -+# qpu_mc_src_t next_src1; -+# qpu_mc_src_t next_src2; -+# uint16_t pic_h; -+# uint16_t pic_w; -+# uint32_t stride2; -+# uint32_t stride1; -+# uint32_t wdenom; -+# uint32_t next_fn; -+# } qpu_mc_pred_y_s_t; ++ for (y = 0; y < size; y++, src += stride) ++ { ++ for (x = 0; x < size; x++) ++ { ++ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] + ++ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1); ++ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] + ++ (size - 1 - y) * top[x][1] + (y + 1) * left[size][1] + size) >> (trafo_size + 1); ++ } ++ } ++} ++#endif + -+.macro m_setup_y, v_bit_depth ++#define PRED_PLANAR(size)\ ++static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ ++ const uint8_t *left, ptrdiff_t stride) \ ++{ \ ++ FUNC(pred_planar)(src, top, left, stride, size + 2); \ ++} + -+# Cannot use mul24 on x as x might be -ve, so must use shift -+.if v_bit_depth <= 8 -+.set v_x_shift, 0 -+.set v_pmask, 0xff -+.set v_blk_height, Y_BLK_HEIGHT_8 -+.else -+.set v_x_shift, 1 -+.set v_pmask, 0xffff -+.set v_blk_height, Y_BLK_HEIGHT_16 -+.endif ++PRED_PLANAR(0) ++PRED_PLANAR(1) ++PRED_PLANAR(2) ++PRED_PLANAR(3) + ++#undef PRED_PLANAR + -+ # Need to save these because we need to know the frame dimensions before computing texture coordinates -+ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y -+ mov ra9, unif # ref_y_base -+ mov ra1, unif # x2_y2 -+ mov ra11, unif # ref_y2_base ++#if !PRED_C ++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, int log2_size, int c_idx) ++{ ++ int i, j, x, y; ++ int size = (1 << log2_size); ++ pixel *src = (pixel *)_src; ++ const pixel *top = (const pixel *)_top; ++ const pixel *left = (const pixel *)_left; ++ int dc = size; ++ pixel4 a; ++ for (i = 0; i < size; i++) ++ dc += left[i] + top[i]; + -+# load constants -+ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] -+ shl rb_ef, r0, i_shift30 ++ dc >>= log2_size + 1; + ++ a = PIXEL_SPLAT_X4(dc); + -+ mov ra_kff100100, 0xff100100 -+ mov rb_pmask, v_pmask -+ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++ for (i = 0; i < size; i++) ++ for (j = 0; j < size; j+=4) ++ AV_WN4P(&POS(j, i), a); ++ ++ if (c_idx == 0 && size < 32) { ++ POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2; ++ for (x = 1; x < size; x++) ++ POS(x, 0) = (top[x] + 3 * dc + 2) >> 2; ++ for (y = 1; y < size; 
y++) ++ POS(0, y) = (left[y] + 3 * dc + 2) >> 2; ++ } ++} ++#else ++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, int log2_size, int c_idx) ++{ ++ unsigned int i, j; ++ const unsigned int size = (1 << log2_size); ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const c_src_ptr_t top = (c_src_ptr_t)_top; ++ const c_src_ptr_t left = (c_src_ptr_t)_left; ++ unsigned int dc0 = size; ++ unsigned int dc1 = size; + -+# Compute part of VPM to use ++ for (i = 0; i < size; i++) ++ { ++ dc0 += left[i][0] + top[i][0]; ++ dc1 += left[i][1] + top[i][1]; ++ } + -+# Read image dimensions -+ mov ra3, unif # width_height -+ mov rb_xpitch, unif # stride2 -+.if v_x_shift == 0 -+ sub rb_max_x, ra3.16b, 1 -+.else -+ sub r0, ra3.16b, 1 -+ shl rb_max_x, r0, v_x_shift -+.endif -+ sub rb_max_y, ra3.16a, 1 -+ mov rb_pitch, unif # stride1 ++ dc0 >>= log2_size + 1; ++ dc1 >>= log2_size + 1; + -+# get destination pitch -+ mov r1, vdw_setup_1(0) -+ or rb_dma1_base, r1, rb_pitch ++ for (i = 0; i < size; i++, src += stride) ++ { ++ for (j = 0; j < size; ++j) ++ { ++ src[j][0] = dc0; ++ src[j][1] = dc1; + -+# Compute base address for first and second access -+ mov r3, elem_num -+ add r0, ra0.16b, r3 # Load x + elem_num -+.if v_x_shift != 0 -+ shl r0, r0, v_x_shift -+.endif -+ max r0, r0, 0 -+ min r0, r0, rb_max_x -+ shl ra_xshift_next, r0, 3 # Compute shifts ++ } ++ } ++} ++#endif + -+# X is byte offset - we can only load words - mask ++#ifndef ANGLE_CONSTS ++#define ANGLE_CONSTS ++static const int intra_pred_angle[] = { ++ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, ++ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 ++}; ++static const int inv_angle[] = { ++ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, ++ -630, -910, -1638, -4096 ++}; ++#endif + -+ and r0, r0, -4 ; v8subs r2, r2, r2 -+ sub r2, r2, rb_pitch -+ and r1, r0, r2 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets -+ add ra_base, ra9, r0 ++#if !PRED_C ++static av_always_inline void FUNC(pred_angular)(uint8_t *_src, ++ const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, int c_idx, ++ int mode, int size) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ const pixel *top = (const pixel *)_top; ++ const pixel *left = (const pixel *)_left; + -+ # r3 still contains elem_num -+ add r0, ra1.16b, r3 # Load x -+.if v_x_shift != 0 -+ shl r0, r0, v_x_shift -+.endif -+ max r0, r0, 0 -+ min r0, r0, rb_max_x -+ shl rb_xshift2_next, r0, 3 # Compute shifts ++ int angle = intra_pred_angle[mode - 2]; ++ pixel ref_array[3 * MAX_TB_SIZE + 4]; ++ pixel *ref_tmp = ref_array + size; ++ const pixel *ref; ++ int last = (size * angle) >> 5; + -+ # r2 still contains mask -+ and r0, r0, -4 -+ and r1, r0, r2 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets -+ add ra_base2, ra11, r0 ++ if (mode >= 18) { ++ ref = top - 1; ++ if (angle < 0 && last < -1) { ++ for (x = 0; x <= size; x += 4) ++ AV_WN4P(&ref_tmp[x], AV_RN4P(&top[x - 1])); ++ for (x = last; x <= -1; x++) ++ ref_tmp[x] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)]; ++ ref = ref_tmp; ++ } + -+# Do preloads -+ nop ; mov r0, ra0.16a # ; r0 = y -+ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2 ++ for (y = 0; y < size; y++) { ++ int idx = ((y + 1) * angle) >> 5; ++ int fact = ((y + 1) * angle) & 31; ++ if (fact) { ++ for (x = 0; x < size; x += 4) { ++ POS(x , y) = ((32 - fact) * ref[x + idx + 1] + ++ fact * ref[x + idx + 2] + 16) >> 5; 
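++ // Blend of the two nearest reference samples in 1/32 units:
++ // (32 - fact) parts of ref[x + idx + 1], fact parts of
++ // ref[x + idx + 2], +16 to round; the x loop is unrolled by 4.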
++ POS(x + 1, y) = ((32 - fact) * ref[x + 1 + idx + 1] + ++ fact * ref[x + 1 + idx + 2] + 16) >> 5; ++ POS(x + 2, y) = ((32 - fact) * ref[x + 2 + idx + 1] + ++ fact * ref[x + 2 + idx + 2] + 16) >> 5; ++ POS(x + 3, y) = ((32 - fact) * ref[x + 3 + idx + 1] + ++ fact * ref[x + 3 + idx + 2] + 16) >> 5; ++ } ++ } else { ++ for (x = 0; x < size; x += 4) ++ AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1])); ++ } ++ } ++ if (mode == 26 && c_idx == 0 && size < 32) { ++ for (y = 0; y < size; y++) ++ POS(0, y) = av_clip_pixel(top[0] + ((left[y] - left[-1]) >> 1)); ++ } ++ } else { ++ ref = left - 1; ++ if (angle < 0 && last < -1) { ++ for (x = 0; x <= size; x += 4) ++ AV_WN4P(&ref_tmp[x], AV_RN4P(&left[x - 1])); ++ for (x = last; x <= -1; x++) ++ ref_tmp[x] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)]; ++ ref = ref_tmp; ++ } + -+:1 -+ sub.setf r3, r3, 1 -+ max r1, r0, 0 -+ min r1, r1, rb_max_y -+ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch -+ add t0s, ra_base, r1 ; mov ra_y, r0 ++ for (x = 0; x < size; x++) { ++ int idx = ((x + 1) * angle) >> 5; ++ int fact = ((x + 1) * angle) & 31; ++ if (fact) { ++ for (y = 0; y < size; y++) { ++ POS(x, y) = ((32 - fact) * ref[y + idx + 1] + ++ fact * ref[y + idx + 2] + 16) >> 5; ++ } ++ } else { ++ for (y = 0; y < size; y++) ++ POS(x, y) = ref[y + idx + 1]; ++ } ++ } ++ if (mode == 10 && c_idx == 0 && size < 32) { ++ for (x = 0; x < size; x += 4) { ++ POS(x, 0) = av_clip_pixel(left[0] + ((top[x ] - top[-1]) >> 1)); ++ POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - top[-1]) >> 1)); ++ POS(x + 2, 0) = av_clip_pixel(left[0] + ((top[x + 2] - top[-1]) >> 1)); ++ POS(x + 3, 0) = av_clip_pixel(left[0] + ((top[x + 3] - top[-1]) >> 1)); ++ } ++ } ++ } ++} ++#else ++static av_always_inline void FUNC(pred_angular)(uint8_t *_src, ++ const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, int c_idx, ++ int mode, int size) ++{ ++ int x, y; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ c_src_ptr_t top = (c_src_ptr_t)_top; ++ c_src_ptr_t left = (c_src_ptr_t)_left; + -+ max r1, r2, 0 -+ brr.anynz -, r:1b -+ min r1, r1, rb_max_y -+ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch -+ add t1s, ra_base2, r1 ; mov ra_y2, r2 -+# >>> .anynz 1b ++ const int angle = intra_pred_angle[mode - 2]; ++ cpel ref_array[3 * MAX_TB_SIZE + 4][2]; ++ c_dst_ptr_t ref_tmp = ref_array + size; ++ c_src_ptr_t ref; ++ const int last = (size * angle) >> 5; + -+ add rb_wt_den_p15, unif, 23 - v_bit_depth # weight denom ++ if (mode >= 18) { ++ ref = top - 1; ++ if (angle < 0 && last < -1) { ++ memcpy(ref_tmp, top - 1, (size + 1) * 2 * PW); ++ for (x = last; x <= -1; x++) ++ { ++ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; ++ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; ++ } ++ ref = (c_src_ptr_t)ref_tmp; ++ } + -+ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base ++ for (y = 0; y < size; y++, src += stride) { ++ const int idx = ((y + 1) * angle) >> 5; ++ const int fact = ((y + 1) * angle) & 31; ++ if (fact) { ++ for (x = 0; x < size; ++x) { ++ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] + ++ fact * ref[x + idx + 2][0] + 16) >> 5; ++ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] + ++ fact * ref[x + idx + 2][1] + 16) >> 5; ++ } ++ } else { ++ memcpy(src, ref + idx + 1, size * 2 * PW); ++ } ++ } ++ } else { ++ ref = left - 1; ++ if (angle < 0 && last < -1) { ++ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW); ++ for (x = last; x <= -1; x++) ++ { ++ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; ++ 
ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; ++ } ++ ref = (c_src_ptr_t)ref_tmp; ++ } + -+ mov ra_link, unif # Next fn ++ for (x = 0; x < size; x++, src++) { ++ const int idx = ((x + 1) * angle) >> 5; ++ const int fact = ((x + 1) * angle) & 31; ++ if (fact) { ++ for (y = 0; y < size; y++) { ++ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] + ++ fact * ref[y + idx + 2][0] + 16) >> 5; ++ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] + ++ fact * ref[y + idx + 2][1] + 16) >> 5; ++ } ++ } else { ++ for (y = 0; y < size; y++) ++ { ++ src[y * stride][0] = ref[y + idx + 1][0]; ++ src[y * stride][1] = ref[y + idx + 1][1]; ++ } ++ } ++ } ++ } ++} ++#endif ++ ++static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ++ ptrdiff_t stride, int c_idx, int mode) ++{ ++ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 2); ++} ++ ++static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ++ ptrdiff_t stride, int c_idx, int mode) ++{ ++ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 3); ++} ++ ++static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ++ ptrdiff_t stride, int c_idx, int mode) ++{ ++ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 4); ++} + -+# touch vertical context to keep simulator happy -+ mov ra8, 0 ; mov rb8, 0 -+ bra -, ra_link -+ mov ra9, 0 ; mov rb9, 0 -+ mov ra10, 0 ; mov rb10, 0 -+ mov ra11, 0 ; mov rb11, 0 -+# >>> ra_link -+.endm ++static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ++ ptrdiff_t stride, int c_idx, int mode) ++{ ++ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 5); ++} + -+::mc_setup_y_q0 -+ m_setup_q0 -+::mc_setup_y_qn -+ m_setup_y 8 ++#undef cpel ++#undef c_src_ptr_t ++#undef c_dst_ptr_t + -+################################################################################ -+# -+# Start of per-block setup code -+# P and B blocks share the same setup code to save on Icache space ++#undef EXTEND_LEFT_CIP ++#undef EXTEND_RIGHT_CIP ++#undef EXTEND_UP_CIP ++#undef EXTEND_DOWN_CIP ++#undef IS_INTRA ++#undef MVF_PU ++#undef MVF ++#undef PU ++#undef EXTEND ++#undef MIN_TB_ADDR_ZS ++#undef POS ++#undef PW + -+# luma_setup_delay3 done in delay slots of branch that got us here ++#ifndef INCLUDED_ONCE ++#define INCLUDED_ONCE ++#endif + -+# get base addresses and per-channel shifts for *next* invocation -+# per-channel shifts were calculated on the *previous* invocation +diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c +new file mode 100644 +index 0000000000..c16d9931bd +--- /dev/null ++++ b/libavcodec/rpi_mailbox.c +@@ -0,0 +1,145 @@ ++/* ++Copyright (c) 2012, Broadcom Europe Ltd. ++All rights reserved. + -+# 1st 3 instructions of per_block-setup in branch delay -+# -+# typedef struct qpu_mc_pred_y_p_s { -+# qpu_mc_src_t next_src1; -+# qpu_mc_src_t next_src2; -+# uint16_t h; -+# uint16_t w; -+# uint32_t mymx21; -+# uint32_t wo1; -+# uint32_t wo2; -+# uint32_t dst_addr; -+# uint32_t next_fn; -+# } qpu_mc_pred_y_p_t; -+# ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. 
++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. + -+.macro m_luma_setup, v_bit_depth -+# Hack - QASM may well have have label pasting but I have no idea how... -+.if v_bit_depth == 8 -+ brr ra_link, r:per_block_setup_8 -+.elif v_bit_depth == 10 -+ brr ra_link, r:per_block_setup_10 -+.endif -+ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? -+ add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0 -+ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next -+.endm ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*/ + -+.macro m_per_block_setup, v_bit_depth ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include + -+.if v_bit_depth <= 8 -+.set v_x_shift, 0 -+.set v_x_mul, 1 -+# Shifts to get width & height in the right place in rb_dma0 -+.set v_dma_h_shift, 7 -+.set v_dma_wh_shift, i_shift16 -+.else -+.set v_x_shift, 1 -+.set v_x_mul, 2 -+# Shifts to get width & height in the right place in rb_dma0 -+.set v_dma_h_shift, 8 -+.set v_dma_wh_shift, 15 -+.endif ++#include + -+.if v_x_shift != 0 -+ shl r0, r0, v_x_shift -+.endif -+ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next -+ min r0, r0, rb_max_x ++#define MAJOR_NUM 100 ++#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *) ++#define DEVICE_FILE_NAME "/dev/vcio" + -+ shl ra_xshift_next, r0, 3 # Compute shifts -+ and r0, r0, -4 -+ sub r2, r5, rb_pitch ; mov ra_base_next, unif # src1.base -+ and r1, r0, r2 ; mov ra_y_next, ra0.16a -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y -+ add ra_base_next, ra_base_next, r0 # [ra1 delay] ++#include "rpi_mailbox.h" ++//#include + -+ add r0, ra1.16b, r3 # Load x2 -+.if v_x_shift != 0 -+ shl r0, r0, v_x_shift -+.endif -+ max r0, r0, r5 ; mov ra_y2_next, ra1.16a -+ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base -+ shl rb_xshift2_next, r0, 3 # Compute shifts -+ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height -+ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes -+ add rb_base2_next, rb_base2_next, r0 ++/* ++ * use ioctl to send mbox property message ++ */ + -+# get width,height of block (unif load above), r1 = width * pel_size -+ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width) -+ add rb_i_tmu, r0, 7 
- PREREAD ; v8min r0, r0, ra_blk_height -+ add rb_lcount, r0, 7 -+ shl r0, r0, v_dma_h_shift -+ add r0, r0, r1 # Combine width and height of destination area -+ shl r0, r0, v_dma_wh_shift # Shift into bits 16 upwards of the vdw_setup0 register -+ add rb_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets ++static int mbox_property(int file_desc, void *buf) ++{ ++ int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf); + -+# get filter coefficients and discard unused B frame values -+ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight -+ shl ra8, r0, 3 ; mov r3, ra_k255 ++ if (ret_val < 0) { ++ printf("ioctl_set_msg failed:%d\n", ret_val); ++ } + -+# Pack the 1st 4 filter coefs for H & V tightly -+# Coeffs are all abs values here as that means mul24 works (no sign extend from .8) ++#ifdef DEBUG ++ unsigned *p = buf; int i; unsigned size = *(unsigned *)buf; ++ for (i=0; i>> branch ra_link ++ rv = mbox_property(fd, buf); ++ memcpy(img, rimg, sizeof(*img)); + -+# r5 = 0 -+# ra_wt_mul_l1 = weight L1 -+# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred) -+# rb_wt_off = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb_wt_den_p15 - 1) -+# rb_wt_den_p15 = weight denom + 6 + 9 -+# rb_wt_mul_l0 = weight L0 -+.endm ++ return rv; ++} + -+:per_block_setup_8 -+ m_per_block_setup 8 ++int mbox_open() { ++ int file_desc; + ++ // open a char device file used for communicating with kernel mbox driver ++ file_desc = open(DEVICE_FILE_NAME, 0); ++ if (file_desc < 0) { ++ printf("Can't open device file: %s\n", DEVICE_FILE_NAME); ++ printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM); ++ } ++ return file_desc; ++} + ++void mbox_close(int file_desc) { ++ close(file_desc); ++} + -+################################################################################ -+# mc_filter(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) -+# In a P block, y2_x2 should be y_x+8 -+# At this point we have already issued two pairs of texture requests for the current block +diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h +new file mode 100644 +index 0000000000..b3168788d2 +--- /dev/null ++++ b/libavcodec/rpi_mailbox.h +@@ -0,0 +1,58 @@ ++#ifndef RPI_MAILBOX_H ++#define RPI_MAILBOX_H + -+.macro m_filter_y_pxx, v_bit_depth -+ m_luma_setup v_bit_depth ++/* The image structure. 
*/ ++typedef struct vc_image_extra_uv_s { ++ void *u, *v; ++ int vpitch; ++} VC_IMAGE_EXTRA_UV_T; + -+ shl ra_wt_mul_l0, ra_wt_mul_l0, 1 ++typedef union { ++ VC_IMAGE_EXTRA_UV_T uv; ++// VC_IMAGE_EXTRA_RGBA_T rgba; ++// VC_IMAGE_EXTRA_PAL_T pal; ++// VC_IMAGE_EXTRA_TF_T tf; ++// VC_IMAGE_EXTRA_BAYER_T bayer; ++// VC_IMAGE_EXTRA_MSBAYER_T msbayer; ++// VC_IMAGE_EXTRA_CODEC_T codec; ++// VC_IMAGE_EXTRA_OPENGL_T opengl; ++} VC_IMAGE_EXTRA_T; + -+# r5 = 0 (loop count) + -+:1 -+# retrieve texture results and pick out bytes -+# then submit two more texture requests ++typedef struct VC_IMAGE_T { ++ unsigned short type; /* should restrict to 16 bits */ ++ unsigned short info; /* format-specific info; zero for VC02 behaviour */ ++ unsigned short width; /* width in pixels */ ++ unsigned short height; /* height in pixels */ ++ int pitch; /* pitch of image_data array in bytes */ ++ int size; /* number of bytes available in image_data array */ ++ void *image_data; /* pixel data */ ++ VC_IMAGE_EXTRA_T extra; /* extra data like palette pointer */ ++ void *metadata; /* metadata header for the image */ ++ void *pool_object; /* nonNULL if image was allocated from a vc_pool */ ++ int mem_handle; /* the mem handle for relocatable memory storage */ ++ int metadata_size; /* size of metadata of each channel in bytes */ ++ int channel_offset; /* offset of consecutive channels in bytes */ ++ uint32_t video_timestamp;/* 90000 Hz RTP times domain - derived from audio timestamp */ ++ uint8_t num_channels; /* number of channels (2 for stereo) */ ++ uint8_t current_channel;/* the channel this header is currently pointing to */ ++ uint8_t linked_multichann_flag;/* Indicate the header has the linked-multichannel structure*/ ++ uint8_t is_channel_linked; /* Track if the above structure is been used to link the header ++ into a linked-mulitchannel image */ ++ uint8_t channel_index; /* index of the channel this header represents while ++ it is being linked. */ ++ uint8_t _dummy[3]; /* pad struct to 64 bytes */ ++} VC_IMAGE_T; + -+# N.B. 
Whilst y == y2 as far as this loop is concerned we will start -+# the grab for the next block before we finish with this block and that -+# might be B where y != y2 so we must do full processing on both y and y2 ++typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1]; + -+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 -+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov r3, rb_pitch + -+ max r2, ra_y, 0 # y -+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++extern int mbox_open(void); ++extern void mbox_close(int file_desc); + -+ max r2, ra_y2, 0 -+ min r2, r2, rb_max_y ; mov ra7, ra8 -+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte ++extern unsigned mbox_mem_lock(int file_desc, unsigned handle); ++extern unsigned mbox_mem_unlock(int file_desc, unsigned handle); + -+ add.setf -, rb_ef, rb_ef ; mov ra8, ra9 ++int mbox_get_image_params(int fd, VC_IMAGE_T * img); + -+# apply horizontal filter -+ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++#endif +diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c +new file mode 100644 +index 0000000000..3dfc35fa5c +--- /dev/null ++++ b/libavcodec/rpi_qpu.c +@@ -0,0 +1,939 @@ ++#include ++#include ++#include ++#include ++#include ++#include "libavutil/avassert.h" + -+ sub.setf -, r5, 8 ; mov ra9, ra10 -+ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a -+ brr.anyn -, r:1b -+ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b -+ mov ra10, ra11 ; mov rb10, rb11 -+ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 -+ # >>> .anyn 1b ++#include "config.h" + -+ # apply vertical filter and write to VPM -+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+ add r1, r1, r0 ; mul24 r0, ra8, rb4 -+ add r1, r1, r0 ; mul24 r0, ra9, rb5 -+ sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+ add r1, r1, r0 ; mul24 r0, ra11, rb7 -+ sub r1, r1, r0 -+# At this point r1 is a 22-bit signed quantity: 8 (original sample), -+# +6, +6 (each pass), +1 (the passes can overflow slightly), +1 (sign) -+# The top 8 bits have rubbish in them as mul24 is unsigned -+# The low 6 bits need discard before weighting -+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish -+ asr r1, r1, 14 -+ nop ; mul24 r1, r1, ra_wt_mul_l0 -+ add r1, r1, rb_wt_off ; mov r3, ra_blk_height # ; r3 = block height for outside loop ++#include ++#include + -+ shl r1, r1, 8 ; v8subs r0, 
ra_height, r3 -+ brr.anyn -, r:1b -+ asr r1, r1, rb_wt_den_p15 -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++#include + -+# >>> branch.anyn yloop ++#include "rpi_mailbox.h" ++#include "rpi_qpu.h" ++#include "rpi_hevc_shader.h" ++#include "rpi_hevc_transform8.h" ++#include "rpi_hevc_transform10.h" ++#include "libavutil/rpi_sand_fns.h" + -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height (currently always 16) ++#pragma GCC diagnostic push ++// Many many redundant decls in the header files ++#pragma GCC diagnostic ignored "-Wredundant-decls" ++#include "interface/vmcs_host/vc_vchi_gpuserv.h" ++#pragma GCC diagnostic pop + -+# If looping again then we consumed 16 height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc rb_dma0, rb_lcount based on new segment height ++// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No) ++#define RPI_TRACE_TIME_VPU_QPU_WAIT 0 + -+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++// Add profile flags to all QPU requests - generates output in "vcdbg log msg" ++// Beware this is expensive and will probably throw off all other timing by >10% ++#define RPI_TRACE_QPU_PROFILE_ALL 0 + -+# DMA out -+ bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride -+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW -+ shl r1, r1, i_shift23 -+# >>> .anyz ra_link ++// QPU "noflush" flags ++// a mixture of flushing & profiling + -+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve -+# We add to dma0 to reduce the number of output lines in the final block -+ add rb_lcount, rb_lcount, r0 -+ brr -, r:1b -+ add rb_dma0, rb_dma0, r1 -+ add rb_dest, rb_dest, r2 -+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer -+# >>> 1b -+.endm ++#define QPU_FLAGS_NO_FLUSH_VPU 1 // If unset VPU cache will be flushed ++#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2 // Clear & Enable detailed QPU profiling registers ++#define QPU_FLAGS_PROF_OUTPUT_COUNTS 4 // Print the results ++#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independant of the profiling ++#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed) + -+::mc_filter_y_pxx -+ m_filter_y_pxx 8 ++#define vcos_verify_ge0(x) ((x)>=0) + ++// Size in 32bit words ++#define QPU_CODE_SIZE 4098 ++#define VPU_CODE_SIZE 2048 + -+################################################################################ ++static const short rpi_transMatrix2even[32][16] = { // Even rows first ++{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}, ++{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90}, ++{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89}, ++{87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87}, ++{83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83}, ++{80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80}, ++{75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75}, ++{70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70}, ++{64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64}, ++{57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57}, ++{50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50}, ++{43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43}, ++{36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36}, 
++{25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25}, ++{18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18}, ++{ 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9}, ++// Odd rows ++{90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4}, ++{90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13}, ++{88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22}, ++{85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31}, ++{82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38}, ++{78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46}, ++{73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54}, ++{67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61}, ++{61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67}, ++{54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73}, ++{46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78}, ++{38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82}, ++{31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85}, ++{22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88}, ++{13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90}, ++{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90} ++}; + -+# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) -+# In a P block, only the first half of coefficients contain used information. -+# At this point we have already issued two pairs of texture requests for the current block -+# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time? -+# Or possibly by taking advantage of symmetry? 
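++// Illustration (not from the original patch): rpi_transMatrix2even above
++// holds the 32-point HEVC transform rows split into 16 even rows (which
++// form the 16-point matrix) followed by 16 odd rows, the layout a
++// partial-butterfly implementation wants. A plain C sketch of the final
++// recombination step, with hypothetical names (even[]/odd[] being the two
++// half-transform results):
++//
++// static void butterfly_recombine(int *dst, const int *even,
++//                                 const int *odd, int n)
++// {
++//     for (int i = 0; i < n / 2; i++) {
++//         dst[i]         = even[i] + odd[i]; // top half: e + o
++//         dst[n - 1 - i] = even[i] - odd[i]; // mirrored half: e - o
++//     }
++// }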
++// Code/constants on GPU ++struct GPU ++{ ++ unsigned int qpu_code[QPU_CODE_SIZE]; ++ unsigned int vpu_code8[VPU_CODE_SIZE]; ++ unsigned int vpu_code10[VPU_CODE_SIZE]; ++ short transMatrix2even[16*16*2]; ++}; ++ ++#define CFE_ENTS_PER_A 8 ++// If we have a sliced frame 2048 wide @ 64 per slice then there are 32 slices ++// in a line & we want to flush luma + chroma + a couple of bits so ents ~ 70 ++// allow 128 ++#define CFE_ENT_COUNT 128 ++#define CFE_A_COUNT (CFE_ENT_COUNT / CFE_ENTS_PER_A) + -+.macro m_filter_y_bxx, v_bit_depth -+ m_luma_setup v_bit_depth ++struct rpi_cache_flush_env_s { ++// unsigned int n; ++// struct vcsm_user_clean_invalid_s a[CFE_A_COUNT]; ++ struct vcsm_user_clean_invalid2_s v; ++}; + -+:1 -+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 -+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov r3, rb_pitch ++#define WAIT_COUNT_MAX 16 + -+ max r2, ra_y, 0 # y -+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++typedef struct trace_time_one_s ++{ ++ int count; ++ int64_t start[WAIT_COUNT_MAX]; ++ int64_t total[WAIT_COUNT_MAX]; ++} trace_time_one_t; + -+ max r2, ra_y2, 0 -+ min r2, r2, rb_max_y ; mov ra7, ra8 -+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte ++typedef struct trace_time_wait_s ++{ ++ unsigned int jcount; ++ int64_t start0; ++ int64_t last_update; ++ trace_time_one_t active; ++ trace_time_one_t wait; ++} trace_time_wait_t; + -+ add.setf -, rb_ef, rb_ef ; mov ra8, ra9 ++typedef struct vq_wait_s ++{ ++ sem_t sem; ++ struct vq_wait_s * next; ++} vq_wait_t; + -+# apply horizontal filter -+ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++#define VQ_WAIT_POOL_SIZE 16 ++typedef struct vq_wait_pool_s ++{ ++ vq_wait_t * head; ++ vq_wait_t pool[VQ_WAIT_POOL_SIZE]; ++} vq_wait_pool_t; + -+ sub.setf -, r5, 8 ; mov ra9, ra10 -+ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a -+ brr.anyn -, r:1b -+ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b -+ mov ra10, ra11 ; mov rb10, rb11 -+ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 -+ # >>> .anyn 1b ++static void vq_wait_pool_init(vq_wait_pool_t * const pool); ++static void vq_wait_pool_deinit(vq_wait_pool_t * const pool); + -+ # apply vertical filter and write to VPM -+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+ add r1, r1, r0 ; mul24 r0, ra8, rb4 -+ add r1, r1, r0 ; mul24 r0, ra9, rb5 -+ sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+ 
add r1, r1, r0 ; mul24 r0, ra11, rb7 -+ sub r1, r1, r0 ; mov r2, rb_wt_off -+# As with P-pred r1 is a 22-bit signed quantity in 32-bits -+# Top 8 bits are bad - low 6 bits should be discarded -+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++typedef struct gpu_env_s ++{ ++ int open_count; ++ int init_count; ++ int mb; ++ int vpu_i_cache_flushed; ++ GPU_MEM_PTR_T code_gm_ptr; ++ vq_wait_pool_t wait_pool; ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ trace_time_wait_t ttw; ++#endif ++} gpu_env_t; + -+ asr r1, r1, 14 -+ nop ; mul24 r0, r1, ra_wt_mul_l0 -+ add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 ++// Stop more than one thread trying to allocate memory or use the processing resources at once ++static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER; ++static gpu_env_t * gpu = NULL; + -+ add r1, r1, r0 ; mov r3, ra_blk_height -+ shl r1, r1, 8 ; v8subs r0, ra_height, r3 -+ brr.anyn -, r:1b -+ asr r1, r1, rb_wt_den_p15 -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+# >>> branch.anyn 1b ++#if RPI_TRACE_TIME_VPU_QPU_WAIT + -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height (currently always 16) ++static int64_t ns_time(void) ++{ ++ struct timespec ts; ++ clock_gettime(CLOCK_MONOTONIC, &ts); ++ return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec; ++} + -+# If looping again then we consumed 16 height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc rb_dma0, rb_lcount based on new segment height + -+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000 + -+# DMA out -+ bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride -+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW -+ shl r1, r1, i_shift23 -+# >>> .anyz ra_link ++#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U) ++#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000) ++#define T_ARG(t) T_SEC(t), T_MS(t) ++#define T_FMT "%u.%03u" + -+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve -+# We add to dma0 to reduce the number of output lines in the final block -+ add rb_lcount, rb_lcount, r0 -+ brr -, r:1b -+ add rb_dma0, rb_dma0, r1 -+ add rb_dest, rb_dest, r2 -+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer -+# >>> 1b -+.endm ++static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix) ++{ ++ // Update totals for levels that are still pending ++ for (int i = 0; i < tto->count; ++i) { ++ tto->total[i] += now - tto->start[i]; ++ tto->start[i] = now; ++ } + -+::mc_filter_y_bxx -+ m_filter_y_bxx 8 ++ printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n", ++ prefix, ++ T_ARG(now - start0 - tto->total[0]), ++ T_ARG(tto->total[0]), ++ T_ARG(tto->total[1]), ++ T_ARG(tto->total[2]), ++ T_ARG(tto->total[3])); ++} + -+################################################################################ -+# -+# typedef struct qpu_mc_pred_y_p00_s { -+# qpu_mc_src_t next_src1; -+# uint16_t h; -+# uint16_t w; -+# uint32_t wo1; -+# uint32_t dst_addr; -+# uint32_t next_fn; -+# } qpu_mc_pred_y_p00_t; + -+.macro m_filter_y_p00, v_bit_depth ++static void tto_start(trace_time_one_t * const tto, const int64_t now) ++{ ++ av_assert0(tto->count < WAIT_COUNT_MAX); ++ tto->start[tto->count++] = now; ++} + -+.if v_bit_depth <= 8 -+.set v_x_shift, 0 -+.set v_x_mul, 1 -+# Shifts to get width & height in the right place 
in rb_dma0 -+.set v_dma_h_shift, 7 -+.set v_dma_wh_shift, i_shift16 -+.else -+.set v_x_shift, 1 -+.set v_x_mul, 2 -+# Shifts to get width & height in the right place in rb_dma0 -+.set v_dma_h_shift, 8 -+.set v_dma_wh_shift, 15 -+.endif ++static void tto_end(trace_time_one_t * const tto, const int64_t now) ++{ ++ const int n = --tto->count; ++ av_assert0(n >= 0); ++ tto->total[n] += now - tto->start[n]; ++} + -+ mov ra0, unif ; mov r3, elem_num # y_x -+ mov ra_xshift, ra_xshift_next # [ra0 delay] -+ add r0, ra0.16b, r3 -+.if v_x_shift != 0 -+ shl r0, r0, v_x_shift -+.endif ++static void ttw_print(trace_time_wait_t * const ttw, const int64_t now) ++{ ++ printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0)); ++ tto_print(&ttw->active, now, ttw->start0, "Active"); ++ tto_print(&ttw->wait, now, ttw->start0, " Wait"); ++} + -+ max r0, r0, 0 -+ min r0, r0, rb_max_x ++#endif + -+ shl ra_xshift_next, r0, 3 # Compute shifts -+ and r0, r0, -4 ; v8subs r2, r2, r2 -+ sub r2, r2, rb_pitch ; mov ra_base_next, unif # src1.base -+ and r1, r0, r2 ; mov ra_y_next, ra0.16a -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov ra_width_height, unif # Add stripe offsets ; width_height -+ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write ++// GPU memory alloc fns (internal) + -+# get width,height of block (unif load above) -+# Compute vdw_setup1(dst_pitch-width) -+ shl r1, ra_width, v_x_shift -+ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height -+ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height -+ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 -+ add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif # Combine width and height of destination area ; weight_offset -+ shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif # Shift into bits 16 upwards of the vdw_setup0 register ; dest addr -+ add rb_dma0, r0, rb_dma0_base ++// GPU_MEM_PTR_T alloc fns ++static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) { ++ p->numbytes = (numbytes + 255) & ~255; // Round up ++ p->vcsm_handle = vcsm_malloc_cache(p->numbytes, VCSM_CACHE_TYPE_HOST | 0x80, (char *)"Video Frame" ); ++ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); ++ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); ++ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); ++ av_assert0(p->vcsm_handle); ++ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle); ++ av_assert0(p->vc_handle); ++ p->arm = vcsm_lock(p->vcsm_handle); ++ av_assert0(p->arm); ++ p->vc = mbox_mem_lock(mb, p->vc_handle); ++ av_assert0(p->vc); ++// printf("***** %s, %d\n", __func__, numbytes); + -+ shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 # Offset calc ; r5 = 0 -+ # For B l1 & L0 offsets should be identical so it doesn't matter which we use -+ asr rb_wt_off, r0, 1 ; mov ra_link, unif # ; link ++ return 0; ++} ++ ++static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) { ++ p->numbytes = numbytes; ++ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE | 0x80, (char *)"Video Frame" ); ++ av_assert0(p->vcsm_handle); ++ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle); ++ av_assert0(p->vc_handle); ++ p->arm = vcsm_lock(p->vcsm_handle); ++ av_assert0(p->arm); ++ p->vc = mbox_mem_lock(mb, p->vc_handle); ++ av_assert0(p->vc); ++// printf("***** %s, %d\n", __func__, numbytes); 
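++ // As in the cached variant above: the vcsm allocation is mapped into
++ // the ARM address space (p->arm) and locked to a fixed VideoCore bus
++ // address (p->vc), so the same buffer is usable from both sides.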
++ return 0;
++}
+
++static void gpu_free_internal(const int mb, GPU_MEM_PTR_T * const p) {
++ mbox_mem_unlock(mb, p->vc_handle);
++ vcsm_unlock_ptr(p->arm);
++ vcsm_free(p->vcsm_handle);
++ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try to use this again
++// printf("***** %s\n", __func__);
++}
+
+
++// GPU init, free, lock, unlock
+
++static void gpu_term(void)
++{
++ gpu_env_t * const ge = gpu;
+
++ // We have to hope that everything has terminated...
++ gpu = NULL;
+
++ vc_gpuserv_deinit();
+
++ gpu_free_internal(ge->mb, &ge->code_gm_ptr);
+
++ vcsm_exit();
+
++ mbox_close(ge->mb);
+
++ vq_wait_pool_deinit(&ge->wait_pool);
+
++ free(ge);
++}
+
+
++// Connect to QPU, returns 0 on success.
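++// Typical lifecycle (a sketch based on the functions in this file): the
++// first gpu_lock_ref()/gpu_ref() call runs gpu_init() - open the mailbox,
++// init vcsm, allocate the code area, copy in the QPU/VPU code and the
++// transform matrix - and the last matching gpu_unlock_unref()/gpu_unref()
++// runs gpu_term() to undo it all. For example:
++//
++// gpu_ref();   // e.g. at decoder open: initialises on first use
++// // ... gpu_malloc_cached() / vpu_get_fn() / etc ...
++// gpu_unref(); // e.g. at decoder close: tears down on last use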
++static int gpu_init(gpu_env_t ** const gpu) { ++ volatile struct GPU* ptr; ++ gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t)); ++ *gpu = NULL; + -+:1 -+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 -+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov r3, rb_pitch ++ if (ge == NULL) ++ return -1; + -+ max r2, ra_y, 0 # y -+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++ if ((ge->mb = mbox_open()) < 0) ++ return -1; + -+ max r2, ra_y2, 0 -+ min r2, r2, rb_max_y -+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte -+ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++ vq_wait_pool_init(&ge->wait_pool); + -+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 -+ add r1, r0, r1 -+ shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height -+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ vcsm_init(); + -+ brr.anyn -, r:1b -+ asr r1, r1, rb_wt_den_p15 -+ min r1, r1, ra_pmax ; mov -, vw_wait -+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch -+# >>> branch.anyn 1b ++ gpu_malloc_uncached_internal(ge->mb, sizeof(struct GPU), &ge->code_gm_ptr); ++ ptr = (volatile struct GPU*)ge->code_gm_ptr.arm; + -+# r0 = remaining height (min 0) -+# r2 = r3 * rb_pitch -+# r3 = block_height (currently always 16) ++ // Zero everything so we have zeros between the code bits ++ memset((void *)ptr, 0, sizeof(*ptr)); + -+# If looping again then we consumed 16 height last loop -+# rb_dma1 (stride) remains constant -+# rb_i_tmu remains const (based on total height) -+# recalc rb_dma0, rb_lcount based on new segment height ++ // Now copy over the QPU code into GPU memory ++ { ++ int num_bytes = (char *)mc_end - (char *)ff_hevc_rpi_shader; ++ av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int)); ++ memcpy((void*)ptr->qpu_code, ff_hevc_rpi_shader, num_bytes); ++ } ++ // And the VPU code ++ { ++ int num_bytes = sizeof(rpi_hevc_transform8); ++ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); ++ memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes); ++ } ++ { ++ int num_bytes = sizeof(rpi_hevc_transform10); ++ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); ++ memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes); ++ } ++ // And the transform coefficients ++ memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even)); + -+ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++ *gpu = ge; ++ return 0; ++} + -+# DMA out -+ bra.anyz -, ra_link -+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride -+ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW -+ shl r1, r1, i_shift23 -+# >>> .anyz ra_link + -+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve -+# We add to dma0 to reduce the number of output lines in the final block -+ add rb_lcount, rb_lcount, r0 -+ brr -, r:1b -+ add rb_dma0, rb_dma0, r1 -+ add rb_dest, rb_dest, r2 -+ mov vw_setup, rb_vpm_init # Reset our VDM write pointer -+# >>> 1b -+.endm + -+::mc_filter_y_b00 -+ m_filter_y_b00 8 ++static void gpu_unlock(void) { ++ pthread_mutex_unlock(&gpu_mutex); ++} + -+################################################################################ -+################################################################################ -+# 10 BIT ++// Make sure we have exclusive access to the mailbox, and enable qpu if necessary. 
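++// Note on lock pairing in this file: gpu_lock()/gpu_unlock() give plain
++// exclusive access and assume the env already exists, while
++// gpu_lock_ref()/gpu_unlock_unref() additionally hold a reference so the
++// env is created on first use and torn down on the last release.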
++static gpu_env_t * gpu_lock(void) { ++ pthread_mutex_lock(&gpu_mutex); + -+::mc_setup_c10_q0 -+ m_setup_q0 -+::mc_setup_c10_qn -+ m_setup_c 10 ++ av_assert0(gpu != NULL); ++ return gpu; ++} + -+::mc_filter_c10_p -+ m_filter_c_p 0, 10 ++static gpu_env_t * gpu_lock_ref(void) ++{ ++ pthread_mutex_lock(&gpu_mutex); + -+::mc_filter_c10_p_l1 -+ m_filter_c_p 1, 10 ++ if (gpu == NULL) { ++ int rv = gpu_init(&gpu); ++ if (rv != 0) { ++ gpu_unlock(); ++ return NULL; ++ } ++ } + ++ ++gpu->open_count; ++ return gpu; ++} + -+::mc_filter_c10_b -+ m_filter_c_b 10 ++static void gpu_unlock_unref(gpu_env_t * const ge) ++{ ++ if (--ge->open_count == 0) ++ gpu_term(); + -+# Even if these fns are the same as for other bit depths we want our own copy -+# to keep the code we are using in a single lump to avoid (direct map) cache -+# thrashing -+.set v_quads10, N_QPU_16 / 4 ++ gpu_unlock(); ++} + -+::mc_sync10_q0 -+ m_sync_q 0, v_quads10 -+::mc_sync10_q1 -+ m_sync_q 1, v_quads10 -+::mc_sync10_q2 -+ m_sync_q 2, v_quads10 -+::mc_sync10_q3 -+ m_sync_q 3, v_quads10 -+::mc_sync10_q4 -+ m_sync_q 4, v_quads10 -+::mc_sync10_q5 -+ m_sync_q 5, v_quads10 -+::mc_sync10_q6 -+ m_sync_q 6, v_quads10 -+::mc_sync10_q7 -+ m_sync_q 7, v_quads10 -+::mc_sync10_q8 -+ m_sync_q 8, v_quads10 -+::mc_sync10_q9 -+ m_sync_q 9, v_quads10 -+::mc_sync10_q10 -+ m_sync_q 10, v_quads10 -+::mc_sync10_q11 -+ m_sync_q 11, v_quads10 ++static inline gpu_env_t * gpu_ptr(void) ++{ ++ av_assert0(gpu != NULL); ++ return gpu; ++} + -+::mc_exit_y10_q0 -+::mc_exit_c10_q0 -+ m_exit_q0 ++// Public gpu fns + -+::mc_exit_y10_qn -+::mc_exit_c10_qn -+ m_exit_qn ++// Allocate memory on GPU ++// Fills in structure
containing ARM pointer, videocore handle, videocore memory address, numbytes ++// Returns 0 on success. ++// This allocates memory that will not be cached in ARM's data cache. ++// Therefore safe to use without data cache flushing. ++int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p) ++{ ++ int r; ++ gpu_env_t * const ge = gpu_lock_ref(); ++ if (ge == NULL) ++ return -1; ++ r = gpu_malloc_uncached_internal(ge->mb, numbytes, p); ++ gpu_unlock(); ++ return r; ++} + -+::mc_setup_y10_q0 -+ m_setup_q0 -+::mc_setup_y10_qn -+ m_setup_y 10 ++// This allocates data that will be ++// Cached in ARM L2 ++// Uncached in VPU L2 ++int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) ++{ ++ int r; ++ gpu_env_t * const ge = gpu_lock_ref(); ++ if (ge == NULL) ++ return -1; ++ r = gpu_malloc_cached_internal(ge->mb, numbytes, p); ++ gpu_unlock(); ++ return r; ++} + -+:per_block_setup_10 -+ m_per_block_setup 10 ++void gpu_free(GPU_MEM_PTR_T * const p) { ++ gpu_env_t * const ge = gpu_lock(); ++ gpu_free_internal(ge->mb, p); ++ gpu_unlock_unref(ge); ++} + -+::mc_filter_y10_pxx -+ m_filter_y_pxx 10 ++unsigned int vpu_get_fn(const unsigned int bit_depth) { ++ // Make sure that the gpu is initialized ++ av_assert0(gpu != NULL); ++ switch (bit_depth){ ++ case 8: ++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8); ++ case 10: ++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10); ++ default: ++ av_assert0(0); ++ } ++ return 0; ++} + -+::mc_filter_y10_p00 -+ m_filter_y_p00 10 ++unsigned int vpu_get_constants(void) { ++ av_assert0(gpu != NULL); ++ return gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even); ++} + -+::mc_filter_y10_bxx -+ m_filter_y_bxx 10 ++int gpu_get_mailbox(void) ++{ ++ av_assert0(gpu); ++ return gpu->mb; ++} + -+::mc_filter_y10_b00 -+ m_filter_y_b00 10 ++void gpu_ref(void) ++{ ++ gpu_lock_ref(); ++ gpu_unlock(); ++} + ++void gpu_unref(void) ++{ ++ gpu_env_t * const ge = gpu_lock(); ++ gpu_unlock_unref(ge); ++} + ++// ---------------------------------------------------------------------------- ++// ++// Cache flush functions + -+::mc_end -+# Do not add code here because mc_end must appear after all other code. -diff --git a/libavcodec/rpi_shader_cmd.h b/libavcodec/rpi_shader_cmd.h -new file mode 100644 -index 0000000000..9f8983da52 ---- /dev/null -+++ b/libavcodec/rpi_shader_cmd.h -@@ -0,0 +1,128 @@ -+#ifndef RPI_SHADER_CMD_H -+#define RPI_SHADER_CMD_H ++#define CACHE_EL_MAX 16 + -+#pragma pack(push, 4) ++rpi_cache_flush_env_t * rpi_cache_flush_init() ++{ ++ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t) + ++ sizeof(struct vcsm_user_clean_invalid2_block_s) * CACHE_EL_MAX); ++ if (rfe == NULL) ++ return NULL; + -+#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y -+// If mixed then we are just confused and get a lot of warnings.... 
-+typedef const uint8_t * qpu_mc_src_addr_t; -+typedef uint8_t * qpu_mc_dst_addr_t; -+#else -+typedef uint32_t qpu_mc_src_addr_t; -+typedef uint32_t qpu_mc_dst_addr_t; -+#endif ++ rfe->v.op_count = 0; ++ return rfe; ++} + -+typedef struct qpu_mc_src_s ++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe) +{ -+ int16_t y; -+ int16_t x; -+ qpu_mc_src_addr_t base; -+} qpu_mc_src_t; -+ ++ if (rfe != NULL) ++ free(rfe); ++} + -+typedef struct qpu_mc_pred_c_p_s { -+ qpu_mc_src_t next_src; -+ uint16_t h; -+ uint16_t w; -+ uint32_t coeffs_x; -+ uint32_t coeffs_y; -+ uint32_t wo_u; -+ uint32_t wo_v; -+ qpu_mc_dst_addr_t dst_addr_c; -+ uint32_t next_fn; -+} qpu_mc_pred_c_p_t; ++int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe) ++{ ++ int rc = 0; ++ if (rfe->v.op_count != 0) { ++ if (vcsm_clean_invalid2(&rfe->v) != 0) ++ { ++ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid2 failed: errno=%d\n", errno); ++ rc = -1; ++ } ++ rfe->v.op_count = 0; ++ } ++ return rc; ++} + -+typedef struct qpu_mc_pred_c_b_s { -+ qpu_mc_src_t next_src1; -+ uint16_t h; -+ uint16_t w; -+ uint32_t coeffs_x1; -+ uint32_t coeffs_y1; -+ uint32_t weight_u1; -+ uint32_t weight_v1; -+ qpu_mc_src_t next_src2; -+ uint32_t coeffs_x2; -+ uint32_t coeffs_y2; -+ uint32_t wo_u2; -+ uint32_t wo_v2; -+ qpu_mc_dst_addr_t dst_addr_c; -+ uint32_t next_fn; -+} qpu_mc_pred_c_b_t; ++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe) ++{ ++ int rc = rpi_cache_flush_execute(rfe);; + -+typedef struct qpu_mc_pred_c_s_s { -+ qpu_mc_src_t next_src1; -+ uint32_t pic_cw; // C Width (== Y width / 2) -+ uint32_t pic_ch; // C Height (== Y Height / 2) -+ uint32_t stride2; -+ uint32_t stride1; -+ uint32_t wdenom; -+ qpu_mc_src_t next_src2; -+ uint32_t next_fn; -+} qpu_mc_pred_c_s_t; ++ free(rfe); ++ return rc; ++} + -+typedef struct qpu_mc_pred_c_s { -+ union { -+ qpu_mc_pred_c_p_t p; -+ qpu_mc_pred_c_b_t b; -+ qpu_mc_pred_c_s_t s; -+ }; -+} qpu_mc_pred_c_t; ++inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride) ++{ ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; + ++ av_assert0(rfe->v.op_count <= CACHE_EL_MAX); + -+typedef struct qpu_mc_pred_y_p_s { -+ qpu_mc_src_t next_src1; -+ qpu_mc_src_t next_src2; -+ uint16_t h; -+ uint16_t w; -+ uint32_t mymx21; -+ uint32_t wo1; -+ uint32_t wo2; -+ qpu_mc_dst_addr_t dst_addr; -+ uint32_t next_fn; -+} qpu_mc_pred_y_p_t; ++ b->invalidate_mode = mode; ++ b->block_count = blocks; ++ b->start_address = gm->arm + offset0; ++ b->block_size = block_size; ++ b->inter_block_stride = block_stride; ++} + -+typedef struct qpu_mc_pred_y_p00_s { -+ qpu_mc_src_t next_src1; -+ uint16_t h; -+ uint16_t w; -+ uint32_t wo1; -+ qpu_mc_dst_addr_t dst_addr; -+ uint32_t next_fn; -+} qpu_mc_pred_y_p00_t; ++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset, const unsigned int size) ++{ ++ // Deal with empty pointer trivially ++ if (gm == NULL || size == 0) ++ return; + -+typedef struct qpu_mc_pred_y_s_s { -+ qpu_mc_src_t next_src1; -+ qpu_mc_src_t next_src2; -+ uint16_t pic_h; -+ uint16_t pic_w; -+ uint32_t stride2; -+ uint32_t stride1; -+ uint32_t wdenom; -+ uint32_t next_fn; -+} qpu_mc_pred_y_s_t; ++ av_assert0(offset <= gm->numbytes); ++ av_assert0(size <= gm->numbytes); 
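++ // Each term is checked as well as the sum so that unsigned wrap in
++ // offset + size cannot sneak past the range check.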
++ av_assert0(offset + size <= gm->numbytes); + -+// Only a useful structure in that it allows us to return something other than a void * -+typedef struct qpu_mc_pred_y_s { -+ union { -+ qpu_mc_pred_y_p_t p; -+ qpu_mc_pred_y_p00_t p00; -+ qpu_mc_pred_y_s_t s; -+ }; -+} qpu_mc_pred_y_t; ++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0); ++} + -+typedef union qpu_mc_pred_cmd_u { -+ qpu_mc_pred_y_t y; -+ qpu_mc_pred_c_t c; -+ uint32_t data[1]; -+} qpu_mc_pred_cmd_t; ++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) ++{ ++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0); ++} + -+#define QPU_MC_PRED_N_Y8 12 -+#define QPU_MC_PRED_N_C8 12 + -+#define QPU_MC_PRED_N_Y10 12 -+#define QPU_MC_PRED_N_C10 12 ++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode) ++{ ++#if !RPI_ONE_BUF ++#error Fixme! (NIF) ++#endif ++ if (gpu_is_buf1(frame)) { ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode); ++ } ++ else ++ { ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode); ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode); ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode); ++ } ++} + -+#pragma pack(pop) ++// Flush an area of a frame ++// Width, height, x0, y0 in luma pels ++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, ++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, ++ const unsigned int uv_shift, const int do_luma, const int do_chroma) ++{ ++ const unsigned int y_offset = frame->linesize[0] * y0; ++ const unsigned int y_size = frame->linesize[0] * height; ++ // Round UV up/down to get everything ++ const unsigned int uv_rnd = (1U << uv_shift) >> 1; ++ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift); ++ const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset; + ++#if 0 ++ // *** frame->height is cropped height so not good ++ // As all unsigned they will also reject -ve ++ // Test individually as well as added to reject overflow ++ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped ++ av_assert0(n <= (unsigned int)frame->height); ++ av_assert0(start_line + n <= (unsigned int)frame->height); +#endif + -diff --git a/libavcodec/rpi_shader_template.c b/libavcodec/rpi_shader_template.c -new file mode 100644 -index 0000000000..2d763f54ef ---- /dev/null -+++ b/libavcodec/rpi_shader_template.c -@@ -0,0 +1,66 @@ -+#ifdef RPI -+ -+#include "hevc.h" -+#include "hevcdec.h" -+#include "libavutil/rpi_sand_fns.h" -+#include "rpi_shader_cmd.h" -+#include "rpi_shader_template.h" ++ if (!gpu_is_buf1(frame)) ++ { ++ if (do_luma) { ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size); ++ } ++ if (do_chroma) { ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size); ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size); ++ } ++ } ++ else if (!av_rpi_is_sand_frame(frame)) ++ { ++ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); ++ if (do_luma) { ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size); ++ } ++ if (do_chroma) { ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size); ++ 
rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size); ++ } ++ } ++ else ++ { ++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); ++ const unsigned int xshl = av_rpi_sand_frame_xshl(frame); ++ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1); ++ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C ++ av_assert0(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX); + -+typedef struct shader_track_s -+{ -+ const union qpu_mc_pred_cmd_u *qpu_mc_curr; -+ const struct qpu_mc_src_s *last_l0; -+ const struct qpu_mc_src_s *last_l1; -+ uint32_t width; // pic_width * PW -+ uint32_t height; -+ uint32_t stride2; -+ uint32_t stride1; -+ uint32_t wdenom; -+} shader_track_t; ++ if (do_chroma) ++ { ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; ++ b->invalidate_mode = mode; ++ b->block_count = block_count; ++ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1); ++ b->block_size = uv_size; ++ b->inter_block_stride = stride1 * stride2; ++ } ++ if (do_luma) ++ { ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; ++ b->invalidate_mode = mode; ++ b->block_count = block_count; ++ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0); ++ b->block_size = y_size; ++ b->inter_block_stride = stride1 * stride2; ++ } ++ } ++} + -+static int wtoidx(const unsigned int w) ++// Call this to clean and invalidate a region of memory ++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode) +{ -+ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; -+ return pel_weight[w]; ++ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(); ++ rpi_cache_flush_add_gm_ptr(rfe, p, mode); ++ rpi_cache_flush_finish(rfe); +} + -+static const int fctom(uint32_t x) ++ ++// ---------------------------------------------------------------------------- ++ ++ ++// Wait abstractions - mostly so we can easily add profile code ++static void vq_wait_pool_init(vq_wait_pool_t * const wp) +{ -+ int rv; -+ // As it happens we can take the 2nd filter term & divide it by 8 -+ // (dropping fractions) to get the fractional move -+ rv = 8 - ((x >> 11) & 0xf); -+ av_assert2(rv >= 0 && rv <= 7); -+ return rv; ++ unsigned int i; ++ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) { ++ sem_init(&wp->pool[i].sem, 0, 0); ++ wp->pool[i].next = wp->pool + i + 1; ++ } ++ wp->head = wp->pool + 0; ++ wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL; +} + -+static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr) ++static void vq_wait_pool_deinit(vq_wait_pool_t * const wp) +{ -+ return (x << shl) >> shr; ++ unsigned int i; ++ wp->head = NULL; ++ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) { ++ sem_destroy(&wp->pool[i].sem); ++ wp->pool[i].next = NULL; ++ } +} + -+static inline int woff_p(HEVCContext *const s, int32_t x) ++ ++// If sem_init actually takes time then maybe we want a pool... 
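++// The pool is a simple LIFO free list of VQ_WAIT_POOL_SIZE pre-initialised
++// semaphores. A rough usage sketch (names as defined in this file):
++// vq_wait_t * const w = vq_wait_new();  // take one from the pool
++// // ... submit work, arranging for vq_wait_post(w) on completion ...
++// vq_wait_wait(w);                      // block until posted (EINTR-safe)
++// vq_wait_delete(w);                    // push it back on the free list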
++static vq_wait_t * vq_wait_new(void) +{ -+ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8); ++ gpu_env_t * const ge = gpu_lock_ref(); ++ vq_wait_t * const wait = ge->wait_pool.head; ++ ge->wait_pool.head = wait->next; ++ wait->next = NULL; ++ ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ tto_start(&ge->ttw.active, ns_time()); ++#endif ++ ++ gpu_unlock(); ++ return wait; +} + -+static inline int woff_b(HEVCContext *const s, int32_t x) ++static void vq_wait_delete(vq_wait_t * const wait) +{ -+ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8); ++ gpu_env_t * const ge = gpu_lock(); ++ wait->next = ge->wait_pool.head; ++ ge->wait_pool.head = wait; ++ ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ { ++ trace_time_wait_t * const ttw = &ge->ttw; ++ const int64_t now = ns_time(); ++ ++ttw->jcount; ++ tto_end(&ttw->wait, now); ++ ++ if (ttw->start0 == 0) ++ { ++ ttw->start0 = ttw->active.start[0]; ++ ttw->last_update = ttw->start0; ++ } ++ if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD) ++ { ++ ttw->last_update += WAIT_TIME_PRINT_PERIOD; ++ ttw_print(ttw, now); ++ } ++ } ++#endif ++ gpu_unlock_unref(ge); +} + -+static inline int wweight(int32_t x) ++static void vq_wait_wait(vq_wait_t * const wait) +{ -+ return ext(x, 16, 16); ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ { ++ const int64_t now = ns_time(); ++ gpu_env_t * const ge = gpu_lock(); ++ tto_start(&ge->ttw.wait, now); ++ gpu_unlock(); ++ } ++#endif ++ ++ while (sem_wait(&wait->sem) == -1 && errno == EINTR) ++ /* loop */; +} + ++static void vq_wait_post(vq_wait_t * const wait) ++{ ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ { ++ gpu_env_t *const ge = gpu_lock(); ++ tto_end(&ge->ttw.active, ns_time()); ++ gpu_unlock(); ++ } ++#endif + -+#define PW 1 -+#include "rpi_shader_template_fn.h" ++ sem_post(&wait->sem); ++} + -+#undef PW -+#define PW 2 -+#include "rpi_shader_template_fn.h" + -+#endif + -diff --git a/libavcodec/rpi_shader_template.h b/libavcodec/rpi_shader_template.h -new file mode 100644 -index 0000000000..ecf5b8185a ---- /dev/null -+++ b/libavcodec/rpi_shader_template.h -@@ -0,0 +1,24 @@ -+#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H -+#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H ++// Header comments were wrong for these two ++#define VPU_QPU_MASK_QPU 1 ++#define VPU_QPU_MASK_VPU 2 + -+#ifdef RPI -+struct HEVCContext; -+struct HEVCRpiInterPredEnv; ++#define VPU_QPU_JOB_MAX 4 ++struct vpu_qpu_job_env_s ++{ ++ unsigned int n; ++ unsigned int mask; ++ struct gpu_job_s j[VPU_QPU_JOB_MAX]; ++}; + -+void rpi_shader_c8(struct HEVCContext *const s, -+ const struct HEVCRpiInterPredEnv *const ipe_y, -+ const struct HEVCRpiInterPredEnv *const ipe_c); ++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t; + -+void rpi_shader_c16(struct HEVCContext *const s, -+ const struct HEVCRpiInterPredEnv *const ipe_y, -+ const struct HEVCRpiInterPredEnv *const ipe_c); ++vpu_qpu_job_env_t * vpu_qpu_job_new(void) ++{ ++ vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t)); ++ return vqj; ++} + -+void rpi_sand_dump8(const char * const name, -+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); ++void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj) ++{ ++ memset(vqj, 0, sizeof(*vqj)); ++ free(vqj); ++} + -+void rpi_sand_dump16(const char * const name, -+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); ++static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj) ++{ ++ struct gpu_job_s * const j = vqj->j + vqj->n++; ++ av_assert0(vqj->n <= 
VPU_QPU_JOB_MAX); ++ return j; ++} + -+#endif -+#endif ++void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code, ++ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5) ++{ ++ if (vpu_code != 0) { ++ struct gpu_job_s *const j = new_job(vqj); ++ vqj->mask |= VPU_QPU_MASK_VPU; + -diff --git a/libavcodec/rpi_shader_template_fn.h b/libavcodec/rpi_shader_template_fn.h -new file mode 100644 -index 0000000000..b5ac2ceed6 ---- /dev/null -+++ b/libavcodec/rpi_shader_template_fn.h -@@ -0,0 +1,477 @@ -+#define STRCAT(x,y) x##y ++ j->command = EXECUTE_VPU; ++ // The bottom two bits of the execute address contain no-flush flags ++ // b0 will flush the VPU I-cache if unset so we nearly always want that set ++ // as we never reload code ++ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed; ++ j->u.v.q[1] = r0; ++ j->u.v.q[2] = r1; ++ j->u.v.q[3] = r2; ++ j->u.v.q[4] = r3; ++ j->u.v.q[5] = r4; ++ j->u.v.q[6] = r5; ++ gpu->vpu_i_cache_flushed = 1; ++ } ++} + -+#if PW == 1 -+#define pixel uint8_t -+#define FUNC(f) STRCAT(f, 8) -+#elif PW == 2 -+#define pixel uint16_t -+#define FUNC(f) STRCAT(f, 16) ++// flags are QPU_FLAGS_xxx ++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail) ++{ ++ if (n != 0) { ++ struct gpu_job_s *const j = new_job(vqj); ++ vqj->mask |= VPU_QPU_MASK_QPU; ++ ++ j->command = EXECUTE_QPU; ++ j->u.q.jobs = n; ++#if RPI_TRACE_QPU_PROFILE_ALL ++ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS; +#else -+#error Unexpected PW ++ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU; +#endif -+ -+#define PATCH_STRIDE (16 * PW) -+ -+static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) -+{ -+ for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) { -+ const pixel s = *(const pixel *)src; -+ pixel * d = (pixel *)dst; -+ for (unsigned int j = 0; j < w; j += PW) { -+ *d++ = s; -+ } -+ } ++ j->u.q.timeout = 5000; ++ memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t)); ++ } +} + -+static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) ++// Convert callback to sem post ++static void vpu_qpu_job_callback_wait(void * v) +{ -+ for (unsigned int i = 0; i != h; ++i, dst += stride) { -+ memcpy(dst, src, w); -+ } ++ vq_wait_post(v); +} + -+static void FUNC(get_patch_y)(const shader_track_t * const st, -+ uint8_t * dst, const unsigned int dst_stride, -+ const qpu_mc_src_t *src, -+ unsigned int _w, unsigned int _h) ++void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h) +{ -+ int x = src->x * PW; -+ int y = src->y; -+ int w = _w * PW; -+ int h = _h; -+ int dl = 0; -+ int dr = 0; -+ int dt = 0; -+ int db = 0; -+ -+ if (x < 0) { -+ if (-x >= w) -+ x = PW - w; -+ dl = -x; -+ w += x; -+ x = 0; -+ } -+ if (x + w > st->width) { -+ if (x >= st->width) -+ x = st->width - PW; -+ dr = (x + w) - st->width; -+ w = st->width - x; -+ } ++ vq_wait_t * wait; + -+ // Y -+ if (y < 0) { -+ if (-y >= h) -+ y = 1 - h; -+ dt = -y; -+ h += y; -+ y = 0; -+ } -+ if (y + h > st->height) { -+ if (y >= st->height) -+ y = st->height - 1; -+ db = (y + h) - st->height; -+ h = st->height - y; -+ } ++ if (vqj->mask == 0) { ++ *wait_h = NULL; ++ return; ++ } + -+ dst += dl + dt * dst_stride; -+ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, 
st->stride1, st->stride2, x, y, w, h); ++ // We are going to want a sync object ++ wait = vq_wait_new(); + -+ // Edge dup -+ if (dl != 0) -+ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride); -+ if (dr != 0) -+ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride); -+ w += dl + dr; -+ dst -= dl; ++ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync ++ // If we only posted one thing or only QPU jobs ++ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU) ++ { ++ struct gpu_job_s * const j = vqj->j + (vqj->n - 1); ++ av_assert0(j->callback.func == 0); + -+ if (dt != 0) -+ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride); -+ if (db != 0) -+ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride); -+} ++ j->callback.func = vpu_qpu_job_callback_wait; ++ j->callback.cookie = wait; ++ } ++ else ++ { ++ struct gpu_job_s *const j = new_job(vqj); + ++ j->command = EXECUTE_SYNC; ++ j->u.s.mask = vqj->mask; ++ j->callback.func = vpu_qpu_job_callback_wait; ++ j->callback.cookie = wait; ++ } + ++ vqj->mask = 0; ++ *wait_h = wait; ++} + -+static void FUNC(get_patch_c)(const shader_track_t * const st, -+ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride, -+ const qpu_mc_src_t *src, -+ unsigned int _w, unsigned int _h) ++int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj) +{ -+ int x = src->x * PW; -+ int y = src->y; -+ int w = _w * PW; -+ int h = _h; -+ int dl = 0; -+ int dr = 0; -+ int dt = 0; -+ int db = 0; -+ const int width = st->width; -+ const int height = st->height; ++ return vqj->n == 0 ? 0 : vc_gpuserv_execute_code(vqj->n, vqj->j); ++} + -+ if (x < 0) { -+ if (-x >= w) -+ x = PW - w; -+ dl = -x; -+ w += x; -+ x = 0; -+ } -+ if (x + w > width) { -+ if (x >= width) -+ x = width - PW; -+ dr = (x + w) - width; -+ w = width - x; -+ } ++// Simple wrapper of start + delete ++int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj) ++{ ++ int rv; ++ rv = vpu_qpu_job_start(vqj); ++ vpu_qpu_job_delete(vqj); ++ return rv; ++} + -+ // Y -+ if (y < 0) { -+ if (-y >= h) -+ y = 1 - h; -+ dt = -y; -+ h += y; -+ y = 0; -+ } -+ if (y + h > height) { -+ if (y >= height) -+ y = height - 1; -+ db = (y + h) - height; -+ h = height - y; ++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h) ++{ ++ if (wait_h != NULL) ++ { ++ vq_wait_t * const wait = *wait_h; ++ if (wait != NULL) { ++ *wait_h = NULL; ++ vq_wait_wait(wait); ++ vq_wait_delete(wait); + } ++ } ++} + -+ dst_u += dl + dt * dst_stride; -+ dst_v += dl + dt * dst_stride; -+ FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); ++int vpu_qpu_init() ++{ ++ gpu_env_t * const ge = gpu_lock_ref(); ++ if (ge == NULL) ++ return -1; + -+ // Edge dup -+ if (dl != 0) -+ { -+ FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride); -+ FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride); -+ } -+ if (dr != 0) -+ { -+ FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride); -+ FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride); -+ } -+ w += dl + dr; -+ dst_u -= dl; -+ dst_v -= dl; ++ if (ge->init_count++ == 0) ++ { ++ vc_gpuserv_init(); ++ } + -+ if (dt != 0) -+ { -+ FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride); -+ FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride); -+ } -+ if (db != 0) -+ { -+ FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride); -+ FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride); -+ } ++ gpu_unlock(); ++ return 0; +} + -+// w, 
y, w, h in pixels
-+// stride1, stride2 in bytes
-+void FUNC(rpi_sand_dump)(const char * const name,
-+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c)
++void vpu_qpu_term()
+{
-+ const int mask = stride2 == 0 ? ~0 : stride1 - 1;
-+
-+ printf("%s (%d,%d) %dx%d\n", name, x, y, w, h);
++ gpu_env_t * const ge = gpu_lock();
+
-+ if (is_c) {
-+ x *= 2;
-+ w *= 2;
-+ }
++ if (--ge->init_count == 0) {
++ vc_gpuserv_deinit();
+
-+ for (int i = y; i != y + h; ++i) {
-+ for (int j = x; j != x + w; ++j) {
-+ const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2;
-+ char sep = is_c && (j & 1) == 0 ? ':' : ' ';
-+#if PW == 1
-+ if (j < 0 || i < 0)
-+ printf("..%c", sep);
-+ else
-+ printf("%02x%c", *(const pixel*)p, sep);
-+#else
-+ if (j < 0 || i < 0)
-+ printf("...%c", sep);
-+ else
-+ printf("%03x%c", *(const pixel*)p, sep);
+#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ ttw_print(&ge->ttw, ns_time());
+#endif
-+ }
-+ printf("\n");
-+ }
++ }
++
++ gpu_unlock_unref(ge);
++}
++
++uint32_t qpu_fn(const int * const mc_fn)
++{
++ return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)ff_hevc_rpi_shader) + offsetof(struct GPU, qpu_code);
+}
+
+
-+void FUNC(rpi_shader_c)(HEVCContext *const s,
-+ const HEVCRpiInterPredEnv *const ipe_y,
-+ const HEVCRpiInterPredEnv *const ipe_c)
++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth)
+{
-+ for (int c_idx = 0; c_idx < 2; ++c_idx)
-+ {
-+ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c;
-+ shader_track_t tracka[QPU_N_MAX] = {{NULL}};
-+ unsigned int exit_n = 0;
++ // Dummy values we can catch with emulation
++ qf->y_pxx = ~1U;
++ qf->y_bxx = ~2U;
++ qf->y_p00 = ~3U;
++ qf->y_b00 = ~4U;
++ qf->c_pxx = ~5U;
++ qf->c_bxx = ~6U;
+
-+ if (ipe == NULL || !ipe->used) {
-+ continue;
-+ }
++ switch (bit_depth) {
++ case 8:
++ qf->y_pxx = qpu_fn(mc_filter_y_pxx);
++ qf->y_bxx = qpu_fn(mc_filter_y_bxx);
++ qf->y_p00 = qpu_fn(mc_filter_y_p00);
++ qf->y_b00 = qpu_fn(mc_filter_y_b00);
++ qf->c_pxx = qpu_fn(mc_filter_c_p);
++ qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1);
++ qf->c_bxx = qpu_fn(mc_filter_c_b);
++ break;
++ case 10:
++ qf->c_pxx = qpu_fn(mc_filter_c10_p);
++ qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1);
++ qf->c_bxx = qpu_fn(mc_filter_c10_b);
++ qf->y_pxx = qpu_fn(mc_filter_y10_pxx);
++ qf->y_bxx = qpu_fn(mc_filter_y10_bxx);
++ qf->y_p00 = qpu_fn(mc_filter_y10_p00);
++ qf->y_b00 = qpu_fn(mc_filter_y10_b00);
++ break;
++ default:
++ return -1;
++ }
++ return 0;
++}
+
-+ do {
-+ for (unsigned int i = 0; i != ipe->n; ++i) {
-+ const HEVCRpiInterPredQ * const q = ipe->q + i;
-+ shader_track_t * const st = tracka + i;
-+ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr;
+
-+ for (;;) {
-+ const uint32_t link = (cmd == q->qpu_mc_base) ? 
q->code_setup : ((uint32_t *)cmd)[-1]; ++#define RPI_ONE_BUF 1 + -+ if (link == q->code_setup) { -+ if (c_idx == 0) { -+ // Luma -+ const qpu_mc_pred_y_s_t *const c = &cmd->y.s; ++typedef struct gpu_mem_ptr_s { ++ unsigned char *arm; // Pointer to memory mapped on ARM side ++ int vc_handle; // Videocore handle of relocatable memory ++ int vcsm_handle; // Handle for use by VCSM ++ int vc; // Address for use in GPU code ++ int numbytes; // Size of memory block ++} GPU_MEM_PTR_T; + -+ st->height = c->pic_h; -+ st->width = c->pic_w * PW; -+ st->stride1 = c->stride1; -+ st->stride2 = c->stride2; -+ st->wdenom = c->wdenom; -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else { -+ // Chroma -+ const qpu_mc_pred_c_s_t *const c = &cmd->c.s; ++// General GPU functions ++extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p); ++extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p); ++extern void gpu_free(GPU_MEM_PTR_T * const p); + -+ st->height = c->pic_ch; -+ st->width = c->pic_cw * PW; -+ st->stride1 = c->stride1; -+ st->stride2 = c->stride2; -+ st->wdenom = c->wdenom; -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ } -+ else if (link == s->qpu.y_pxx) { -+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; -+ const int w1 = FFMIN(c->w, 8); -+ const int w2 = c->w - w1; ++#include "libavutil/frame.h" ++#if !RPI_ONE_BUF ++static inline uint32_t get_vc_address_y(const AVFrame * const frame) { ++ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[0]); ++ return p->vc; ++} + -+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++static inline uint32_t get_vc_address_u(const AVFrame * const frame) { ++ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]); ++ return p->vc; ++} + -+ FUNC(get_patch_y)(st, -+ patch_y1, PATCH_STRIDE, -+ st->last_l0, -+ 16, c->h + 7); -+ if (w2 > 0) { -+ FUNC(get_patch_y)(st, -+ patch_y2, PATCH_STRIDE, -+ st->last_l1, -+ 16, c->h + 7); -+ } ++static inline uint32_t get_vc_address_v(const AVFrame * const frame) { ++ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[2]); ++ return p->vc; ++} + -+ // wo[offset] = offset*2+1 -+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( -+ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, -+ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1); -+ if (w2 > 0) { -+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( -+ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, -+ c->h, st->wdenom, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2); -+ } -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.y_bxx) { -+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) { ++ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[0]); ++} + -+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; ++static inline GPU_MEM_PTR_T 
get_gpu_mem_ptr_u(const AVFrame * const frame) { ++ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[1]); ++} + -+ FUNC(get_patch_y)(st, -+ patch_y1, PATCH_STRIDE, -+ st->last_l0, -+ 16, c->h + 7); -+ FUNC(get_patch_y)(st, -+ patch_y2, PATCH_STRIDE, -+ st->last_l1, -+ 16, c->h + 7); ++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) { ++ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[2]); ++} + -+ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( -+ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, -+ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w); ++#else + -+ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( -+ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3, -+ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2), -+ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w); -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.y_p00) { -+ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00; ++static inline int gpu_is_buf1(const AVFrame * const frame) ++{ ++ return frame->buf[1] == NULL; ++} + -+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame) ++{ ++ return av_buffer_get_opaque(frame->buf[0]); ++} + -+ FUNC(get_patch_y)(st, -+ patch_y1, PATCH_STRIDE, -+ st->last_l0, -+ 16, c->h + 7); ++static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n) ++{ ++ return av_buffer_pool_opaque(frame->buf[n]); ++} + -+ // wo[offset] = offset*2+1 -+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0]( -+ (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE, -+ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w); ++static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n) ++{ ++ const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? 
gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n); ++ return gm->vc + (frame->data[n] - gm->arm); ++} + -+ st->last_l0 = &c->next_src1; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.y_b00) { -+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; + -+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; ++static inline uint32_t get_vc_address_y(const AVFrame * const frame) { ++ return get_vc_address3(frame, 0); ++} + -+ av_assert0(c->w <= 16 && c->h <= 64); ++static inline uint32_t get_vc_address_u(const AVFrame * const frame) { ++ return get_vc_address3(frame, 1); ++} + -+ FUNC(get_patch_y)(st, -+ patch_y1, PATCH_STRIDE, -+ st->last_l0, -+ 16, c->h); -+ FUNC(get_patch_y)(st, -+ patch_y2, PATCH_STRIDE, -+ st->last_l1, -+ 16, c->h); ++static inline uint32_t get_vc_address_v(const AVFrame * const frame) { ++ return get_vc_address3(frame, 2); ++} ++ ++#if 0 ++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) { ++ if (gpu_is_buf1(frame)) ++ { ++ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame); ++ g.numbytes = frame->data[1] - frame->data[0]; ++ return g; ++ } ++ else ++ return *gpu_buf3_gmem(frame, 0); ++} ++ ++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) { ++ if (gpu_is_buf1(frame)) ++ { ++ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame); ++ g.arm += frame->data[1] - frame->data[0]; ++ g.vc += frame->data[1] - frame->data[0]; ++ g.numbytes = frame->data[2] - frame->data[1]; // chroma size ++ return g; ++ } ++ else ++ return *gpu_buf3_gmem(frame, 1); ++} + -+ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0]( -+ patch_y3, patch_y1, PATCH_STRIDE, -+ c->h, 0, 0, c->w); ++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) { ++ if (gpu_is_buf1(frame)) ++ { ++ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame); ++ g.arm += frame->data[2] - frame->data[0]; ++ g.vc += frame->data[2] - frame->data[0]; ++ g.numbytes = frame->data[2] - frame->data[1]; // chroma size ++ return g; ++ } ++ else ++ return *gpu_buf3_gmem(frame, 2); ++} ++#endif ++#endif + -+ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0]( -+ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3, -+ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2), -+ 0, woff_b(s, c->wo2), 0, 0, c->w); -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.c_pxx) { -+ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; -+ const int mx = fctom(c->coeffs_x); -+ const int my = fctom(c->coeffs_y); ++// Cache flush stuff + -+ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_u3[8 * 16 * PW]; -+ uint8_t patch_v3[8 * 16 * PW]; ++struct rpi_cache_flush_env_s; ++typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t; + -+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); ++rpi_cache_flush_env_t * rpi_cache_flush_init(void); ++// Free env without flushing ++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe); ++// Do the accumulated flush & clear but do not free the env ++int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe); ++// Do the accumulated flush & free the env ++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe); + -+ 
s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( -+ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); -+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( -+ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); ++typedef enum ++{ ++ RPI_CACHE_FLUSH_MODE_INVALIDATE = 1, ++ RPI_CACHE_FLUSH_MODE_WRITEBACK = 2, ++ RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3 ++} rpi_cache_flush_mode_t; + -+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode); ++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode, ++ const unsigned int offset, const unsigned int size); ++void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride); ++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode); ++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode, ++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, ++ const unsigned int uv_shift, const int do_luma, const int do_chroma); + -+ st->last_l0 = &c->next_src; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.c_pxx_l1) { -+ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; -+ const int mx = fctom(c->coeffs_x); -+ const int my = fctom(c->coeffs_y); ++// init, add, finish for one gm ptr ++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode); + -+ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) -+ uint8_t patch_u3[8 * 16 * PW]; -+ uint8_t patch_v3[8 * 16 * PW]; + -+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); ++// QPU specific functions + -+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( -+ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); -+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( -+ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); ++typedef struct HEVCRpiQpu { ++ uint32_t c_pxx; ++ uint32_t c_pxx_l1; ++ uint32_t c_bxx; ++ uint32_t y_pxx; ++ uint32_t y_bxx; ++ uint32_t y_p00; ++ uint32_t y_b00; ++} HEVCRpiQpu; + -+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth); + -+ st->last_l1 = &c->next_src; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == s->qpu.c_bxx) { -+ const qpu_mc_pred_c_b_t *const c = &cmd->c.b; -+ const int mx1 = fctom(c->coeffs_x1); -+ const int my1 = fctom(c->coeffs_y1); 
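// Editorial aside, not part of the original patch: this ARM-side emulation
// unpacks the same packed command words the QPU firmware consumes. fctom()
// earlier in this file recovers the fractional chroma mv from a coefficient
// word as 8 - ((x >> 11) & 0xf), and the wo_* words split as
// weight = (int32_t)(wo << 16) >> 16 (low 16 bits, sign-extended; see
// wweight()) with the offset held in the bits above (see woff_p()/woff_b()).
// Worked example with hypothetical numbers at 8-bit depth:
// wo = (25 << 17) | 30 gives wweight(wo) == 30 and woff_p(s, wo) == 25.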
-+ const int mx2 = fctom(c->coeffs_x2); -+ const int my2 = fctom(c->coeffs_y2); ++uint32_t qpu_fn(const int * const mc_fn); + -+ uint8_t patch_u1[PATCH_STRIDE * 72]; -+ uint8_t patch_v1[PATCH_STRIDE * 72]; -+ uint8_t patch_u2[PATCH_STRIDE * 72]; -+ uint8_t patch_v2[PATCH_STRIDE * 72]; -+ uint8_t patch_u3[8 * 16 * PW]; -+ uint8_t patch_v3[8 * 16 * PW]; -+ uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE]; -+ uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE]; ++#define QPU_N_GRP 4 ++#define QPU_N_MAX 12 + -+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); -+ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); ++#define QPU_MAIL_EL_VALS 2 + -+ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( -+ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, mx1, my1, c->w); -+ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( -+ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, -+ c->h, mx1, my1, c->w); ++struct vpu_qpu_wait_s; ++typedef struct vq_wait_s * vpu_qpu_wait_h; + -+ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( -+ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4, -+ c->h, st->wdenom, c->weight_u1, wweight(c->wo_u2), -+ 0, woff_b(s, c->wo_u2), mx2, my2, c->w); -+ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( -+ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4, -+ c->h, st->wdenom, c->weight_v1, wweight(c->wo_v2), -+ 0, woff_b(s, c->wo_v2), mx2, my2, c->w); ++// VPU specific functions + -+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++struct vpu_qpu_job_env_s; ++typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h; + -+ st->last_l0 = &c->next_src1; -+ st->last_l1 = &c->next_src2; -+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); -+ } -+ else if (link == q->code_sync) { -+ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1); -+ break; -+ } -+ else if (link == q->code_exit) { -+ // We expect exit to occur without other sync -+ av_assert0(i == exit_n); -+ ++exit_n; -+ break; -+ } -+ else { -+ av_assert0(0); -+ } -+ } ++vpu_qpu_job_h vpu_qpu_job_new(void); ++void vpu_qpu_job_delete(const vpu_qpu_job_h vqj); ++void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code, ++ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5); ++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail); ++void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h); ++int vpu_qpu_job_start(const vpu_qpu_job_h vqj); ++int vpu_qpu_job_finish(const vpu_qpu_job_h vqj); + -+ st->qpu_mc_curr = cmd; -+ } -+ } while (exit_n == 0); -+ } -+} ++extern unsigned int vpu_get_fn(const unsigned int bit_depth); ++extern unsigned int vpu_get_constants(void); + -+#undef FUNC -+#undef pixel ++// Waits for previous post_codee to complete and Will null out *wait_h after use ++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h); ++int vpu_qpu_init(void); ++void vpu_qpu_term(void); ++ ++extern int gpu_get_mailbox(void); ++void gpu_ref(void); ++void gpu_unref(void); + ++#endif diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c new file mode 100644 -index 0000000000..97d58abc0a +index 0000000000..185288da5a --- /dev/null +++ b/libavcodec/rpi_zc.c -@@ -0,0 +1,745 @@ -+#include "config.h" -+#ifdef RPI +@@ -0,0 +1,741 @@ +#include 
"libavcodec/avcodec.h" +#include "rpi_qpu.h" +#include "rpi_mailbox.h" @@ -32156,8 +40432,6 @@ index 0000000000..97d58abc0a + } +} + -+#endif // RPI -+ diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h new file mode 100644 index 0000000000..26fb3be999 @@ -32285,6 +40559,51 @@ index 13668c2105..bebf9024ec 100644 return 0; } +diff --git a/libavcodec/utils.c b/libavcodec/utils.c +index 9551f312e7..a1f68b8e30 100644 +--- a/libavcodec/utils.c ++++ b/libavcodec/utils.c +@@ -1277,6 +1277,40 @@ AVCodec *avcodec_find_decoder(enum AVCodecID id) + return find_encdec(id, 0); + } + ++static int codec_supports_format(const AVCodec * const p, const enum AVPixelFormat fmt) ++{ ++ const enum AVPixelFormat *pf = p->pix_fmts; ++ ++ // Assume good if we lack info ++ if (pf == NULL) ++ return 1; ++ if (fmt == AV_PIX_FMT_NONE) ++ return 0; ++ ++ for (; *pf != AV_PIX_FMT_NONE; ++pf) { ++ if (*pf == fmt) ++ return 1; ++ } ++ return 0; ++} ++ ++AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt) ++{ ++ AVCodec *p, *experimental = NULL; ++ p = first_avcodec; ++ id= remap_deprecated_codec_id(id); ++ while (p) { ++ if (av_codec_is_decoder(p) && p->id == id && codec_supports_format(p, fmt)) { ++ if (p->capabilities & AV_CODEC_CAP_EXPERIMENTAL && !experimental) { ++ experimental = p; ++ } else ++ return p; ++ } ++ p = p->next; ++ } ++ return experimental; ++} ++ + AVCodec *avcodec_find_decoder_by_name(const char *name) + { + AVCodec *p; diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c index f0f849b326..cd97974748 100644 --- a/libavfilter/avfilter.c @@ -32310,21 +40629,8 @@ index ad5aedd5f7..0d2df8b870 100644 frame->format); break; case AVMEDIA_TYPE_AUDIO: -diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c -index 53cbcfb543..f93f06fcfb 100644 ---- a/libavformat/mpegts.c -+++ b/libavformat/mpegts.c -@@ -701,7 +701,7 @@ static const StreamType ISO_types[] = { - #endif - { 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 }, - { 0x1c, AVMEDIA_TYPE_AUDIO, AV_CODEC_ID_AAC }, -- { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 }, -+ { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264_MVC }, - { 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000 }, - { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC }, - { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS }, diff --git a/libavformat/utils.c b/libavformat/utils.c -index 1a7996c4fd..154942fe74 100644 +index 1a7996c4fd..271e70ed84 100644 --- a/libavformat/utils.c +++ b/libavformat/utils.c @@ -750,7 +750,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in @@ -32336,27 +40642,111 @@ index 1a7996c4fd..154942fe74 100644 continue; s->streams[i]->pts_wrap_reference = pts_wrap_reference; s->streams[i]->pts_wrap_behavior = pts_wrap_behavior; +@@ -2940,6 +2940,40 @@ static int has_codec_parameters(AVStream *st, const char **errmsg_ptr) + return 1; + } + ++#if CONFIG_HEVC_RPI_DECODER && CONFIG_HEVC_DECODER ++// This should be quite general purpose but avoid possible conflicts ++// by limiting usage to cases wehere we know it works. ++static int try_fallback_decoder(AVCodecContext * const avctx, const AVCodec *const old_codec, AVDictionary ** const opts) ++{ ++ // Only try fallback if we know it is supported (HEVC only) ++ const AVCodec *const new_codec = old_codec->id != AV_CODEC_ID_HEVC ? 
NULL :
+ avcodec_find_decoder_by_id_and_fmt(old_codec->id, AV_PIX_FMT_NONE);
+ int err;
+
+ // Failed to find fallback or we are already at the fallback
+ if (new_codec == NULL || new_codec == old_codec)
+ {
+ return AVERROR_DECODER_NOT_FOUND;
+ }
+
+ // * This may be dodgy - header says to not use this fn,
+ // especially if we are going to reopen the context...
+ // (but it does seem to work for our cases)
+ if (avcodec_is_open(avctx)) {
+ avcodec_close(avctx);
+ }
+
+ if ((err = avcodec_open2(avctx, new_codec, opts)) < 0)
+ {
+ return err;
+ }
+
+ return 0;
+}
+#else
+#define try_fallback_decoder(avctx, old_codec, opts) (AVERROR_DECODER_NOT_FOUND)
+#endif
+
 /* returns 1 or 0 if or if not decoded data was returned, or a negative error */
 static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt,
 AVDictionary **options)
@@ -2974,7 +3008,11 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt,
 av_dict_set(options ? options : &thread_opt, "threads", "1", 0);
 if (s->codec_whitelist)
 av_dict_set(options ? options : &thread_opt, "codec_whitelist", s->codec_whitelist, 0);
- ret = avcodec_open2(avctx, codec, options ? options : &thread_opt);
+ if ((ret = avcodec_open2(avctx, codec, options ? options : &thread_opt)) == AVERROR_DECODER_NOT_FOUND)
+ {
+ // Try fallback if it looks worth a try
+ ret = try_fallback_decoder(avctx, codec, options ? options : &thread_opt);
+ }
 if (!options)
 av_dict_free(&thread_opt);
 if (ret < 0) {
@@ -3005,6 +3043,14 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt,
 if (avctx->codec_type == AVMEDIA_TYPE_VIDEO ||
 avctx->codec_type == AVMEDIA_TYPE_AUDIO) {
 ret = avcodec_send_packet(avctx, &pkt);
+
+ // If we are going to want to fall back we should know here
+ if (ret == AVERROR_DECODER_NOT_FOUND) {
+ if ((ret = try_fallback_decoder(avctx, avctx->codec, options)) < 0)
+ break;
+ continue;
+ }
+
 if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF)
 break;
 if (ret >= 0)
@@ -3601,9 +3647,20 @@ FF_ENABLE_DEPRECATION_WARNINGS
 // Try to just open decoders, in case this is enough to get parameters.
 if (!has_codec_parameters(st, NULL) && st->request_probe <= 0) {
 if (codec && !avctx->codec)
- if (avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt) < 0)
- av_log(ic, AV_LOG_WARNING,
- "Failed to open codec in %s\n",__FUNCTION__);
+ {
+ int err;
+
+ if ((err = avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt)) < 0)
+ {
+ if (err == AVERROR_DECODER_NOT_FOUND) {
+ err = try_fallback_decoder(avctx, codec, options ? 
&options[i] : &thread_opt); ++ } ++ if (err < 0) { ++ av_log(ic, AV_LOG_WARNING, ++ "Failed to open codec in %s\n",__FUNCTION__); ++ } ++ } ++ } + } + if (!options) + av_dict_free(&thread_opt); diff --git a/libavutil/Makefile b/libavutil/Makefile -index 65e285a701..afb3effa2e 100644 +index 65e285a701..2ca778c59f 100644 --- a/libavutil/Makefile +++ b/libavutil/Makefile -@@ -62,6 +62,8 @@ HEADERS = adler32.h \ - rational.h \ - replaygain.h \ - ripemd.h \ -+ rpi_sand_fns.h \ -+ rpi_sand_fn_pw.h \ - samplefmt.h \ - sha.h \ - sha512.h \ -@@ -140,6 +142,7 @@ OBJS = adler32.o \ - reverse.o \ - rc4.o \ - ripemd.o \ -+ rpi_sand_fns.o \ - samplefmt.o \ - sha.o \ - sha512.o \ +@@ -165,6 +165,7 @@ OBJS-$(CONFIG_QSV) += hwcontext_qsv.o + OBJS-$(CONFIG_LIBDRM) += hwcontext_drm.o + OBJS-$(CONFIG_LZO) += lzo.o + OBJS-$(CONFIG_OPENCL) += opencl.o opencl_internal.o ++OBJS-$(CONFIG_RPI) += rpi_sand_fns.o + OBJS-$(CONFIG_VAAPI) += hwcontext_vaapi.o + OBJS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.o + OBJS-$(CONFIG_VDPAU) += hwcontext_vdpau.o diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile index 5da44b0542..b74b7c4e2f 100644 --- a/libavutil/arm/Makefile @@ -32441,22 +40831,33 @@ index 73b6bd0b14..d907de3f1c 100644 * @} */ diff --git a/libavutil/frame.c b/libavutil/frame.c -index d5fd2932e3..1851e3655f 100644 +index d5fd2932e3..151a33a24d 100644 --- a/libavutil/frame.c +++ b/libavutil/frame.c -@@ -25,6 +25,7 @@ +@@ -16,6 +16,8 @@ + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + ++#include "config.h" ++ + #include "channel_layout.h" + #include "avassert.h" + #include "buffer.h" +@@ -25,6 +27,9 @@ #include "imgutils.h" #include "mem.h" #include "samplefmt.h" ++#if CONFIG_RPI +#include "rpi_sand_fns.h" ++#endif static AVFrameSideData *frame_new_side_data(AVFrame *frame, -@@ -833,6 +834,12 @@ int av_frame_apply_cropping(AVFrame *frame, int flags) +@@ -833,6 +838,12 @@ int av_frame_apply_cropping(AVFrame *frame, int flags) (frame->crop_top + frame->crop_bottom) >= frame->height) return AVERROR(ERANGE); -+#ifdef RPI ++#if CONFIG_RPI + // Sand cannot be cropped - do not try + if (av_rpi_is_sand_format(frame->format)) + return 0; @@ -32727,12 +41128,11 @@ index 0000000000..52d52a2a83 + diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c new file mode 100644 -index 0000000000..ec4cfadf8a +index 0000000000..b8bfad915e --- /dev/null +++ b/libavutil/rpi_sand_fns.c -@@ -0,0 +1,99 @@ +@@ -0,0 +1,96 @@ +#include "config.h" -+#ifdef RPI +#include +#include +#include "rpi_sand_fns.h" @@ -32828,17 +41228,14 @@ index 0000000000..ec4cfadf8a + } +} + -+#endif // RPI -+ diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h new file mode 100644 -index 0000000000..aa880d0f63 +index 0000000000..ebaa2b6d08 --- /dev/null +++ b/libavutil/rpi_sand_fns.h -@@ -0,0 +1,129 @@ +@@ -0,0 +1,131 @@ +#ifndef AVUTIL_RPI_SAND_FNS +#define AVUTIL_RPI_SAND_FNS -+#ifdef RPI + +#include "libavutil/frame.h" + @@ -32891,9 +41288,13 @@ index 0000000000..aa880d0f63 + +static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame) +{ -+ // * We could repl;ace thios with a fixed 128 whic would allow the compiler -+ // to optimize a whole lot better ++#ifdef RPI_ZC_SAND128_ONLY ++ // If we are sure we only only support 128 byte sand formats replace the ++ // var with a constant which should allow for better optimisation ++ return 128; ++#else + return frame->linesize[0]; ++#endif +} + +static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const 
frame) @@ -32963,7 +41364,6 @@ index 0000000000..aa880d0f63 +} + +#endif -+#endif + diff --git a/libswscale/input.c b/libswscale/input.c index bb2f4933ec..de5a17bc7f 100644 @@ -32995,14 +41395,14 @@ index bb2f4933ec..de5a17bc7f 100644 if (c->chrSrcHSubSample) { switch (srcFormat) { diff --git a/libswscale/utils.c b/libswscale/utils.c -index dcab707de6..403558db3c 100644 +index dcab707de6..5b24de889a 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -256,6 +256,10 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = { [AV_PIX_FMT_P010BE] = { 1, 1 }, [AV_PIX_FMT_P016LE] = { 1, 0 }, [AV_PIX_FMT_P016BE] = { 1, 0 }, -+#ifdef RPI ++#if CONFIG_RPI + [AV_PIX_FMT_SAND128] = { 1, 0 }, + [AV_PIX_FMT_SAND64_10] = { 1, 0 }, +#endif @@ -33544,17 +41944,16 @@ index 0000000000..fc14f2a3c2 +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 diff --git a/pi-util/conf_pi1.sh b/pi-util/conf_pi1.sh new file mode 100755 -index 0000000000..ec25b81c31 +index 0000000000..59c0d3959e --- /dev/null +++ b/pi-util/conf_pi1.sh -@@ -0,0 +1,31 @@ +@@ -0,0 +1,30 @@ +echo "Configure for Pi1" + +RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf +RPI_OPT_VC=`pwd`/../firmware/opt/vc + +RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" +RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" +#RPI_KEEPS="-save-temps=obj" +RPI_KEEPS="" @@ -33566,8 +41965,8 @@ index 0000000000..ec25b81c31 + --target-os=linux\ + --disable-stripping\ + --enable-mmal\ -+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ -+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ ++ --extra-cflags="-g $RPI_KEEPS $RPI_INCLUDES"\ ++ --extra-cxxflags="$RPI_INCLUDES"\ + --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ + --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ + --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- @@ -33581,18 +41980,18 @@ index 0000000000..ec25b81c31 +# -Wa,-ahls diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh new file mode 100755 -index 0000000000..f8e5e75375 +index 0000000000..4de256bc8a --- /dev/null +++ b/pi-util/conf_pi2.sh -@@ -0,0 +1,30 @@ +@@ -0,0 +1,32 @@ +echo "Configure for Pi2/3" + +RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf +RPI_OPT_VC=`pwd`/../firmware/opt/vc + +RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" +RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" ++RPI_DEFINES="-D__VCCOREVER__=0x4000000" +#RPI_KEEPS="-save-temps=obj" +RPI_KEEPS="" + @@ -33603,12 +42002,14 @@ index 0000000000..f8e5e75375 + --disable-stripping\ + --disable-thumb\ + --enable-mmal\ -+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ -+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ ++ --enable-rpi\ ++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\ ++ --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\ + --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ + --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ + 
--cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- + ++# --enable-decoder=hevc_rpi\ +# --enable-extra-warnings\ +# --arch=armv71\ +# --enable-shared\ @@ -33617,10 +42018,10 @@ index 0000000000..f8e5e75375 +# -Wa,-ahls diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py new file mode 100755 -index 0000000000..70f7be22bb +index 0000000000..e9556f0837 --- /dev/null +++ b/pi-util/ffconf.py -@@ -0,0 +1,174 @@ +@@ -0,0 +1,175 @@ +#!/usr/bin/env python + +import string @@ -33634,7 +42035,7 @@ index 0000000000..70f7be22bb + +ffmpeg_exec = "./ffmpeg" + -+def testone(fileroot, srcname, es_file, md5_file): ++def testone(fileroot, srcname, es_file, md5_file, vcodec): + tmp_root = "/tmp" + + names = srcname.split('/') @@ -33656,7 +42057,7 @@ index 0000000000..70f7be22bb + + # Unaligned needed for cropping conformance + rstr = subprocess.call( -+ [ffmpeg_exec, "-flags", "unaligned", "-vcodec", "hevc", "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file], ++ [ffmpeg_exec, "-flags", "unaligned", "-vcodec", vcodec, "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file], + stdout=flog, stderr=subprocess.STDOUT) + + try: @@ -33720,7 +42121,7 @@ index 0000000000..70f7be22bb + return True + return False + -+def doconf(csva, tests, test_root): ++def doconf(csva, tests, test_root, vcodec): + unx_failures = [] + unx_success = [] + failures = 0 @@ -33732,7 +42133,7 @@ index 0000000000..70f7be22bb + print "==== ", name, + sys.stdout.flush() + -+ rv = testone(os.path.join(test_root, name), name, a[2], a[3]) ++ rv = testone(os.path.join(test_root, name), name, a[2], a[3], vcodec=vcodec) + if (rv == 0): + successes += 1 + else: @@ -33783,6 +42184,7 @@ index 0000000000..70f7be22bb + argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test") + argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir") + argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename") ++ argp.add_argument("--vcodec", default="hevc_rpi", help="vcodec name to use") + args = argp.parse_args() + + if args.csvgen: @@ -33793,7 +42195,7 @@ index 0000000000..70f7be22bb + csva = [a for a in csv.reader(csvfile, ConfCSVDialect())] + + -+ doconf(csva, args.tests, args.test_root) ++ doconf(csva, args.tests, args.test_root, args.vcodec) + diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py new file mode 100755 diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch new file mode 100644 index 00000000000..1d1fd1690ea --- /dev/null +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch @@ -0,0 +1,283 @@ +From 8f170986cda0695f28eb2cd4e863aaae0e14d19f Mon Sep 17 00:00:00 2001 +From: Hendrik Leppkes +Date: Sat, 9 Jan 2016 16:34:09 +0100 +Subject: [PATCH 1/4] avcodec: add h264_mvc codec id and profiles + +--- + libavcodec/avcodec.h | 3 +++ + libavcodec/codec_desc.c | 7 +++++++ + libavcodec/profiles.c | 1 + + libavformat/mpegts.c | 2 +- + 4 files changed, 12 insertions(+), 1 deletion(-) + +diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h +index 6c4b011b5c..8f1f5a3e53 100644 +--- a/libavcodec/avcodec.h ++++ b/libavcodec/avcodec.h +@@ -449,6 +449,8 @@ enum AVCodecID { + AV_CODEC_ID_GDV, + AV_CODEC_ID_FITS, + ++ AV_CODEC_ID_H264_MVC, ++ + /* various PCM "codecs" */ + AV_CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs + AV_CODEC_ID_PCM_S16LE = 0x10000, +@@ -3318,6 
+3320,7 @@ typedef struct AVCodecContext { + #define FF_PROFILE_H264_HIGH_444_PREDICTIVE 244 + #define FF_PROFILE_H264_HIGH_444_INTRA (244|FF_PROFILE_H264_INTRA) + #define FF_PROFILE_H264_CAVLC_444 44 ++#define FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH 138 + + #define FF_PROFILE_VC1_SIMPLE 0 + #define FF_PROFILE_VC1_MAIN 1 +diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c +index 478b7c0ffc..ff10f3b2bc 100644 +--- a/libavcodec/codec_desc.c ++++ b/libavcodec/codec_desc.c +@@ -1700,6 +1700,13 @@ static const AVCodecDescriptor codec_descriptors[] = { + .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"), + .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, + }, ++ { ++ .id = AV_CODEC_ID_H264_MVC, ++ .type = AVMEDIA_TYPE_VIDEO, ++ .name = "h264_mvc", ++ .long_name = NULL_IF_CONFIG_SMALL("H264 MVC"), ++ .props = AV_CODEC_PROP_LOSSY, ++ }, + + /* various PCM "codecs" */ + { +diff --git a/libavcodec/profiles.c b/libavcodec/profiles.c +index 30498efedf..9d3cf4b535 100644 +--- a/libavcodec/profiles.c ++++ b/libavcodec/profiles.c +@@ -72,6 +72,7 @@ const AVProfile ff_h264_profiles[] = { + { FF_PROFILE_H264_CAVLC_444, "CAVLC 4:4:4" }, + { FF_PROFILE_H264_MULTIVIEW_HIGH, "Multiview High" }, + { FF_PROFILE_H264_STEREO_HIGH, "Stereo High" }, ++ { FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH, "Multiview High Depth" }, + { FF_PROFILE_UNKNOWN }, + }; + +diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c +index 53cbcfb543..f93f06fcfb 100644 +--- a/libavformat/mpegts.c ++++ b/libavformat/mpegts.c +@@ -701,7 +701,7 @@ static const StreamType ISO_types[] = { + #endif + { 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 }, + { 0x1c, AVMEDIA_TYPE_AUDIO, AV_CODEC_ID_AAC }, +- { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 }, ++ { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264_MVC }, + { 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000 }, + { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC }, + { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS }, +-- +2.14.1 + + +From 00de72f97e8f69f5d4c614bff956ec726f97fa2e Mon Sep 17 00:00:00 2001 +From: Hendrik Leppkes +Date: Sat, 9 Jan 2016 16:34:40 +0100 +Subject: [PATCH 2/4] h264_parser: add support for parsing h264 mvc NALUs + +--- + libavcodec/allcodecs.c | 1 + + libavcodec/h264.h | 2 ++ + libavcodec/h264_parser.c | 34 ++++++++++++++++++++++++++++++---- + 3 files changed, 33 insertions(+), 4 deletions(-) + +diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c +index 5361a22141..a5289a5e14 100644 +--- a/libavcodec/allcodecs.c ++++ b/libavcodec/allcodecs.c +@@ -732,6 +732,7 @@ static void register_all(void) + REGISTER_PARSER(H261, h261); + REGISTER_PARSER(H263, h263); + REGISTER_PARSER(H264, h264); ++ REGISTER_PARSER(H264_MVC, h264_mvc); + REGISTER_PARSER(HEVC, hevc); + REGISTER_PARSER(MJPEG, mjpeg); + REGISTER_PARSER(MLP, mlp); +diff --git a/libavcodec/h264.h b/libavcodec/h264.h +index 86df5eb9b3..22c4f1d82a 100644 +--- a/libavcodec/h264.h ++++ b/libavcodec/h264.h +@@ -41,7 +41,9 @@ enum { + H264_NAL_END_STREAM = 11, + H264_NAL_FILLER_DATA = 12, + H264_NAL_SPS_EXT = 13, ++ H264_NAL_SPS_SUBSET = 15, + H264_NAL_AUXILIARY_SLICE = 19, ++ H264_NAL_SLICE_EXT = 20, + }; + + #endif /* AVCODEC_H264_H */ +diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c +index 053325c26b..855c74896e 100644 +--- a/libavcodec/h264_parser.c ++++ b/libavcodec/h264_parser.c +@@ -62,6 +62,7 @@ typedef struct H264ParseContext { + int parse_last_mb; + int64_t reference_dts; + int last_frame_num, last_picture_structure; ++ int is_mvc; + } H264ParseContext; + + +@@ -109,14 +110,18 @@ static 
int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
+ } else if (state <= 5) {
+ int nalu_type = buf[i] & 0x1F;
+ if (nalu_type == H264_NAL_SEI || nalu_type == H264_NAL_SPS ||
+- nalu_type == H264_NAL_PPS || nalu_type == H264_NAL_AUD) {
++ nalu_type == H264_NAL_PPS || nalu_type == H264_NAL_AUD ||
++ nalu_type == H264_NAL_SPS_SUBSET) {
+ if (pc->frame_start_found) {
+ i++;
+ goto found;
+ }
+ } else if (nalu_type == H264_NAL_SLICE || nalu_type == H264_NAL_DPA ||
+- nalu_type == H264_NAL_IDR_SLICE) {
++ nalu_type == H264_NAL_IDR_SLICE || (p->is_mvc && nalu_type == H264_NAL_SLICE_EXT)) {
+ state += 8;
++
++ if (nalu_type == H264_NAL_SLICE_EXT)
++ i += 3; // skip mvc extension
+ continue;
+ }
+ state = 7;
+@@ -594,7 +599,8 @@ static int h264_parse(AVCodecParserContext *s,
+ }
+ }
+
+- parse_nal_units(s, avctx, buf, buf_size);
++ if (!p->is_mvc)
++ parse_nal_units(s, avctx, buf, buf_size);
+
+ if (avctx->framerate.num)
+ avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
+@@ -651,7 +657,7 @@ static int h264_split(AVCodecContext *avctx,
+ if ((state & 0xFFFFFF00) != 0x100)
+ break;
+ nalu_type = state & 0x1F;
+- if (nalu_type == H264_NAL_SPS) {
++ if (nalu_type == H264_NAL_SPS || nalu_type == H264_NAL_SPS_SUBSET) {
+ has_sps = 1;
+ } else if (nalu_type == H264_NAL_PPS)
+ has_pps = 1;
+@@ -703,3 +709,23 @@ AVCodecParser ff_h264_parser = {
+ .parser_close = h264_close,
+ .split = h264_split,
+ };
++
++static av_cold int init_mvc(AVCodecParserContext *s)
++{
++ H264ParseContext *p = s->priv_data;
++ int ret = init(s);
++ if (ret < 0)
++ return ret;
++
++ p->is_mvc = 1;
++ return 0;
++}
++
++AVCodecParser ff_h264_mvc_parser = {
++ .codec_ids = { AV_CODEC_ID_H264_MVC },
++ .priv_data_size = sizeof(H264ParseContext),
++ .parser_init = init_mvc,
++ .parser_parse = h264_parse,
++ .parser_close = h264_close,
++ .split = h264_split,
++};
+--
+2.14.1
+
+
+From bbf5daa149ccc2c462be1bd5f6f710eba0e82094 Mon Sep 17 00:00:00 2001
+From: Hendrik Leppkes
+Date: Tue, 28 Nov 2017 16:12:12 +0000
+Subject: [PATCH 3/4] h264_parser: force grabing a new timestamp until a frame
+ start was found
+
+---
+ libavcodec/h264_parser.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
+index 855c74896e..90a99a19a8 100644
+--- a/libavcodec/h264_parser.c
++++ b/libavcodec/h264_parser.c
+@@ -587,6 +587,9 @@ static int h264_parse(AVCodecParserContext *s,
+ } else {
+ next = h264_find_frame_end(p, buf, buf_size, avctx);
+
++ if (next == END_NOT_FOUND && pc->frame_start_found == 0)
++ s->fetch_timestamp = 1;
++
+ if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
+ *poutbuf = NULL;
+ *poutbuf_size = 0;
+--
+2.14.1
+
+
+From 3a0ebb0f7473a9a5ab93e01f7261862a3d324e50 Mon Sep 17 00:00:00 2001
+From: popcornmix
+Date: Tue, 28 Nov 2017 18:32:08 +0000
+Subject: [PATCH 4/4] extract_extradata_bsf: Support H264_MVC
+
+---
+ libavcodec/extract_extradata_bsf.c | 8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+diff --git a/libavcodec/extract_extradata_bsf.c b/libavcodec/extract_extradata_bsf.c
+index ed6509c681..188e62a42d 100644
+--- a/libavcodec/extract_extradata_bsf.c
++++ b/libavcodec/extract_extradata_bsf.c
+@@ -56,7 +56,7 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt,
+ HEVC_NAL_VPS, HEVC_NAL_SPS, HEVC_NAL_PPS,
+ };
+ static const int extradata_nal_types_h264[] = {
+- H264_NAL_SPS, H264_NAL_PPS,
++ H264_NAL_SPS, H264_NAL_SPS_SUBSET, H264_NAL_PPS,
+ };
+
+ 
ExtractExtradataContext *s = ctx->priv_data; +@@ -88,14 +88,14 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt, + if (nal->type == HEVC_NAL_SPS) has_sps = 1; + if (nal->type == HEVC_NAL_VPS) has_vps = 1; + } else { +- if (nal->type == H264_NAL_SPS) has_sps = 1; ++ if (nal->type == H264_NAL_SPS || nal->type == H264_NAL_SPS_SUBSET) has_sps = 1; + } + } + } + + if (extradata_size && + ((ctx->par_in->codec_id == AV_CODEC_ID_HEVC && has_sps && has_vps) || +- (ctx->par_in->codec_id == AV_CODEC_ID_H264 && has_sps))) { ++ ((ctx->par_in->codec_id == AV_CODEC_ID_H264 || ctx->par_in->codec_id == AV_CODEC_ID_H264_MVC) && has_sps))) { + AVBufferRef *filtered_buf; + uint8_t *extradata, *filtered_data; + +@@ -247,6 +247,7 @@ static const struct { + } extract_tab[] = { + { AV_CODEC_ID_CAVS, extract_extradata_mpeg4 }, + { AV_CODEC_ID_H264, extract_extradata_h2645 }, ++ { AV_CODEC_ID_H264_MVC, extract_extradata_h2645 }, + { AV_CODEC_ID_HEVC, extract_extradata_h2645 }, + { AV_CODEC_ID_MPEG1VIDEO, extract_extradata_mpeg12 }, + { AV_CODEC_ID_MPEG2VIDEO, extract_extradata_mpeg12 }, +@@ -306,6 +307,7 @@ fail: + static const enum AVCodecID codec_ids[] = { + AV_CODEC_ID_CAVS, + AV_CODEC_ID_H264, ++ AV_CODEC_ID_H264_MVC, + AV_CODEC_ID_HEVC, + AV_CODEC_ID_MPEG1VIDEO, + AV_CODEC_ID_MPEG2VIDEO, +-- +2.14.1 +
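The four MVC patches above only add a codec id, a descriptor, a parser and
extradata extraction -- no decoder. A minimal sketch for sanity-checking a
build with them applied (editorial, not part of the patch set; assumes the
patched tree and the register_all()-era API this ffmpeg version uses):

#include <stdio.h>
#include <libavcodec/avcodec.h>

int main(void)
{
    avcodec_register_all();

    // The descriptor added in codec_desc.c should resolve by id
    const AVCodecDescriptor *const d = avcodec_descriptor_get(AV_CODEC_ID_H264_MVC);
    printf("descriptor: %s (%s)\n", d ? d->name : "missing",
           d ? d->long_name : "-");

    // ff_h264_mvc_parser is registered, so a parser context can be opened...
    AVCodecParserContext *const pc = av_parser_init(AV_CODEC_ID_H264_MVC);
    printf("parser: %s\n", pc != NULL ? "ok" : "missing");

    // ...but no decoder is added by these patches, so this stays NULL
    printf("decoder: %s\n",
           avcodec_find_decoder(AV_CODEC_ID_H264_MVC) ? "found" : "none (expected)");

    if (pc != NULL)
        av_parser_close(pc);
    return 0;
}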