Merge branch 'master' into doc

JDAI-CV · May 18, 2019 · 1f9f677 · 1f9f677
2 parents 701eacd + 9dacd3a
commit 1f9f677
Show file tree

Hide file tree

Showing 21 changed files with 308 additions and 98 deletions.
diff --git a/.daq_pm/configs/net_test_debug b/.daq_pm/configs/net_test_debug
@@ -0,0 +1,7 @@
+# It is a configuration file for [project_manager.vim](https://github.com/daquexian/project_manager.vim)
+name binary-nn
+type cpp
+build_dir build_net_test_debug
+target net_test
+cmake_options -DCMAKE_TOOLCHAIN_FILE=~/Android/Sdk/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_PLATFORM=android-28 -DANDROID_ABI=arm64-v8a -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_BUILD_TYPE=Debug
+binary ~/adb_push_and_run.sh tests/net_test
diff --git a/.daq_pm/configs/run_net_debug b/.daq_pm/configs/run_net_debug
@@ -0,0 +1,7 @@
+# It is a configuration file for [project_manager.vim](https://github.com/daquexian/project_manager.vim)
+name binary-nn
+type cpp
+build_dir build_arm64_no_test_debug
+cmake_options -DCMAKE_TOOLCHAIN_FILE=~/Android/Sdk/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_PLATFORM=android-28 -DANDROID_ABI=arm64-v8a -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_BUILD_TYPE=Debug -DBNN_BUILD_TEST=OFF -DBNN_BUILD_BENCHMARK=OFF -DBNN_CHECK_CONSISTENCY=ON
+# binary ~/adb_push_and_run.sh binaries/run /data/local/tmp/model_imagenet.dab 144 145
+binary ~/adb_push_and_run.sh binaries/run /data/local/tmp/simple5.dab output --v=6
diff --git a/.daq_pm/configs/run_net_new_bitpack b/.daq_pm/configs/run_net_new_bitpack
@@ -0,0 +1,6 @@
+# It is a configuration file for [project_manager.vim](https://github.com/daquexian/project_manager.vim)
+name binary-nn
+type cpp
+build_dir build_arm64_no_test
+cmake_options -DCMAKE_TOOLCHAIN_FILE=~/Android/Sdk/ndk-bundle/build/cmake/android.toolchain.cmake -DANDROID_PLATFORM=android-28 -DANDROID_ABI=arm64-v8a -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_BUILD_TYPE=Release -DBNN_BUILD_TEST=OFF -DBNN_BUILD_BENCHMARK=OFF
+binary ~/adb_push_and_run.sh binaries/run /data/local/tmp/br24g2.daq 242 && adb pull /data/local/tmp/mat.txt
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -6,12 +6,10 @@ project(dabnn CXX)
 message(STATUS "CMake Version: ${CMAKE_VERSION}")
 
 if (NOT CMAKE_BUILD_TYPE OR CMAKE_BUILD_TYPE STREQUAL "")
-    message("No build type, set to Release")
+    message(STATUS "No build type, set to Release")
     set(CMAKE_BUILD_TYPE "Release" CACHE STRING "" FORCE)
 endif()
-message("Build type: ${CMAKE_BUILD_TYPE}")
-
-set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
+message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
@@ -41,6 +39,8 @@ endif()
 if (${BNN_BUILD_ANDROID})
     set(CMAKE_CXX_STANDARD 17)
 
+    set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
+
     if (${BNN_BUILD_TEST})
         include(cmake/gtest.cmake)
         configure_gtest()

diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp
@@ -13,6 +13,22 @@
 #include <dabnn/mat.h>
 #include <dabnn/net.h>
 
+// Times pack_mat_64 on a (1, 32, 32, 128) Float mat packed into a Bit mat
+// constructed with the same arguments.
+static void BM_pack_mat_64_small(benchmark::State &state) {
+    const bnn::Mat float_mat(1, 32, 32, 128, bnn::DataType::Float, 0);
+    bnn::Mat packed_mat(1, 32, 32, 128, bnn::DataType::Bit, 0);
+    for (auto _ : state) {
+        pack_mat_64(float_mat, packed_mat);
+    }
+}
+
+// Times pack_mat_128 on a (1, 32, 32, 128) Float mat packed into a Bit mat
+// constructed with the same arguments.
+static void BM_pack_mat_128_small(benchmark::State &state) {
+    const bnn::Mat float_mat(1, 32, 32, 128, bnn::DataType::Float, 0);
+    bnn::Mat packed_mat(1, 32, 32, 128, bnn::DataType::Bit, 0);
+    for (auto _ : state) {
+        pack_mat_128(float_mat, packed_mat);
+    }
+}
+
 static void BM_pack_mat_64(benchmark::State &state) {
     const bnn::Mat a(1, 64, 64, 128, bnn::DataType::Float);
     bnn::Mat b(1, 64, 64, 128, bnn::DataType::Bit);

diff --git a/binaries/run.cpp b/binaries/run.cpp
@@ -5,20 +5,20 @@
 #include <algorithm>
 #include <chrono>
 
+#include <common/argh.h>
 #include <common/flatbuffers_helper.h>
 #include <dabnn/net.h>
 
 int main(int argc, char **argv) {
-    (void)argc;
+    argh::parser cmdl(argc, argv);
     google::InitGoogleLogging(argv[0]);
-    FLAGS_v = 1;
+    cmdl("v", 1) >> FLAGS_v;
     FLAGS_alsologtostderr = true;
     // FLAGS_logbuflevel = -1;
 
     float *input = new float[3 * 224 * 224];
     FORZ(i, 3 * 224 * 224) { input[i] = 1; }
 
-    // const std::string blob_name = "125";
     auto net1 = bnn::Net::create();
     net1->optimize = true;
     net1->run_fconv = true;
@@ -31,48 +31,26 @@ int main(int argc, char **argv) {
     FORZ(i, N) {
         LOG(INFO) << "------";
         net1->run(input);
-        // LOG(INFO) << "hh";
     }
     const auto t2 = Clock::now();
-    css blob_name = argv[2];
-    LOG(INFO) << "Fetching blob: " << blob_name;
-    const auto &blob1 = net1->get_blob(blob_name);
-    LOG(INFO) << blob1->total();
-    if (blob1->data_type == bnn::DataType::Float) {
-        blob1->dump("/data/local/tmp/mat.txt");
-    }
-    FORZ(i, std::min(static_cast<int>(blob1->total()), 10)) {
+
+    for (int i = 2; i < cmdl.size(); i++) {
+        css blob_name = argv[i];
+        LOG(INFO) << "Fetching blob: " << blob_name;
+        const auto &blob1 = net1->get_blob(blob_name);
+        LOG(INFO) << static_cast<float *>(blob1->data)[0];
         if (blob1->data_type == bnn::DataType::Float) {
-            LOG(INFO) << static_cast<float *>(blob1->data)[i];
-        } else {
-            LOG(INFO) << binrep(static_cast<uint64_t *>(blob1->data)[i]);
+            blob1->dump("/data/local/tmp/mat_" + blob_name + ".txt");
+        }
+        FORZ(j, std::min(static_cast<int>(blob1->total()), 10)) {
+            if (blob1->data_type == bnn::DataType::Float) {
+                LOG(INFO) << blob_name << ": " << static_cast<float *>(blob1->data)[j];
+            } else {
+                LOG(INFO) << blob_name << ": " << binrep(static_cast<uint64_t *>(blob1->data) + j, 64, true);
+            }
         }
     }
-    LOG(INFO) << "Time: "
-              << 1.f *
-                     std::chrono::duration_cast<std::chrono::nanoseconds>(t2 -
-                                                                          t1)
-                         .count() /
-                     N / 1000000000;
 #ifdef BNN_BENCHMARK
     net1->print_time();
 #endif
-
-    /*
-    bnn::Net net2;
-    net2.model_ = model;
-    net2.prepare();
-    LOG(INFO) << "-----";
-    net2.optimize = false;
-
-    net2.run(input);
-    const auto &blob2 = net2.get_blob(blob_name);
-    LOG(INFO) << blob2->total();
-    FORZ(i, std::min(static_cast<int>(blob2->total()), 10)) {
-        LOG(INFO) << static_cast<float *>(blob2->data)[i];
-    }
-
-    const bool eq = (*blob1 == *blob2);
-    BNN_ASSERT(eq, "");
-    */
 }
diff --git a/ci/build_aar.sh b/ci/build_aar.sh
@@ -6,7 +6,7 @@ if [[ "$OSTYPE" == "darwin"* ]]; then
     echo "The system is Mac OS X, alias sed to gsed"
     export PATH="/usr/local/opt/gnu-sed/libexec/gnubin:$PATH"
     echo "Output of sed -v:"
-    sed -v
+    sed --version
 fi
 
 MY_ANDROID_HOME="${ANDROID_HOME:-$HOME/Android/Sdk}"

diff --git a/common/baseline.h b/common/baseline.h
@@ -6,7 +6,7 @@
 #include <bitset>
 
 #include <common/helper.h>
-#include <dabnn/bitpack.h>
+#include <common/common_bitpack.h>
 #include <dabnn/mat.h>
 
 inline int bitcount(uint64_t x) {

diff --git a/common/common_bitpack.h b/common/common_bitpack.h
@@ -8,6 +8,7 @@
 #include <cstdint>
 
 #include <common/helper.h>
+#include <dabnn/mat.h>
 
 inline void pack_128_fallback(const float *float_ptr, void *binary_ptr,
                               size_t size) {
@@ -183,4 +184,23 @@ inline void pack_64_bitfield(const float *fptr, uint64_t *buf) {
     *buf = u.u64;
 }
 
+/**
+ * Packs float_mat into binary_mat one (n, h) row at a time: every 64
+ * consecutive floats of a row are turned into one uint64_t by
+ * pack_64_bitfield.
+ *
+ * Asserts that float_mat.w * float_mat.c is a positive multiple of 64, that
+ * float_mat.c is a multiple of 64, and that binary_mat.c == float_mat.c / 64.
+ */
+inline void pack_mat_64(const bnn::Mat &float_mat, bnn::Mat &binary_mat) {
+    BNN_ASSERT(
+        float_mat.w * float_mat.c > 0 && float_mat.w * float_mat.c % 64 == 0,
+        float_mat.w * float_mat.c);
+    BNN_ASSERT(float_mat.c / 64 == binary_mat.c && float_mat.c % 64 == 0, "");
+
+    // Number of 64-bit words produced for each (n, h) row.
+    const auto words_per_row = float_mat.w * float_mat.c / 64;
+
+    FORZ(n, float_mat.n) {
+        FORZ(h, float_mat.h) {
+            auto *src = float_mat.point<float>(n, h, 0);
+            auto *dst = binary_mat.point<uint64_t>(n, h, 0);
+            FORZ(word, words_per_row) {
+                pack_64_bitfield(src + 64 * word, dst + word);
+            }
+        }
+    }
+}
+
 #endif /* COMMON_BITPACK_H */
diff --git a/common/helper.h b/common/helper.h
@@ -80,6 +80,9 @@ inline float random_float() {
     static std::normal_distribution<float> distr;
 
     float rand_float = distr(eng) / 10;
+    if (rand_float == 0) {
+        return random_float();
+    }
     // LOG(INFO) << "Random float: " << rand_float;
 
     return rand_float;
@@ -113,27 +116,20 @@ inline void fill_rand_uint64(uint64_t *data, size_t num) {
     FORZ(i, num) { *(data + i) = random_uint64(); }
 }
 
-template <typename T>
-std::string binrep(const T &a) {
-    const char *beg = reinterpret_cast<const char *>(&a);
-    const char *end = beg + sizeof(a);
-
-    std::stringstream ss;
-
-    while (beg != end) ss << std::bitset<CHAR_BIT>(*beg++) << ' ';
-    ss << '\n';
-    return ss.str();
-}
-
-template <typename T>
-std::string binrep(const T &a, const size_t size) {
-    const char *beg = reinterpret_cast<const char *>(&a);
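+/**
+ * Returns the bits of the `size` bytes starting at `a`, one group of CHAR_BIT
+ * bits per byte, each group followed by a space. When `reverse` is true the
+ * bytes are printed from last to first, which makes the output on little
+ * endian machines human-readable.
+ */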
+inline std::string binrep(const void *a, const size_t size, bool reverse) {
+    const char *beg = static_cast<const char *>(a);
     const char *end = beg + size;
 
     std::stringstream ss;
 
-    while (beg != end) ss << std::bitset<CHAR_BIT>(*beg++) << ' ';
-    ss << '\n';
+    if (reverse) {
+        while (beg != end) ss << std::bitset<CHAR_BIT>(*(end-- - 1)) << ' ';
+    } else {
+        while (beg != end) ss << std::bitset<CHAR_BIT>(*beg++) << ' ';
+    }
     return ss.str();
 }
 

diff --git a/dabnn/bitpack.h b/dabnn/bitpack.h
@@ -16,23 +16,85 @@
 #include <glog/logging.h>
 #include "mat.h"
 
-inline void pack_mat_64(const bnn::Mat &float_mat, bnn::Mat &binary_mat) {
-    BNN_ASSERT(
-        float_mat.w * float_mat.c > 0 && float_mat.w * float_mat.c % 64 == 0,
-        float_mat.w * float_mat.c);
-    BNN_ASSERT(float_mat.c / 64 == binary_mat.c && float_mat.c % 64 == 0, "");
-
-    FORZ(n, float_mat.n) {
-        FORZ(h, float_mat.h) {
-            auto *fptr = float_mat.point<float>(n, h, 0);
-            auto *bptr = binary_mat.point<uint64_t>(n, h, 0);
-            FORZ(i, float_mat.w * float_mat.c / 64) {
-                pack_64_bitfield(fptr, bptr);
-                fptr += 64;
-                bptr++;
-            }
-        }
-    }
+/**
+ * Bit-packs floats with AArch64 NEON inline assembly.
+ *
+ * Each loop iteration loads 128 floats (8 x 64 bytes) from float_ptr, merges
+ * them with a chain of "sri" (shift right and insert) instructions so that
+ * only the top (sign) bit of every float survives, inverts the result and
+ * stores 16 bytes to binary_ptr.
+ *
+ * @param float_ptr  source floats
+ * @param binary_ptr destination; receives 16 bytes per 128 input floats
+ * @param size       number of floats; only size / 128 whole groups are
+ *                   processed, any remainder is ignored
+ */
+inline void pack_128_2(const float *float_ptr, void *binary_ptr, size_t size) {
+    size_t nn_size = size >> 7;
+
+    if (nn_size == 0) {
+        // The assembly below decrements the counter before testing it, so
+        // entering it with zero would wrap around and run far past both
+        // buffers.
+        return;
+    }
+
+    asm volatile(
+        "0:     \n"
+        "prfm   pldl1keep, [%0]     \n"
+        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64    \n"
+        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%0], #64    \n"
+        "sri    v0.4s, v4.4s, #1    \n"
+        "sri    v1.4s, v5.4s, #1    \n"
+        "sri    v2.4s, v6.4s, #1    \n"
+        "sri    v3.4s, v7.4s, #1    \n"
+
+        "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64    \n"
+        "prfm   pldl1keep, [%0, #64]     \n"
+        "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64    \n"
+        "sri    v8.4s, v12.4s, #1    \n"
+        "sri    v9.4s, v13.4s, #1    \n"
+        "sri    v10.4s, v14.4s, #1    \n"
+        "sri    v11.4s, v15.4s, #1    \n"
+
+        "subs   %2, %2, #1          \n"
+
+        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64    \n"
+        "prfm   pldl1keep, [%0, #64]     \n"
+        "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64    \n"
+
+        "sri    v0.4s, v8.4s, #2    \n"
+        "sri    v1.4s, v9.4s, #2    \n"
+        "sri    v2.4s, v10.4s, #2   \n"
+        "sri    v3.4s, v11.4s, #2   \n"
+
+        "sri    v16.4s, v20.4s, #1    \n"
+        "sri    v17.4s, v21.4s, #1    \n"
+        "sri    v18.4s, v22.4s, #1    \n"
+        "sri    v19.4s, v23.4s, #1    \n"
+
+        "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64    \n"
+        "prfm   pldl1keep, [%0, #64]     \n"
+        "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64    \n"
+        "sri    v8.4s, v12.4s, #1    \n"
+        "sri    v9.4s, v13.4s, #1    \n"
+        "sri    v10.4s, v14.4s, #1    \n"
+        "sri    v11.4s, v15.4s, #1    \n"
+
+        "sri    v16.4s, v8.4s, #2   \n"
+        "sri    v17.4s, v9.4s, #2   \n"
+        "sri    v18.4s, v10.4s, #2   \n"
+        "sri    v19.4s, v11.4s, #2   \n"
+
+        "sri    v0.4s, v16.4s, #4   \n"
+        "sri    v1.4s, v17.4s, #4   \n"
+        "sri    v2.4s, v18.4s, #4   \n"
+        "sri    v3.4s, v19.4s, #4   \n"
+
+        "sri    v0.4s, v1.4s, #8    \n"
+        "sri    v2.4s, v3.4s, #8    \n"
+        "sri    v0.4s, v2.4s, #16    \n"
+
+        // Bit-packing with the sign bit was introduced after the first
+        // version of dabnn was published. The sign bit is 1 when x < 0 and
+        // 0 when x > 0, which is different from the way we used before ---
+        // set the bit to 1 if x > 0 or 0 if x < 0.
+        // So for compatibility we add a "not" instruction here.
+        // Maybe we can save this instruction by introducing a "version" for
+        // dabnn models and forcing users to upgrade.
+        // Note: If this line is removed, the padding value of binary
+        // convolution should also be changed from 0 (-1 in xnor) to -1
+        // (1 in xnor)
+        "not    v0.16b, v0.16b        \n"
+
+        "st1    {v0.4s}, [%1], #16         \n"
+        "bne    0b                  \n"
+        : "+r"(float_ptr),   // %0
+          "+r"(binary_ptr),  // %1
+          "+r"(nn_size)      // %2
+        :
+        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8",
+            "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+            "v19", "v20", "v21", "v22", "v23", "x0");
 }
 
 inline void pack_128(const float *float_ptr, void *binary_ptr, size_t size) {
@@ -123,6 +185,13 @@ inline void pack_128(const float *float_ptr, void *binary_ptr, size_t size) {
           "x0");
 }
 
+/**
+ * Packs float_mat into binary_mat with pack_128_2, handing it the float
+ * buffer as one flat array of float_mat.total() elements.
+ * binary_mat must not be empty (checked with assert).
+ */
+inline void pack_mat_128_2(const bnn::Mat &float_mat, bnn::Mat &binary_mat) {
+    assert(!binary_mat.empty());
+
+    const auto *src = static_cast<float *>(float_mat.data);
+    auto *dst = binary_mat.data;
+    pack_128_2(src, dst, float_mat.total());
+}
+
 inline void pack_mat_128(const bnn::Mat &float_mat, bnn::Mat &binary_mat) {
     assert(!binary_mat.empty());
 
@@ -133,7 +202,7 @@ inline void pack_mat_128(const bnn::Mat &float_mat, bnn::Mat &binary_mat) {
 inline void pack_mat(const bnn::Mat &float_mat, bnn::Mat &binary_mat) {
     BNN_ASSERT(float_mat.c % 64 == 0, float_mat.c);
     if (float_mat.c % 128 == 0) {
-        pack_mat_128(float_mat, binary_mat);
+        pack_mat_128_2(float_mat, binary_mat);
     } else {
         pack_mat_64(float_mat, binary_mat);
     }