From 7eadf45934a435c009f56be8ce06dc15be2fb36c Mon Sep 17 00:00:00 2001 From: daquexian Date: Wed, 5 Jun 2019 11:33:48 +0800 Subject: [PATCH 1/4] Add english docs, update cn docs --- docs/bconv.md | 61 +++++++++++++++++ docs/bconv_CN.md | 4 +- docs/bconv_out.md | 47 +++++++++++++ docs/design.md | 11 +++ docs/onnx2bnn.md | 29 ++++++++ .../svgs/09e963a9a257d451169d317f04f4cf59.svg | 9 +++ .../svgs/0df921cda6b933526ab3fb2c8ba713c0.svg | 57 ++++++++++++++++ .../svgs/15e03f3c82848a46865db186cb4c1092.svg | 19 ++++++ .../svgs/1cb45f0e1e422f5a042ce0dc8710ed27.svg | 11 +++ .../svgs/22aefc0b275701a94e3684ede71e1cbf.svg | 13 ++++ .../svgs/252b59b1233ed40f0396e2cd369f514d.svg | 13 ++++ .../svgs/25bbbd23c3609fee3f26aa5f809dbe2e.svg | 26 +++++++ .../svgs/2c08f38f094ac03aea56779378242468.svg | 61 +++++++++++++++++ .../svgs/2e32e0141d372413f25c35045d246695.svg | 11 +++ .../svgs/2e67a96431b169a7b134a2ab4c5f3457.svg | 21 ++++++ .../svgs/3b5fe08410dc2e357ad56d5e09c013c5.svg | 13 ++++ .../svgs/4723cf14b1da3a0da99410e67984882d.svg | 25 +++++++ .../svgs/484ca82711fa06cc47440002b35c5a66.svg | 19 ++++++ .../svgs/4fd661cfefdf4318d1aa35fb483796b2.svg | 9 +++ .../svgs/5615b81594cc5f5f54f6c86a17443fea.svg | 19 ++++++ .../svgs/68d27da8ea3f60dda13e915b722c2c25.svg | 19 ++++++ .../svgs/7adcdcafe095c28283fc5a319a9b6cdb.svg | 17 +++++ .../svgs/7ceafe91152205b4b4d2b3a8b1565477.svg | 67 +++++++++++++++++++ .../svgs/81299da238f63ff881f8365a2a3b638a.svg | 11 +++ .../svgs/8217ed3c32a785f0b5aad4055f432ad8.svg | 9 +++ .../svgs/873e1472d3de67a29299013cd3eebb39.svg | 32 +++++++++ .../svgs/88cf5350b4c645c31edaa0cbba3ee5f9.svg | 15 +++++ .../svgs/8edf0a665654dc211972f609f97cb684.svg | 19 ++++++ .../svgs/9034606aa4dd18758a6889347abf0302.svg | 13 ++++ .../svgs/904d8a3dfde39f4fb05df9337f05b65f.svg | 30 +++++++++ .../svgs/9998129ab540f7bc0985032e06e974ed.svg | 56 ++++++++++++++++ .../svgs/a6b6654f6dbe55b7fa2c8f5104fb8370.svg | 17 +++++ .../svgs/ab03e97f653c3b2963d6a503b2a9719b.svg | 11 +++ .../svgs/bdbf342b57819773421273d508dba586.svg | 9 +++ .../svgs/c745b9b57c145ec5577b82542b2df546.svg | 9 +++ .../svgs/c82ef99e46a995ca2c9e5865a66d022f.svg | 19 ++++++ .../svgs/cc1dbbcd450fb3182ca125d94560c60d.svg | 20 ++++++ .../svgs/cc6323a2493d6ca0f60a01cf3df23307.svg | 13 ++++ .../svgs/cfeb1241a5083b5a03c00504e02f69b3.svg | 26 +++++++ .../svgs/d0740c8f4fc4e3563ada4e53f43a81a1.svg | 19 ++++++ .../svgs/d0b09e58d8b197fff6fc95ea3bca20fe.svg | 9 +++ .../svgs/d236438ccc18393a004614a9ddc04188.svg | 39 +++++++++++ .../svgs/d4e2df111ef06f71a9c4108cb9542aff.svg | 60 +++++++++++++++++ .../svgs/d6328eaebbcd5c358f426dbea4bdbf70.svg | 9 +++ .../svgs/dad09b192ab596ebe6643c184b041b78.svg | 61 +++++++++++++++++ .../svgs/dc61d515b6f36dadf6ab7371698a9ef1.svg | 48 +++++++++++++ .../svgs/eb7ee640b8ff98c0068ed4d9ec3baf60.svg | 35 ++++++++++ .../svgs/f0fa7d7a09a30703b30ba8aae9c1c1b5.svg | 11 +++ .../svgs/f29d99803e443e4e6e87180539b3197f.svg | 30 +++++++++ .../svgs/f3d9f6f447d13bcef7127ff6c98710a3.svg | 11 +++ .../svgs/f5feb9f32839cb69ccdf8b0838d8c7cb.svg | 17 +++++ .../svgs/f6128a2d469857252e8e52385e7a00c5.svg | 9 +++ .../svgs/f8b4daba6c4183a3c1000ebb2d64de5f.svg | 63 +++++++++++++++++ .../svgs/f9c4988898e7f532b9f826a75014ed3c.svg | 9 +++ .../svgs/fb97d38bcc19230b0acd442e17db879c.svg | 9 +++ .../svgs/ff7cbf533a4e41019c689366004849fb.svg | 9 +++ 56 files changed, 1336 insertions(+), 2 deletions(-) create mode 100644 docs/bconv.md create mode 100644 docs/bconv_out.md create mode 100644 docs/design.md create mode 100644 docs/onnx2bnn.md create mode 100644 
docs/svgs/09e963a9a257d451169d317f04f4cf59.svg create mode 100644 docs/svgs/0df921cda6b933526ab3fb2c8ba713c0.svg create mode 100644 docs/svgs/15e03f3c82848a46865db186cb4c1092.svg create mode 100644 docs/svgs/1cb45f0e1e422f5a042ce0dc8710ed27.svg create mode 100644 docs/svgs/22aefc0b275701a94e3684ede71e1cbf.svg create mode 100644 docs/svgs/252b59b1233ed40f0396e2cd369f514d.svg create mode 100644 docs/svgs/25bbbd23c3609fee3f26aa5f809dbe2e.svg create mode 100644 docs/svgs/2c08f38f094ac03aea56779378242468.svg create mode 100644 docs/svgs/2e32e0141d372413f25c35045d246695.svg create mode 100644 docs/svgs/2e67a96431b169a7b134a2ab4c5f3457.svg create mode 100644 docs/svgs/3b5fe08410dc2e357ad56d5e09c013c5.svg create mode 100644 docs/svgs/4723cf14b1da3a0da99410e67984882d.svg create mode 100644 docs/svgs/484ca82711fa06cc47440002b35c5a66.svg create mode 100644 docs/svgs/4fd661cfefdf4318d1aa35fb483796b2.svg create mode 100644 docs/svgs/5615b81594cc5f5f54f6c86a17443fea.svg create mode 100644 docs/svgs/68d27da8ea3f60dda13e915b722c2c25.svg create mode 100644 docs/svgs/7adcdcafe095c28283fc5a319a9b6cdb.svg create mode 100644 docs/svgs/7ceafe91152205b4b4d2b3a8b1565477.svg create mode 100644 docs/svgs/81299da238f63ff881f8365a2a3b638a.svg create mode 100644 docs/svgs/8217ed3c32a785f0b5aad4055f432ad8.svg create mode 100644 docs/svgs/873e1472d3de67a29299013cd3eebb39.svg create mode 100644 docs/svgs/88cf5350b4c645c31edaa0cbba3ee5f9.svg create mode 100644 docs/svgs/8edf0a665654dc211972f609f97cb684.svg create mode 100644 docs/svgs/9034606aa4dd18758a6889347abf0302.svg create mode 100644 docs/svgs/904d8a3dfde39f4fb05df9337f05b65f.svg create mode 100644 docs/svgs/9998129ab540f7bc0985032e06e974ed.svg create mode 100644 docs/svgs/a6b6654f6dbe55b7fa2c8f5104fb8370.svg create mode 100644 docs/svgs/ab03e97f653c3b2963d6a503b2a9719b.svg create mode 100644 docs/svgs/bdbf342b57819773421273d508dba586.svg create mode 100644 docs/svgs/c745b9b57c145ec5577b82542b2df546.svg create mode 100644 docs/svgs/c82ef99e46a995ca2c9e5865a66d022f.svg create mode 100644 docs/svgs/cc1dbbcd450fb3182ca125d94560c60d.svg create mode 100644 docs/svgs/cc6323a2493d6ca0f60a01cf3df23307.svg create mode 100644 docs/svgs/cfeb1241a5083b5a03c00504e02f69b3.svg create mode 100644 docs/svgs/d0740c8f4fc4e3563ada4e53f43a81a1.svg create mode 100644 docs/svgs/d0b09e58d8b197fff6fc95ea3bca20fe.svg create mode 100644 docs/svgs/d236438ccc18393a004614a9ddc04188.svg create mode 100644 docs/svgs/d4e2df111ef06f71a9c4108cb9542aff.svg create mode 100644 docs/svgs/d6328eaebbcd5c358f426dbea4bdbf70.svg create mode 100644 docs/svgs/dad09b192ab596ebe6643c184b041b78.svg create mode 100644 docs/svgs/dc61d515b6f36dadf6ab7371698a9ef1.svg create mode 100644 docs/svgs/eb7ee640b8ff98c0068ed4d9ec3baf60.svg create mode 100644 docs/svgs/f0fa7d7a09a30703b30ba8aae9c1c1b5.svg create mode 100644 docs/svgs/f29d99803e443e4e6e87180539b3197f.svg create mode 100644 docs/svgs/f3d9f6f447d13bcef7127ff6c98710a3.svg create mode 100644 docs/svgs/f5feb9f32839cb69ccdf8b0838d8c7cb.svg create mode 100644 docs/svgs/f6128a2d469857252e8e52385e7a00c5.svg create mode 100644 docs/svgs/f8b4daba6c4183a3c1000ebb2d64de5f.svg create mode 100644 docs/svgs/f9c4988898e7f532b9f826a75014ed3c.svg create mode 100644 docs/svgs/fb97d38bcc19230b0acd442e17db879c.svg create mode 100644 docs/svgs/ff7cbf533a4e41019c689366004849fb.svg diff --git a/docs/bconv.md b/docs/bconv.md new file mode 100644 index 0000000..3bb7519 --- /dev/null +++ b/docs/bconv.md @@ -0,0 +1,61 @@ +## Bit-packing + +Bit-packing is performed in `Binarize` layers. 
It packs N 32-bit floats/integers into an N-bit operand according to their signs. For example, performing bit-packing on 128 float numbers produces a 128-bit operand. xnor/xor can only be performed on these packed operands.
+
+The details of bit-packing are in
+
+* https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bitpack.h#L20 (optimized, for tensors with 128 or more channels)
+* https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bitpack.h#L204 (normal, for tensors with fewer than 128 channels)
+
+The optimized version is 4X faster than the normal version. The bit-packing algorithm directly leverages the sign bits of int32 and IEEE 754 float numbers, which eliminates the comparison with zero. SIMD instructions are also used to speed up this process. Note that after the SIMD instructions are performed, the N bits in the result are re-arranged, so they are not in the same order as the N 32-bit inputs. Fortunately, the output of xnor/xor is not affected as long as the input and the weight are re-arranged in the same way. Given this observation, we re-arrange the weights of the binary convs whose inputs are bit-packed in the optimized way. The details are in https://github.com/JDAI-CV/dabnn/blob/master/dabnn/net.cpp#L82.
+
+dabnn presents the following two optimized implementations of binary convs.
+
+## BGEMM
+
+SGEMM (Single float GEneral Matrix Multiplication) is a widely adopted approach to implementing float convolutions in various high-performance scientific programs. In the context of BNNs, an alternative operation to SGEMM is BGEMM, which performs binary matrix multiplication for binary convolution after [im2col](https://github.com/JDAI-CV/dabnn/blob/master/dabnn/im2col.h). dabnn presents an optimized BGEMM. The advantage of GEMM is that it covers all cases of convolutions (various kernel sizes, strides, paddings, and so on) and is easy to implement.
+
+The detailed implementation of BGEMM is in https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bgemm.h.
+
+## Binary Direct Convolution
+
+However, we argue that BGEMM is sub-optimal for binary convolutions, especially on ARM devices.
+
+In addition to the common multiplication and addition operations, BGEMM includes extra operations that count how many 1s are in a vector. Specifically, we denote $U^{M \times N}$ as the space of matrices with dimension $M \times N$, each element of which is a bit-packed vector. Given two matrices (i.e., $\boldsymbol{A} \in U^{M \times K}$ and $\boldsymbol{B} \in U^{K \times N}$) and $\boldsymbol{C} \in \mathbb{N}^{M \times N}$ ($\mathbb{N}$ represents the set of non-negative integers), $\boldsymbol{C} = BGEMM(\boldsymbol{A}, \boldsymbol{B})$ is computed as:
+$$
+    C_{i,j} = \sum\nolimits_{k} bitcount(xnor(\Vec{A_{i,k}}, \Vec{B_{k,j}})),
+$$
+
+where $\Vec{A_{i,k}}$ and $\Vec{B_{k,j}}$ denote the elements of $\boldsymbol{A}$ and $\boldsymbol{B}$. In SGEMM, to amortize the cost of loading memory, $\boldsymbol{C}$ is often calculated as
+$$
+    \boldsymbol{C^{k}} = \boldsymbol{m^{k}}\boldsymbol{n^{k}},
+$$
+$$
+    \boldsymbol{C} \mathrel{+}= \boldsymbol{C^{k}},
+$$
+
+where $\boldsymbol{m^{k}}$ is the $k_{th}$ column of $\boldsymbol{A}$ and $\boldsymbol{n^{k}}$ is the $k_{th}$ row of $\boldsymbol{B}$.
+
+In particular, on ARMv8 (the 64-bit ARM architecture) devices, the bitcount operation consists of two instructions: "cnt" and "addv". "cnt" takes an $N$-byte vector $\alpha$ as input and outputs an $N$-byte vector $\beta$, in which $\beta_{i} = the\_number\_of\_1s(\alpha_{i})$, where $\alpha_{i}$ and $\beta_{i}$ are the $i_{th}$ bytes of $\alpha$ and $\beta$ respectively. "addv" sums up all bytes in a vector and outputs the aggregated scalar. The equation is then expanded as:
+$$
+    C_{i,j} \mathrel{+}= addv(cnt(xnor(\Vec{m^{k}_{i}}, \Vec{n^{k}_{j}}))).
+$$
+
+Thus, the above equation shows that the operation of binary multiply-addition on ARMv8 devices consists of four instructions: xnor, cnt, addv, and addition. Moreover, on ARMv7 (the 32-bit ARM architecture) devices, there is no "addv" instruction, and $\lceil \log_{2}N \rceil$ instructions are needed to sum up all bytes in an $N$-byte vector, so the operation of binary multiply-addition consists of $\lceil \log_{2}N \rceil+3$ instructions on these devices. To improve the efficiency of this operation, we re-arrange the calculation order and calculate $\boldsymbol{C}=BGEMM(\boldsymbol{A},\boldsymbol{B})$ as the multiplication of a row vector $\boldsymbol{p} \in U^{1 \times N}$ and a column vector $\boldsymbol{q} \in U^{M \times 1}$:
+$$
+    C_{i,j} = \boldsymbol{p^{i}}\boldsymbol{q^{j}},
+$$
+
+where $\boldsymbol{p^{i}}$ is the $i_{th}$ row of $\boldsymbol{A}$ and $\boldsymbol{q^{j}}$ is the $j_{th}$ column of $\boldsymbol{B}$.
+
+In this way, the cost of the "addv" instructions can be mostly eliminated by summing up the results of "cnt" in advance:
+$$
+    \Vec{C_{i,j}} = \sum\nolimits_{k} cnt(xnor(\Vec{A_{i,k}}, \Vec{B_{k,j}})),
+$$
+$$
+    C_{i,j} = addv(\Vec{C_{i,j}}).
+$$
+
+Please note that the same transformation cannot be employed in BGEMM, because $\boldsymbol{C}$ is stored as 32-bit integers to save valuable registers. Therefore, in the BGEMM equation we have to utilize "addv" to reduce the vector into an integer before every "addition" instruction. Taking a close look at the above two equations, we can observe some interesting connections between them and the operation of convolution. Specifically, if we treat $\boldsymbol{A} \in U^{M \times K}$ and $\boldsymbol{B} \in U^{K \times N}$ as the weight and the im2col-ed input ($M$: the number of output channels, $N$: output height $\times$ output width, and $K$: the number of bit-packed vectors in a weight filter), the above two equations can be directly interpreted as the definition of convolution. As such, the refined operation of binary convolution is dubbed "Binary Direct Convolution".
+
+The implementation of Binary Direct Convolution is in https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bconv.h.
diff --git a/docs/bconv_CN.md b/docs/bconv_CN.md index 99c6d76..fe50e09 100644 --- a/docs/bconv_CN.md +++ b/docs/bconv_CN.md @@ -1,5 +1,5 @@ ## Bit-packing -在执行二值卷积之前,网络需要手动插入一层`Binarize`。是指将 N 个 32 位的 float/integer,根据和 0 的大小关系,二值化为 N 个 bit (即 0 或 1),并打包成一个 N-bit 的整体,例如对 128 个浮点数进行 bit-packing 之后,就会产生一个 128-bit 的操作数。这一步叫做 bit-packing,做了这一步,后续才可以进行位运算 xnor/xor。 +Bit-packing 在 `Binarize` 层进行,是指将 N 个 32 位的 float/integer,根据和 0 的大小关系,二值化为 N 个 bit (即 0 或 1),并打包成一个 N-bit 的整体,例如对 128 个浮点数进行 bit-packing 之后,就会产生一个 128-bit 的操作数。做了这一步,后续才可以进行位运算 xnor/xor。 Bit-packing 的具体实现在 @@ -20,6 +20,6 @@ BGEMM 的具体实现在 https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bgem 然而 BGEMM 在 ARM 设备上并不高效,因为二值乘-加操作中,加法需要两步 - bitcount 和普通的加法。Bitcount 用来得到一个 N-bit 操作数中有多少 bit 是 1。在 ARMv8 设备上,bitcount 需要两条指令,ARMv7 设备上需要更多条指令。这大大限制了 BGEMM 的速度。因此 dabnn 提出了直接卷积的方法,称为 Binary Direct Convolution (BDC),它是指直接按照卷积的定义来计算卷积。在 BDC 中,通过一个简单的变换,大部分 bitcount 指令会被消除。它的优点是性能比 BGEMM 更高,但不能像 BGEMM 一样用一套代码覆盖所有的情况。 -关于 BDC 如何消除大部分 bitcount 指令,请留意我们即将 publish 的 paper。 +关于 BDC 如何消除大部分 bitcount 指令在 [bconv.md](bconv.md) 中有详细的说明。 BDC 的具体实现在 https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bconv.h。 diff --git a/docs/bconv_out.md b/docs/bconv_out.md new file mode 100644 index 0000000..4ecaae7 --- /dev/null +++ b/docs/bconv_out.md @@ -0,0 +1,47 @@ +## Bit-packing + +Bit-packing is performed in `Binarize` layers. It pack N 32-bit float/integer to an N-bit operand according their signs. For example, performing bit-packing on 128 float numbers produces a 128-bit operand. xnor/xor is only enabled on these packed operands. + +The details of bit-packing are in + +* https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bitpack.h#L20 (optimized, for tensors of 128 and more channels) +* https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bitpack.h#L204 (normal, for tensors of less than 128 channels) + +The optmized version is 4X faster than the normal version. Bit-packing algorithm directly leverage the sign bits of int32 and IEEE 754 float numbers, and then eliminate the comparison with zeros. SIMD instructions are also used to speed up this process. Note that after SIMD instructions is performed, the N bit in the result will be re-arranged so that they are not in the same order with the N 32-bit inputs. Fortunately, the output of xnor/xor is not affected as long as the input and weight is re-arranged in the same way. Given this observation, we re-arranged the weights of binary convs whose inputs is bit-packed in the optmized way. The details are in https://github.com/JDAI-CV/dabnn/blob/master/dabnn/net.cpp#L82. + +dabnn present the following two optmized implementation of binary convs. + +## BGEMM + +SGEMM (Single float GEneral Matrix Multiplication) is a widely adopted approach to implement float convolutions in various high-performance scientific programs. In the context of BNNs, an alternative operation to SGEMM is BGEMM, which performs binary matrix multiplication for binary convolution after [im2col](https://github.com/JDAI-CV/dabnn/blob/master/dabnn/im2col.h). dabnn present optmized BGEMM. The advantage of GEMM is that it covers all cases of convolutions (various kernel size, stride, padding, ..) and it is easy to implement. + +The detailed implementation of BGEMM is in https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bgemm.h. + +## Binary Direct Convolution + +However, we argue that BGEMM is sub-optimal for BGEMM especially on ARM devices. 
+
+In addition to the common multiplication and addition operations, BGEMM includes extra operations that count how many 1s are in a vector. Specifically, we denote $U^{M \times N}$ as the space of matrices with dimension $M \times N$, each element of which is a bit-packed vector. Given two matrices (i.e., $\boldsymbol{A} \in U^{M \times K}$ and $\boldsymbol{B} \in U^{K \times N}$) and $\boldsymbol{C} \in \mathbb{N}^{M \times N}$ ($\mathbb{N}$ represents the set of non-negative integers), $\boldsymbol{C} = BGEMM(\boldsymbol{A}, \boldsymbol{B})$ is computed as:
+$$
+    C_{i,j} = \sum\nolimits_{k} bitcount(xnor(\Vec{A_{i,k}}, \Vec{B_{k,j}})),
+$$
+
+where $\Vec{A_{i,k}}$ and $\Vec{B_{k,j}}$ denote the elements of $\boldsymbol{A}$ and $\boldsymbol{B}$. In SGEMM, to amortize the cost of loading memory, $\boldsymbol{C}$ is often calculated as
+$$
+    \boldsymbol{C^{k}} = \boldsymbol{m^{k}}\boldsymbol{n^{k}},
+$$
+$$
+    \boldsymbol{C} \mathrel{+}= \boldsymbol{C^{k}},
+$$
+
+where $\boldsymbol{m^{k}}$ is the $k_{th}$ column of $\boldsymbol{A}$ and $\boldsymbol{n^{k}}$ is the $k_{th}$ row of $\boldsymbol{B}$.
+
+In particular, on ARMv8 (the 64-bit ARM architecture) devices, the bitcount operation consists of two instructions: "cnt" and "addv". "cnt" takes an $N$-byte vector $\alpha$ as input and outputs an $N$-byte vector $\beta$, in which $\beta_{i} = the\_number\_of\_1s(\alpha_{i})$, where $\alpha_{i}$ and $\beta_{i}$ are the $i_{th}$ bytes of $\alpha$ and $\beta$ respectively. "addv" sums up all bytes in a vector and outputs the aggregated scalar. The equation is then expanded as:
+$$
+    C_{i,j} \mathrel{+}= addv(cnt(xnor(\Vec{m^{k}_{i}}, \Vec{n^{k}_{j}}))).
+$$
+
+Thus, the above equation shows that the operation of binary multiply-addition on ARMv8 devices consists of four instructions: xnor, cnt, addv, and addition. Moreover, on ARMv7 (the 32-bit ARM architecture) devices, there is no "addv" instruction, and $\lceil \log_{2}N \rceil$ instructions are needed to sum up all bytes in an $N$-byte vector, so the operation of binary multiply-addition consists of $\lceil \log_{2}N \rceil+3$ instructions on these devices. To improve the efficiency of this operation, we re-arrange the calculation order and calculate $\boldsymbol{C}=BGEMM(\boldsymbol{A},\boldsymbol{B})$ as the multiplication of a row vector $\boldsymbol{p} \in U^{1 \times N}$ and a column vector $\boldsymbol{q} \in U^{M \times 1}$:
+$$
+    C_{i,j} = \boldsymbol{p^{i}}\boldsymbol{q^{j}},
+$$
+
+where $\boldsymbol{p^{i}}$ is the $i_{th}$ row of $\boldsymbol{A}$ and $\boldsymbol{q^{j}}$ is the $j_{th}$ column of $\boldsymbol{B}$.
+
+In this way, the cost of the "addv" instructions can be mostly eliminated by summing up the results of "cnt" in advance:
+$$
+    \Vec{C_{i,j}} = \sum\nolimits_{k} cnt(xnor(\Vec{A_{i,k}}, \Vec{B_{k,j}})),
+$$
+$$
+    C_{i,j} = addv(\Vec{C_{i,j}}).
+$$
+
+Please note that the same transformation cannot be employed in BGEMM, because $\boldsymbol{C}$ is stored as 32-bit integers to save valuable registers. Therefore, in the BGEMM equation we have to utilize "addv" to reduce the vector into an integer before every "addition" instruction. Taking a close look at the above two equations, we can observe some interesting connections between them and the operation of convolution. Specifically, if we treat $\boldsymbol{A} \in U^{M \times K}$ and $\boldsymbol{B} \in U^{K \times N}$ as the weight and the im2col-ed input ($M$: the number of output channels, $N$: output height $\times$ output width, and $K$: the number of bit-packed vectors in a weight filter), the above two equations can be directly interpreted as the definition of convolution. As such, the refined operation of binary convolution is dubbed "Binary Direct Convolution".
+
+The implementation of Binary Direct Convolution is in https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bconv.h.
diff --git a/docs/design.md b/docs/design.md
new file mode 100644
index 0000000..0f326e5
--- /dev/null
+++ b/docs/design.md
@@ -0,0 +1,11 @@
+## Background
+
+Binary Neural Networks (BNNs) were proposed in [Binary Neural Networks](https://arxiv.org/abs/1602.02830) and [XNOR-Net](https://arxiv.org/abs/1603.05279). In the follow-up papers, [Bi-Real Net](https://arxiv.org/abs/1808.00278) presented new training methods to improve performance, and [BENN](https://arxiv.org/abs/1806.07550) leverages ensembles of BNNs.
+
+BNNs can save 10X+ memory and run several times faster than float NNs. What's more, they theoretically [save 10X energy](https://camo.githubusercontent.com/e725038be60ce4bb698b22480603b636a92beeaf/687474703a2f2f66696c652e656c656366616e732e636f6d2f776562312f4d30302f35352f37392f7049594241467373565f5341504f63534141435742546f6d6531633033392e706e67), so the battery life of devices will be extended a lot.
+
+## Some notes
+
+1. The BNN models can be trained with any framework that supports ONNX. Note that binary convs are custom operations, so please check out [onnx2bnn.md](onnx2bnn.md) for how to make the model compatible with dabnn.
+
+1. For the implementation of binary convolutions, please check out [bconv.md](bconv.md).
diff --git a/docs/onnx2bnn.md b/docs/onnx2bnn.md
new file mode 100644
index 0000000..85d26fc
--- /dev/null
+++ b/docs/onnx2bnn.md
@@ -0,0 +1,29 @@
+## About ONNX
+
+[ONNX](http://onnx.ai) (Open Neural Network Exchange) is an open format which is widely supported or officially integrated by [many frameworks and tools](http://onnx.ai/supported-tools).
+
+## How onnx2bnn converts the models
+
+1. Recognizing binary convolutions, whose weights will be bit-packed. The developers of dabnn added several [optimizers](https://github.com/onnx/onnx/blob/master/docs/Optimizer.md) to ONNX in order to recognize binary convolutions. The details are in the dabnn_*.h files of https://github.com/daquexian/onnx/tree/optimizer_for_bnn/onnx/optimizer/passes. For bit-packing, please check out [this documentation](bconv.md).
+
+1. Updating the weights and biases of the BN layers that follow binary conv layers. Since -1 in binary convs is represented by an unset bit (i.e., 0), and bitcount returns the number of set bits (i.e., 1) in an N-bit operand, a correction is needed to get the correct result of binary convs.
Specifically, denote a as an N-bit operand, b as the number of set bits in a, c as the number of unset bits in a, the result we want is + +> b - c = b - (N - b) = 2 * b - N = 2 * bitcount(a) - N + +It is an affine transform of bitcount(a), so we accordingly update the weight and bias of the corresponding BN layers. + +The details is in https://github.com/JDAI-CV/dabnn/blob/master/tools/onnx2bnn/OnnxConverter.cpp#L530. + +1. Other layers are converted as usual. + +## Notes (Need Attention) + +There are some notes for model conversion. + +1. **The number of input channels of binary convs must be 64 or a multiple of 128 for now.** + +1. Binary convolutions are custom operations in training frameworks (e.g., TensorFlow, PyTorch), so the implementations are various. Unfortunately, the most existing implementations of binary convs are not correct. For example, they always pad 0 to their input, while the input should only be +1 or -1. The developers of dabnn provide [a standard implementation of binary convs in PyTorch](https://gist.github.com/daquexian/7db1e7f1e0a92ab13ac1ad028233a9eb). We advise trainers of BNNs to use this implementation, or implement binary convs in their own training frameworks according to this implementation. + +1. onnx2bnn has multiple recognizing levels. It can even recognize the incorrect binary convs described above (the result will be incorrect though). Please check out [this documentation](https://github.com/JDAI-CV/dabnn/wiki/Train,-export-and-convert-a-dabnn-model) for details. + +1. `group` is not supported for now. diff --git a/docs/svgs/09e963a9a257d451169d317f04f4cf59.svg b/docs/svgs/09e963a9a257d451169d317f04f4cf59.svg new file mode 100644 index 0000000..d0db841 --- /dev/null +++ b/docs/svgs/09e963a9a257d451169d317f04f4cf59.svg @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/0df921cda6b933526ab3fb2c8ba713c0.svg b/docs/svgs/0df921cda6b933526ab3fb2c8ba713c0.svg new file mode 100644 index 0000000..d9c652e --- /dev/null +++ b/docs/svgs/0df921cda6b933526ab3fb2c8ba713c0.svg @@ -0,0 +1,57 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/15e03f3c82848a46865db186cb4c1092.svg b/docs/svgs/15e03f3c82848a46865db186cb4c1092.svg new file mode 100644 index 0000000..4b05949 --- /dev/null +++ b/docs/svgs/15e03f3c82848a46865db186cb4c1092.svg @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/1cb45f0e1e422f5a042ce0dc8710ed27.svg b/docs/svgs/1cb45f0e1e422f5a042ce0dc8710ed27.svg new file mode 100644 index 0000000..81f8b4f --- /dev/null +++ b/docs/svgs/1cb45f0e1e422f5a042ce0dc8710ed27.svg @@ -0,0 +1,11 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/22aefc0b275701a94e3684ede71e1cbf.svg b/docs/svgs/22aefc0b275701a94e3684ede71e1cbf.svg new file mode 100644 index 0000000..7bda69f --- /dev/null +++ b/docs/svgs/22aefc0b275701a94e3684ede71e1cbf.svg @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/252b59b1233ed40f0396e2cd369f514d.svg b/docs/svgs/252b59b1233ed40f0396e2cd369f514d.svg new file mode 100644 index 0000000..a61af42 --- /dev/null +++ b/docs/svgs/252b59b1233ed40f0396e2cd369f514d.svg @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/25bbbd23c3609fee3f26aa5f809dbe2e.svg b/docs/svgs/25bbbd23c3609fee3f26aa5f809dbe2e.svg new file mode 100644 index 0000000..b8109ef 
--- /dev/null +++ b/docs/svgs/25bbbd23c3609fee3f26aa5f809dbe2e.svg @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/2c08f38f094ac03aea56779378242468.svg b/docs/svgs/2c08f38f094ac03aea56779378242468.svg new file mode 100644 index 0000000..fe04c60 --- /dev/null +++ b/docs/svgs/2c08f38f094ac03aea56779378242468.svg @@ -0,0 +1,61 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/2e32e0141d372413f25c35045d246695.svg b/docs/svgs/2e32e0141d372413f25c35045d246695.svg new file mode 100644 index 0000000..924eec2 --- /dev/null +++ b/docs/svgs/2e32e0141d372413f25c35045d246695.svg @@ -0,0 +1,11 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/2e67a96431b169a7b134a2ab4c5f3457.svg b/docs/svgs/2e67a96431b169a7b134a2ab4c5f3457.svg new file mode 100644 index 0000000..c3db29e --- /dev/null +++ b/docs/svgs/2e67a96431b169a7b134a2ab4c5f3457.svg @@ -0,0 +1,21 @@ + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/3b5fe08410dc2e357ad56d5e09c013c5.svg b/docs/svgs/3b5fe08410dc2e357ad56d5e09c013c5.svg new file mode 100644 index 0000000..b5f1e6c --- /dev/null +++ b/docs/svgs/3b5fe08410dc2e357ad56d5e09c013c5.svg @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/4723cf14b1da3a0da99410e67984882d.svg b/docs/svgs/4723cf14b1da3a0da99410e67984882d.svg new file mode 100644 index 0000000..02721e4 --- /dev/null +++ b/docs/svgs/4723cf14b1da3a0da99410e67984882d.svg @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/484ca82711fa06cc47440002b35c5a66.svg b/docs/svgs/484ca82711fa06cc47440002b35c5a66.svg new file mode 100644 index 0000000..31db846 --- /dev/null +++ b/docs/svgs/484ca82711fa06cc47440002b35c5a66.svg @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/4fd661cfefdf4318d1aa35fb483796b2.svg b/docs/svgs/4fd661cfefdf4318d1aa35fb483796b2.svg new file mode 100644 index 0000000..c8e8732 --- /dev/null +++ b/docs/svgs/4fd661cfefdf4318d1aa35fb483796b2.svg @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/5615b81594cc5f5f54f6c86a17443fea.svg b/docs/svgs/5615b81594cc5f5f54f6c86a17443fea.svg new file mode 100644 index 0000000..9525557 --- /dev/null +++ b/docs/svgs/5615b81594cc5f5f54f6c86a17443fea.svg @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/68d27da8ea3f60dda13e915b722c2c25.svg b/docs/svgs/68d27da8ea3f60dda13e915b722c2c25.svg new file mode 100644 index 0000000..e05a724 --- /dev/null +++ b/docs/svgs/68d27da8ea3f60dda13e915b722c2c25.svg @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/7adcdcafe095c28283fc5a319a9b6cdb.svg b/docs/svgs/7adcdcafe095c28283fc5a319a9b6cdb.svg new file mode 100644 index 0000000..e64cb84 --- /dev/null +++ b/docs/svgs/7adcdcafe095c28283fc5a319a9b6cdb.svg @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/7ceafe91152205b4b4d2b3a8b1565477.svg b/docs/svgs/7ceafe91152205b4b4d2b3a8b1565477.svg new file mode 100644 index 0000000..26d7913 --- /dev/null +++ b/docs/svgs/7ceafe91152205b4b4d2b3a8b1565477.svg @@ -0,0 +1,67 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/81299da238f63ff881f8365a2a3b638a.svg b/docs/svgs/81299da238f63ff881f8365a2a3b638a.svg new file mode 100644 index 0000000..f809892 --- /dev/null +++ b/docs/svgs/81299da238f63ff881f8365a2a3b638a.svg @@ -0,0 +1,11 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/8217ed3c32a785f0b5aad4055f432ad8.svg b/docs/svgs/8217ed3c32a785f0b5aad4055f432ad8.svg new file mode 100644 index 0000000..0f2f566 --- /dev/null +++ b/docs/svgs/8217ed3c32a785f0b5aad4055f432ad8.svg @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/873e1472d3de67a29299013cd3eebb39.svg b/docs/svgs/873e1472d3de67a29299013cd3eebb39.svg new file mode 100644 index 0000000..43951de --- /dev/null +++ b/docs/svgs/873e1472d3de67a29299013cd3eebb39.svg @@ -0,0 +1,32 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/88cf5350b4c645c31edaa0cbba3ee5f9.svg b/docs/svgs/88cf5350b4c645c31edaa0cbba3ee5f9.svg new file mode 100644 index 0000000..35ac928 --- /dev/null +++ b/docs/svgs/88cf5350b4c645c31edaa0cbba3ee5f9.svg @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/8edf0a665654dc211972f609f97cb684.svg b/docs/svgs/8edf0a665654dc211972f609f97cb684.svg new file mode 100644 index 0000000..d45aadb --- /dev/null +++ b/docs/svgs/8edf0a665654dc211972f609f97cb684.svg @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/9034606aa4dd18758a6889347abf0302.svg b/docs/svgs/9034606aa4dd18758a6889347abf0302.svg new file mode 100644 index 0000000..6934810 --- /dev/null +++ b/docs/svgs/9034606aa4dd18758a6889347abf0302.svg @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/904d8a3dfde39f4fb05df9337f05b65f.svg b/docs/svgs/904d8a3dfde39f4fb05df9337f05b65f.svg new file mode 100644 index 0000000..71e8e4d --- /dev/null +++ b/docs/svgs/904d8a3dfde39f4fb05df9337f05b65f.svg @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/9998129ab540f7bc0985032e06e974ed.svg b/docs/svgs/9998129ab540f7bc0985032e06e974ed.svg new file mode 100644 index 0000000..c35c54f --- /dev/null +++ b/docs/svgs/9998129ab540f7bc0985032e06e974ed.svg @@ -0,0 +1,56 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/a6b6654f6dbe55b7fa2c8f5104fb8370.svg b/docs/svgs/a6b6654f6dbe55b7fa2c8f5104fb8370.svg new file mode 100644 index 0000000..dab4ad9 --- /dev/null +++ b/docs/svgs/a6b6654f6dbe55b7fa2c8f5104fb8370.svg @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/ab03e97f653c3b2963d6a503b2a9719b.svg b/docs/svgs/ab03e97f653c3b2963d6a503b2a9719b.svg new file mode 100644 index 0000000..fe61acf --- /dev/null +++ b/docs/svgs/ab03e97f653c3b2963d6a503b2a9719b.svg @@ -0,0 +1,11 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/bdbf342b57819773421273d508dba586.svg b/docs/svgs/bdbf342b57819773421273d508dba586.svg new file mode 100644 index 0000000..460e84d --- /dev/null +++ b/docs/svgs/bdbf342b57819773421273d508dba586.svg @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/c745b9b57c145ec5577b82542b2df546.svg b/docs/svgs/c745b9b57c145ec5577b82542b2df546.svg 
new file mode 100644 index 0000000..39b5283 --- /dev/null +++ b/docs/svgs/c745b9b57c145ec5577b82542b2df546.svg @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/c82ef99e46a995ca2c9e5865a66d022f.svg b/docs/svgs/c82ef99e46a995ca2c9e5865a66d022f.svg new file mode 100644 index 0000000..c57a7f7 --- /dev/null +++ b/docs/svgs/c82ef99e46a995ca2c9e5865a66d022f.svg @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/cc1dbbcd450fb3182ca125d94560c60d.svg b/docs/svgs/cc1dbbcd450fb3182ca125d94560c60d.svg new file mode 100644 index 0000000..3cddb10 --- /dev/null +++ b/docs/svgs/cc1dbbcd450fb3182ca125d94560c60d.svg @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/cc6323a2493d6ca0f60a01cf3df23307.svg b/docs/svgs/cc6323a2493d6ca0f60a01cf3df23307.svg new file mode 100644 index 0000000..c945b1e --- /dev/null +++ b/docs/svgs/cc6323a2493d6ca0f60a01cf3df23307.svg @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/cfeb1241a5083b5a03c00504e02f69b3.svg b/docs/svgs/cfeb1241a5083b5a03c00504e02f69b3.svg new file mode 100644 index 0000000..d062c3d --- /dev/null +++ b/docs/svgs/cfeb1241a5083b5a03c00504e02f69b3.svg @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/d0740c8f4fc4e3563ada4e53f43a81a1.svg b/docs/svgs/d0740c8f4fc4e3563ada4e53f43a81a1.svg new file mode 100644 index 0000000..c57a7f7 --- /dev/null +++ b/docs/svgs/d0740c8f4fc4e3563ada4e53f43a81a1.svg @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/d0b09e58d8b197fff6fc95ea3bca20fe.svg b/docs/svgs/d0b09e58d8b197fff6fc95ea3bca20fe.svg new file mode 100644 index 0000000..b0c94b5 --- /dev/null +++ b/docs/svgs/d0b09e58d8b197fff6fc95ea3bca20fe.svg @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/d236438ccc18393a004614a9ddc04188.svg b/docs/svgs/d236438ccc18393a004614a9ddc04188.svg new file mode 100644 index 0000000..72d3442 --- /dev/null +++ b/docs/svgs/d236438ccc18393a004614a9ddc04188.svg @@ -0,0 +1,39 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/d4e2df111ef06f71a9c4108cb9542aff.svg b/docs/svgs/d4e2df111ef06f71a9c4108cb9542aff.svg new file mode 100644 index 0000000..34236e8 --- /dev/null +++ b/docs/svgs/d4e2df111ef06f71a9c4108cb9542aff.svg @@ -0,0 +1,60 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/d6328eaebbcd5c358f426dbea4bdbf70.svg b/docs/svgs/d6328eaebbcd5c358f426dbea4bdbf70.svg new file mode 100644 index 0000000..6ed976d --- /dev/null +++ b/docs/svgs/d6328eaebbcd5c358f426dbea4bdbf70.svg @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/dad09b192ab596ebe6643c184b041b78.svg b/docs/svgs/dad09b192ab596ebe6643c184b041b78.svg new file mode 100644 index 0000000..75e8bbf --- /dev/null +++ b/docs/svgs/dad09b192ab596ebe6643c184b041b78.svg @@ -0,0 +1,61 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/dc61d515b6f36dadf6ab7371698a9ef1.svg b/docs/svgs/dc61d515b6f36dadf6ab7371698a9ef1.svg new file mode 100644 index 0000000..6cc54a6 --- /dev/null +++ 
b/docs/svgs/dc61d515b6f36dadf6ab7371698a9ef1.svg @@ -0,0 +1,48 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/eb7ee640b8ff98c0068ed4d9ec3baf60.svg b/docs/svgs/eb7ee640b8ff98c0068ed4d9ec3baf60.svg new file mode 100644 index 0000000..a405b3c --- /dev/null +++ b/docs/svgs/eb7ee640b8ff98c0068ed4d9ec3baf60.svg @@ -0,0 +1,35 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/f0fa7d7a09a30703b30ba8aae9c1c1b5.svg b/docs/svgs/f0fa7d7a09a30703b30ba8aae9c1c1b5.svg new file mode 100644 index 0000000..b2e1b05 --- /dev/null +++ b/docs/svgs/f0fa7d7a09a30703b30ba8aae9c1c1b5.svg @@ -0,0 +1,11 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/f29d99803e443e4e6e87180539b3197f.svg b/docs/svgs/f29d99803e443e4e6e87180539b3197f.svg new file mode 100644 index 0000000..71e8e4d --- /dev/null +++ b/docs/svgs/f29d99803e443e4e6e87180539b3197f.svg @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/f3d9f6f447d13bcef7127ff6c98710a3.svg b/docs/svgs/f3d9f6f447d13bcef7127ff6c98710a3.svg new file mode 100644 index 0000000..8aa9a82 --- /dev/null +++ b/docs/svgs/f3d9f6f447d13bcef7127ff6c98710a3.svg @@ -0,0 +1,11 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/f5feb9f32839cb69ccdf8b0838d8c7cb.svg b/docs/svgs/f5feb9f32839cb69ccdf8b0838d8c7cb.svg new file mode 100644 index 0000000..0f42931 --- /dev/null +++ b/docs/svgs/f5feb9f32839cb69ccdf8b0838d8c7cb.svg @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/f6128a2d469857252e8e52385e7a00c5.svg b/docs/svgs/f6128a2d469857252e8e52385e7a00c5.svg new file mode 100644 index 0000000..7974cf0 --- /dev/null +++ b/docs/svgs/f6128a2d469857252e8e52385e7a00c5.svg @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/f8b4daba6c4183a3c1000ebb2d64de5f.svg b/docs/svgs/f8b4daba6c4183a3c1000ebb2d64de5f.svg new file mode 100644 index 0000000..7df3a09 --- /dev/null +++ b/docs/svgs/f8b4daba6c4183a3c1000ebb2d64de5f.svg @@ -0,0 +1,63 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/f9c4988898e7f532b9f826a75014ed3c.svg b/docs/svgs/f9c4988898e7f532b9f826a75014ed3c.svg new file mode 100644 index 0000000..aaa57ff --- /dev/null +++ b/docs/svgs/f9c4988898e7f532b9f826a75014ed3c.svg @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/fb97d38bcc19230b0acd442e17db879c.svg b/docs/svgs/fb97d38bcc19230b0acd442e17db879c.svg new file mode 100644 index 0000000..c07dbbc --- /dev/null +++ b/docs/svgs/fb97d38bcc19230b0acd442e17db879c.svg @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/docs/svgs/ff7cbf533a4e41019c689366004849fb.svg b/docs/svgs/ff7cbf533a4e41019c689366004849fb.svg new file mode 100644 index 0000000..d0db841 --- /dev/null +++ b/docs/svgs/ff7cbf533a4e41019c689366004849fb.svg @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file From 281a79676b7153b54b5a44b4cc4bf3efb5856b1b Mon Sep 17 00:00:00 2001 From: daquexian Date: Wed, 5 Jun 2019 12:53:56 +0800 Subject: [PATCH 2/4] Rename design.md -> overall.md --- docs/{design.md => overall.md} | 0 docs/{design_CN.md => overall_CN.md} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename 
docs/{design.md => overall.md} (100%) rename docs/{design_CN.md => overall_CN.md} (100%) diff --git a/docs/design.md b/docs/overall.md similarity index 100% rename from docs/design.md rename to docs/overall.md diff --git a/docs/design_CN.md b/docs/overall_CN.md similarity index 100% rename from docs/design_CN.md rename to docs/overall_CN.md From 0f0c1835e5fadc81d2e929f4206bd6b25dd3deb9 Mon Sep 17 00:00:00 2001 From: daquexian Date: Wed, 5 Jun 2019 12:55:04 +0800 Subject: [PATCH 3/4] Rename bconv_out.md to bconv.md --- docs/bconv.md | 44 ++++++++++++---------------------- docs/bconv.md.in | 61 +++++++++++++++++++++++++++++++++++++++++++++++ docs/bconv_out.md | 47 ------------------------------------ 3 files changed, 76 insertions(+), 76 deletions(-) create mode 100644 docs/bconv.md.in delete mode 100644 docs/bconv_out.md diff --git a/docs/bconv.md b/docs/bconv.md index 3bb7519..4ecaae7 100644 --- a/docs/bconv.md +++ b/docs/bconv.md @@ -21,41 +21,27 @@ The detailed implementation of BGEMM is in https://github.com/JDAI-CV/dabnn/blob However, we argue that BGEMM is sub-optimal for BGEMM especially on ARM devices. -In addition to the common multiplication and add operations, BGEMM includes extra operations that count how many 1s are in a vector. Specifically, we denote $U^{M \times N}$ as the space of matrices with dimension $M \times N$ and each element of it is a bit-packed vector. Given two matrices (i.e., $ \boldsymbol{A} \in U^{M \times K}$ and $\boldsymbol{B} \in U^{K \times N}$), $\boldsymbol{C} \in \mathbb{N}^{M \times N}$ ($\mathbb{N}$ represents the set of non-negative integers), $\boldsymbol{C} = BGEMM(\boldsymbol{A}, \boldsymbol{B})$ is measured as: -$$ - C_{i,j} = \sum\nolimits_{k} bitcount(xnor(\Vec{A_{i,k}}, \Vec{B_{k,j}})), -$$ +In addition to the common multiplication and add operations, BGEMM includes extra operations that count how many 1s are in a vector. Specifically, we denote as the space of matrices with dimension and each element of it is a bit-packed vector. Given two matrices (i.e., and ), ( represents the set of non-negative integers), is measured as: +

-where $\Vec{A_{i,k}}$ and $\Vec{B_{k,j}}$ denotes each element in $ \boldsymbol{A}$ and $\boldsymbol{B}$. In SGEMM, to amortize the cost of loading memory, $\boldsymbol{C}$ is often calculated as -$$ - \boldsymbol{C^{k}} = \boldsymbol{m^{k}}\boldsymbol{n^{k}}, -$$ -$$ - \boldsymbol{C} \mathrel{+}= \boldsymbol{C^{k}}, -$$ +where and denotes each element in and . In SGEMM, to amortize the cost of loading memory, is often calculated as +

+

-where $\boldsymbol{m^{k}}$ is the $k_{th}$ column of $\boldsymbol{A}$ and $\boldsymbol{n^{k}}$ is the $k_{th}$ row of $\boldsymbol{B}$. +where is the column of and is the row of . -In particular, on ARMv8 (the 64-bit ARM architecture) devices, the operation of bitcount contains two instructions: "cnt" and "addv". "cnt" takes an $N$-byte vector $\alpha$ as input and outputs an $N$-byte vector $\beta$, which $\beta_{i} = the\_number\_of\_1s(\alpha_{i})$ where $\alpha_{i}$ and $\beta_{i}$ are the $i_{th}$ byte of $\alpha$ and $\beta$ respectively. "addv" sums up all bytes in a vector and outputs the aggregated scalar. The equation is then expanded as: -$$ - C_{i,j} \mathrel{+}= addv(cnt(xnor(\Vec{m^{k}_{i}}, \Vec{n^{k}_{j}}))). -$$ +In particular, on ARMv8 (the 64-bit ARM architecture) devices, the operation of bitcount contains two instructions: "cnt" and "addv". "cnt" takes an -byte vector as input and outputs an -byte vector , which where and are the byte of and respectively. "addv" sums up all bytes in a vector and outputs the aggregated scalar. The equation is then expanded as: +

-Thus, the above equation shows that the operation of binary multiply-addition on ARMv8 devices consists of four instructions: xnor, cnt, addv, and addition. Moreover, on ARMv7 (the 32-bit ARM architecture) devices, there is even no "addv" instruction and $\lceil \log_{2}N \rceil$ instructions are needed to sum up all bytes in an $N$-byte vector, so the operation of binary multiply-addition consists of $\lceil \log_{2}N \rceil+3$ instructions on these devices. To improve the efficiency of this operation, we re-arrange the calculation order and calculate $\boldsymbol{C}=BGEMM(\boldsymbol{A},\boldsymbol{B})$ as the multiplication of a row vector $\boldsymbol{p} \in U^{1 \times N}$ and $\boldsymbol{q} \in U^{M \times 1}$: -$$ - C_{i,j} = \boldsymbol{p^{i}}\boldsymbol{q^{j}}, -$$ +Thus, the above equation shows that the operation of binary multiply-addition on ARMv8 devices consists of four instructions: xnor, cnt, addv, and addition. Moreover, on ARMv7 (the 32-bit ARM architecture) devices, there is even no "addv" instruction and instructions are needed to sum up all bytes in an -byte vector, so the operation of binary multiply-addition consists of instructions on these devices. To improve the efficiency of this operation, we re-arrange the calculation order and calculate as the multiplication of a row vector and : +

-where $\boldsymbol{p^{i}}$ is the $i_{th}$ row of $\boldsymbol{A}$ and $\boldsymbol{q^{j}}$ is the $j_{th}$ column of $\boldsymbol{B}$. +where is the row of and is the column of . In this way, the cost of "addv" instructions can be mostly squeezed by summing up the results of "cnt" in advance: -$$ - \Vec{C_{i,j}} = \sum\nolimits_{k} cnt(xnor(\Vec{A_{i,k}}, \Vec{B_{k,j}})), -$$ -$$ - C_{i,j} = addv(\Vec{C_{i,j}}). -$$ - -Please note that the same transformation can not be employed in BGEMM because $\boldsymbol{C}$ is stored as 32-bit integers to save the valuable registers. Therefore in the equation of BGEMM, we have to utilize "addv" to reduce the vector into an integer before every instruction of "addition". Taking a close look on the above two equations, we can observe some interesting connections between them and the operation of convolution. Specifically, if we treat $\boldsymbol{A} \in U^{M \times K}$ and $\boldsymbol{B} \in U^{K \times N}$ as the weight and the im2col-ed input ($M$: the number of output channels, $N$: output height $\times$ output width, and $K$: the number of bit-packed vectors in a weight filter), the above two equations can be directly interpreted as the definition of convolution. As such, the refined operation of binary convolution is dubbed as "Binary Direct Convolution". +

+

+ +Please note that the same transformation can not be employed in BGEMM because is stored as 32-bit integers to save the valuable registers. Therefore in the equation of BGEMM, we have to utilize "addv" to reduce the vector into an integer before every instruction of "addition". Taking a close look on the above two equations, we can observe some interesting connections between them and the operation of convolution. Specifically, if we treat and as the weight and the im2col-ed input (: the number of output channels, : output height output width, and : the number of bit-packed vectors in a weight filter), the above two equations can be directly interpreted as the definition of convolution. As such, the refined operation of binary convolution is dubbed as "Binary Direct Convolution". The implementation of Binary Direct Convolution is in https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bconv.h. diff --git a/docs/bconv.md.in b/docs/bconv.md.in new file mode 100644 index 0000000..3bb7519 --- /dev/null +++ b/docs/bconv.md.in @@ -0,0 +1,61 @@ +## Bit-packing + +Bit-packing is performed in `Binarize` layers. It pack N 32-bit float/integer to an N-bit operand according their signs. For example, performing bit-packing on 128 float numbers produces a 128-bit operand. xnor/xor is only enabled on these packed operands. + +The details of bit-packing are in + +* https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bitpack.h#L20 (optimized, for tensors of 128 and more channels) +* https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bitpack.h#L204 (normal, for tensors of less than 128 channels) + +The optmized version is 4X faster than the normal version. Bit-packing algorithm directly leverage the sign bits of int32 and IEEE 754 float numbers, and then eliminate the comparison with zeros. SIMD instructions are also used to speed up this process. Note that after SIMD instructions is performed, the N bit in the result will be re-arranged so that they are not in the same order with the N 32-bit inputs. Fortunately, the output of xnor/xor is not affected as long as the input and weight is re-arranged in the same way. Given this observation, we re-arranged the weights of binary convs whose inputs is bit-packed in the optmized way. The details are in https://github.com/JDAI-CV/dabnn/blob/master/dabnn/net.cpp#L82. + +dabnn present the following two optmized implementation of binary convs. + +## BGEMM + +SGEMM (Single float GEneral Matrix Multiplication) is a widely adopted approach to implement float convolutions in various high-performance scientific programs. In the context of BNNs, an alternative operation to SGEMM is BGEMM, which performs binary matrix multiplication for binary convolution after [im2col](https://github.com/JDAI-CV/dabnn/blob/master/dabnn/im2col.h). dabnn present optmized BGEMM. The advantage of GEMM is that it covers all cases of convolutions (various kernel size, stride, padding, ..) and it is easy to implement. + +The detailed implementation of BGEMM is in https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bgemm.h. + +## Binary Direct Convolution + +However, we argue that BGEMM is sub-optimal for BGEMM especially on ARM devices. + +In addition to the common multiplication and add operations, BGEMM includes extra operations that count how many 1s are in a vector. Specifically, we denote $U^{M \times N}$ as the space of matrices with dimension $M \times N$ and each element of it is a bit-packed vector. 
Given two matrices (i.e., $ \boldsymbol{A} \in U^{M \times K}$ and $\boldsymbol{B} \in U^{K \times N}$), $\boldsymbol{C} \in \mathbb{N}^{M \times N}$ ($\mathbb{N}$ represents the set of non-negative integers), $\boldsymbol{C} = BGEMM(\boldsymbol{A}, \boldsymbol{B})$ is measured as: +$$ + C_{i,j} = \sum\nolimits_{k} bitcount(xnor(\Vec{A_{i,k}}, \Vec{B_{k,j}})), +$$ + +where $\Vec{A_{i,k}}$ and $\Vec{B_{k,j}}$ denotes each element in $ \boldsymbol{A}$ and $\boldsymbol{B}$. In SGEMM, to amortize the cost of loading memory, $\boldsymbol{C}$ is often calculated as +$$ + \boldsymbol{C^{k}} = \boldsymbol{m^{k}}\boldsymbol{n^{k}}, +$$ +$$ + \boldsymbol{C} \mathrel{+}= \boldsymbol{C^{k}}, +$$ + +where $\boldsymbol{m^{k}}$ is the $k_{th}$ column of $\boldsymbol{A}$ and $\boldsymbol{n^{k}}$ is the $k_{th}$ row of $\boldsymbol{B}$. + +In particular, on ARMv8 (the 64-bit ARM architecture) devices, the operation of bitcount contains two instructions: "cnt" and "addv". "cnt" takes an $N$-byte vector $\alpha$ as input and outputs an $N$-byte vector $\beta$, which $\beta_{i} = the\_number\_of\_1s(\alpha_{i})$ where $\alpha_{i}$ and $\beta_{i}$ are the $i_{th}$ byte of $\alpha$ and $\beta$ respectively. "addv" sums up all bytes in a vector and outputs the aggregated scalar. The equation is then expanded as: +$$ + C_{i,j} \mathrel{+}= addv(cnt(xnor(\Vec{m^{k}_{i}}, \Vec{n^{k}_{j}}))). +$$ + +Thus, the above equation shows that the operation of binary multiply-addition on ARMv8 devices consists of four instructions: xnor, cnt, addv, and addition. Moreover, on ARMv7 (the 32-bit ARM architecture) devices, there is even no "addv" instruction and $\lceil \log_{2}N \rceil$ instructions are needed to sum up all bytes in an $N$-byte vector, so the operation of binary multiply-addition consists of $\lceil \log_{2}N \rceil+3$ instructions on these devices. To improve the efficiency of this operation, we re-arrange the calculation order and calculate $\boldsymbol{C}=BGEMM(\boldsymbol{A},\boldsymbol{B})$ as the multiplication of a row vector $\boldsymbol{p} \in U^{1 \times N}$ and $\boldsymbol{q} \in U^{M \times 1}$: +$$ + C_{i,j} = \boldsymbol{p^{i}}\boldsymbol{q^{j}}, +$$ + +where $\boldsymbol{p^{i}}$ is the $i_{th}$ row of $\boldsymbol{A}$ and $\boldsymbol{q^{j}}$ is the $j_{th}$ column of $\boldsymbol{B}$. + +In this way, the cost of "addv" instructions can be mostly squeezed by summing up the results of "cnt" in advance: +$$ + \Vec{C_{i,j}} = \sum\nolimits_{k} cnt(xnor(\Vec{A_{i,k}}, \Vec{B_{k,j}})), +$$ +$$ + C_{i,j} = addv(\Vec{C_{i,j}}). +$$ + +Please note that the same transformation can not be employed in BGEMM because $\boldsymbol{C}$ is stored as 32-bit integers to save the valuable registers. Therefore in the equation of BGEMM, we have to utilize "addv" to reduce the vector into an integer before every instruction of "addition". Taking a close look on the above two equations, we can observe some interesting connections between them and the operation of convolution. Specifically, if we treat $\boldsymbol{A} \in U^{M \times K}$ and $\boldsymbol{B} \in U^{K \times N}$ as the weight and the im2col-ed input ($M$: the number of output channels, $N$: output height $\times$ output width, and $K$: the number of bit-packed vectors in a weight filter), the above two equations can be directly interpreted as the definition of convolution. As such, the refined operation of binary convolution is dubbed as "Binary Direct Convolution". 
+ +The implementation of Binary Direct Convolution is in https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bconv.h. diff --git a/docs/bconv_out.md b/docs/bconv_out.md deleted file mode 100644 index 4ecaae7..0000000 --- a/docs/bconv_out.md +++ /dev/null @@ -1,47 +0,0 @@ -## Bit-packing - -Bit-packing is performed in `Binarize` layers. It pack N 32-bit float/integer to an N-bit operand according their signs. For example, performing bit-packing on 128 float numbers produces a 128-bit operand. xnor/xor is only enabled on these packed operands. - -The details of bit-packing are in - -* https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bitpack.h#L20 (optimized, for tensors of 128 and more channels) -* https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bitpack.h#L204 (normal, for tensors of less than 128 channels) - -The optmized version is 4X faster than the normal version. Bit-packing algorithm directly leverage the sign bits of int32 and IEEE 754 float numbers, and then eliminate the comparison with zeros. SIMD instructions are also used to speed up this process. Note that after SIMD instructions is performed, the N bit in the result will be re-arranged so that they are not in the same order with the N 32-bit inputs. Fortunately, the output of xnor/xor is not affected as long as the input and weight is re-arranged in the same way. Given this observation, we re-arranged the weights of binary convs whose inputs is bit-packed in the optmized way. The details are in https://github.com/JDAI-CV/dabnn/blob/master/dabnn/net.cpp#L82. - -dabnn present the following two optmized implementation of binary convs. - -## BGEMM - -SGEMM (Single float GEneral Matrix Multiplication) is a widely adopted approach to implement float convolutions in various high-performance scientific programs. In the context of BNNs, an alternative operation to SGEMM is BGEMM, which performs binary matrix multiplication for binary convolution after [im2col](https://github.com/JDAI-CV/dabnn/blob/master/dabnn/im2col.h). dabnn present optmized BGEMM. The advantage of GEMM is that it covers all cases of convolutions (various kernel size, stride, padding, ..) and it is easy to implement. - -The detailed implementation of BGEMM is in https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bgemm.h. - -## Binary Direct Convolution - -However, we argue that BGEMM is sub-optimal for BGEMM especially on ARM devices. - -In addition to the common multiplication and add operations, BGEMM includes extra operations that count how many 1s are in a vector. Specifically, we denote as the space of matrices with dimension and each element of it is a bit-packed vector. Given two matrices (i.e., and ), ( represents the set of non-negative integers), is measured as: -

- -where and denotes each element in and . In SGEMM, to amortize the cost of loading memory, is often calculated as -

-

- -where is the column of and is the row of . - -In particular, on ARMv8 (the 64-bit ARM architecture) devices, the operation of bitcount contains two instructions: "cnt" and "addv". "cnt" takes an -byte vector as input and outputs an -byte vector , which where and are the byte of and respectively. "addv" sums up all bytes in a vector and outputs the aggregated scalar. The equation is then expanded as: -

- -Thus, the above equation shows that the operation of binary multiply-addition on ARMv8 devices consists of four instructions: xnor, cnt, addv, and addition. Moreover, on ARMv7 (the 32-bit ARM architecture) devices, there is even no "addv" instruction and instructions are needed to sum up all bytes in an -byte vector, so the operation of binary multiply-addition consists of instructions on these devices. To improve the efficiency of this operation, we re-arrange the calculation order and calculate as the multiplication of a row vector and : -

- -where is the row of and is the column of . - -In this way, the cost of "addv" instructions can be mostly squeezed by summing up the results of "cnt" in advance: -

-

- -Please note that the same transformation can not be employed in BGEMM because is stored as 32-bit integers to save the valuable registers. Therefore in the equation of BGEMM, we have to utilize "addv" to reduce the vector into an integer before every instruction of "addition". Taking a close look on the above two equations, we can observe some interesting connections between them and the operation of convolution. Specifically, if we treat and as the weight and the im2col-ed input (: the number of output channels, : output height output width, and : the number of bit-packed vectors in a weight filter), the above two equations can be directly interpreted as the definition of convolution. As such, the refined operation of binary convolution is dubbed as "Binary Direct Convolution". - -The implementation of Binary Direct Convolution is in https://github.com/JDAI-CV/dabnn/blob/master/dabnn/bconv.h. From 26b4fcf9266be69c75827f37d84a523c8a171d96 Mon Sep 17 00:00:00 2001 From: daquexian Date: Wed, 5 Jun 2019 13:02:05 +0800 Subject: [PATCH 4/4] Update README and misc changes --- README.md | 4 +++- README_CN.md | 4 +++- docs/overall.md | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6f20fb8..6681ed6 100644 --- a/README.md +++ b/README.md @@ -62,7 +62,9 @@ We publish two pretrained binary neural network models based on [Bi-Real Net](ht ## Implementation Details -We plan to participate the [ACM Multimedia 2019 Open Source Software Competition](https://www.acmmm.org/2019/call-for-open-source-software-competition/). Our implementation details will be presented in a 4-page short paper soon. +* The Implementation of Binary Convolutions: [docs/bconv.md](docs/bconv.md) + +* Model Conversion: [docs/onnx2bnn.md](docs/onnx2bnn.md) ## Example project diff --git a/README_CN.md b/README_CN.md index 2062cc8..501a23a 100644 --- a/README_CN.md +++ b/README_CN.md @@ -64,7 +64,9 @@ dabnn_bireal18_imagenet_stem 43279353 ns 41533009 ns 14 <--- ## 技术细节 -我们计划参加 [ACM Multimedia 2019 Open Source Software Competition](https://www.acmmm.org/2019/call-for-open-source-software-competition/). dabnn 的技术细节很快会在一篇四页的短论文中描述。 +* Binary Convolutions 的实现: [docs/bconv.md](docs/bconv_CN.md) + +* 模型转换: [docs/onnx2bnn.md](docs/onnx2bnn_CN.md) ## 示例工程 diff --git a/docs/overall.md b/docs/overall.md index 0f326e5..0d99e61 100644 --- a/docs/overall.md +++ b/docs/overall.md @@ -2,7 +2,7 @@ Binary Neural Networks is proposed in [Binary Neural Networks](https://arxiv.org/abs/1602.02830) and [XNOR-Net](https://arxiv.org/abs/1603.05279). In the following papers, [Bi-Real Net](https://arxiv.org/abs/1808.00278) presented some new training method in order to improve the performance, [BENN](https://arxiv.org/abs/1806.07550) leverages emsemble on BNNs. -BNNs can save 10X+ memory, and several times as fast as float NNs. What's more, it theoretically [saves 10X energy](https://camo.githubusercontent.com/e725038be60ce4bb698b22480603b636a92beeaf/687474703a2f2f66696c652e656c656366616e732e636f6d2f776562312f4d30302f35352f37392f7049594241467373565f5341504f63534141435742546f6d6531633033392e706e67). So the battery life of devices will be expanded a lot. +BNNs can save 10X+ memory, and several times as fast as float NNs. 
What's more, they theoretically [save 10X energy](https://camo.githubusercontent.com/e725038be60ce4bb698b22480603b636a92beeaf/687474703a2f2f66696c652e656c656366616e732e636f6d2f776562312f4d30302f35352f37392f7049594241467373565f5341504f63534141435742546f6d6531633033392e706e67), so the battery life of devices will be extended a lot.
 
 ## Some notes