diff --git a/README.md b/README.md
index 8535f3f..5a31db3 100644
--- a/README.md
+++ b/README.md
@@ -79,13 +79,14 @@ chmod +x ./bin/test_thinker
 
 ## Capabilities
 * [thinker API](thinker/docs/tutorial/thinker_api.md)
-* [Supported quantized OP list](https://github.com/LISTENAI/linger/blob/main/doc/tutorial/support_quant_ops.md) and [model structure constraints](thinker/docs/tutorial/restrain_of_model.md)
+* [Supported quantized OP list](https://github.com/LISTENAI/linger/blob/main/doc/tutorial/support_quant_ops.md)
+* [Model structure constraints](thinker/docs/tutorial/restrain_of_model.md)
 
 ## Application examples
 * Snoring detection: [snoring_net](https://github.com/mywang44/snoring_net)
 
 ## Release notes
-- See [RELEASE](doc/tutorial/release.md)
+- See [RELEASE](thinker/docs/tutorial/release.md)
 
 ## Communication and feedback
 - Bug reports and suggestions are welcome via GitHub Issues
diff --git a/setup.py b/setup.py
index a817521..4c8da0d 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 setup(
     name="pythinker",
-    version="1.1.0",
+    version="2.0.0",
     description="A DeepLearning inference framework for venus",
     author="listenai",
     author_email="lingerthinker@listenai.com",
diff --git a/thinker/docs/tutorial/install.md b/thinker/docs/tutorial/install.md
index b435744..6c24deb 100644
--- a/thinker/docs/tutorial/install.md
+++ b/thinker/docs/tutorial/install.md
@@ -39,7 +39,7 @@ cd thinker && sh ./script/x86_linux.sh
 ### Installing via pip
 ``` shell
-pip install pythinker==1.1.0
+pip install pythinker
 ```
 
 ### Installing via Docker image
@@ -73,12 +73,12 @@ $ sudo systemctl start docker # usage of the systemctl command
 3. Pull and load the image
 1) Pull the image
 ```shell
-docker pull listenai/thinker:1.1.0
+docker pull listenai/thinker:2.0.0
 ```
 
 2) Run the container
 ```shell
-docker container run -it listenai/thinker:1.1.0 /bin/bash
+docker container run -it listenai/thinker:2.0.0 /bin/bash
 ```
 
 If everything is working, running the command above will drop you at a command-line prompt inside the container.
diff --git a/thinker/docs/tutorial/release.md b/thinker/docs/tutorial/release.md
index 9ed9297..b3336da 100644
--- a/thinker/docs/tutorial/release.md
+++ b/thinker/docs/tutorial/release.md
@@ -1,2 +1,3 @@
+v2.0.0 2023.8.17 Fixed the offline memory-allocation problem of iqAdd and iqSub; the packing tool must stay consistent with the engine code, hence the major-version bump;
 v1.1.0 2023.8.15 The packing tool adds constant folding and the ability to specify where data is placed, and improves generation of the memory-analysis report. The engine executor adds some commonly used interfaces for broader applicability; operators are refined and more new operators are supported;
 V1.0.0 2022.10.24 Initial version
\ No newline at end of file
diff --git a/thinker/docs/tutorial/restrain_of_model.md b/thinker/docs/tutorial/restrain_of_model.md
index d2ff329..e28ce21 100644
--- a/thinker/docs/tutorial/restrain_of_model.md
+++ b/thinker/docs/tutorial/restrain_of_model.md
@@ -3,16 +3,16 @@
 - The total available PSRAM is 8 MB and the built-in FLASH is 8 MB; a single model must not exceed 8 MB overall.
 ## II. Per-operator constraints
 ### 1. Constraints shared by the conv1dint/conv2dint/deconv2dint/pool operators
-  - kernel_size = {1,2,3,4,5}; kernel_h ≠ kernel_w is supported (conv1dint supports kernel_size > 5)
-  - stride_size = {1,2,4}; stride_h ≠ stride_w is supported
-  - pad_size = {0,1,2,3}; the four pad directions can be set independently
+  - kernel_size = (1,2,3,4,5); kernel_h ≠ kernel_w is supported (conv1dint supports kernel_size > 5)
+  - stride_size = (1,2,4); stride_h ≠ stride_w is supported
+  - pad_size = (0,1,2,3); the four pad directions can be set independently
   - in_w >= weight_w, and in_h >= weight_h
   - weight_w >= stride_w, and weight_h >= stride_h
   - pad_h_up < weight_h, and pad_h_down < weight_h
   - pad_w_right < weight_w, and pad_w_left < weight_w
 ### 2. deconv constraints
-  * when stride_h(stride_w) = 2, kernel_h(kernel_w) = {2,3,4,5}
-  * when stirde_h(stride_w) = 4, kernel_h(kernel_w) = {4,5}
+  * when stride_h(stride_w) = 2, kernel_h(kernel_w) = (2,3,4,5)
+  * when stride_h(stride_w) = 4, kernel_h(kernel_w) = (4,5)
 ### 3. linearInt/BmmInt constraints
 * The left input matrix (M*N) must not exceed 64 KB after alignment
 * Alignment for each data type:
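The deconv section above ties each supported stride to a short list of legal kernel sizes. A minimal sketch of such a validity check (a hypothetical helper written for illustration, not code from this repo):

```c
#include <stdbool.h>

/* Hypothetical check mirroring the documented deconv constraints:
 * stride 2 allows kernels 2..5, stride 4 allows kernels 4..5. */
static bool deconv_params_supported(int stride, int kernel) {
  switch (stride) {
    case 2: return kernel >= 2 && kernel <= 5;
    case 4: return kernel == 4 || kernel == 5;
    default: return false;  /* other strides are not listed as supported */
  }
}
```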
diff --git a/thinker/executor/c_api/thinker_define.h b/thinker/executor/c_api/thinker_define.h
index 4e2c686..4516d66 100644
--- a/thinker/executor/c_api/thinker_define.h
+++ b/thinker/executor/c_api/thinker_define.h
@@ -20,8 +20,8 @@
 #define STR_IMP(x) #x
 #define STR(x) STR_IMP(x)
 
-#define THINKER_VERSION_MAJOR 1
-#define THINKER_VERSION_MINOR 1
+#define THINKER_VERSION_MAJOR 2
+#define THINKER_VERSION_MINOR 0
 #define THINKER_VERSION_PATCH 0
 #define THINKER_VERSION \
   STR(THINKER_VERSION_MAJOR) \
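THINKER_VERSION relies on two-level stringification: `STR_IMP` turns its argument into a string literal verbatim, so `STR` must expand the macro first for `THINKER_VERSION_MAJOR` to become `"2"` rather than `"THINKER_VERSION_MAJOR"`. A standalone illustration (the `DEMO_*` names and the dot separator are assumptions for the demo, not the header's exact output format):

```c
#include <stdio.h>

#define DEMO_STR_IMP(x) #x          /* stringifies the token as written  */
#define DEMO_STR(x) DEMO_STR_IMP(x) /* expands x first, then stringifies */

#define DEMO_MAJOR 2
#define DEMO_MINOR 0
#define DEMO_PATCH 0
#define DEMO_VERSION \
  DEMO_STR(DEMO_MAJOR) "." DEMO_STR(DEMO_MINOR) "." DEMO_STR(DEMO_PATCH)

int main(void) {
  printf("%s\n", DEMO_VERSION);             /* prints: 2.0.0      */
  printf("%s\n", DEMO_STR_IMP(DEMO_MAJOR)); /* prints: DEMO_MAJOR */
  return 0;
}
```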
diff --git a/thinker/executor/core/ops/venus/iqadd.h b/thinker/executor/core/ops/venus/iqadd.h
index a3bd25a..a89a06a 100644
--- a/thinker/executor/core/ops/venus/iqadd.h
+++ b/thinker/executor/core/ops/venus/iqadd.h
@@ -16,19 +16,17 @@ int32_t iqadd_luna(tTensor *X1, tTensor *X2, tTensor *Temp, tTensor *Y) {
   void *src1 = (void *)X1->dptr_;
   void *src2 = (void *)X2->dptr_;
   void *dst = (void *)Y->dptr_;
-  int8_t *tmp_buf1 = NULL;
-  int8_t *tmp_buf2 = NULL;
   size_t size = getTensorSize(X1);
+
+  if ((x1_q < y_q) || (x2_q < y_q)) {
+    return ret;
+  }
+
   int32_t x1_is_psram = 0;
   int32_t x2_is_psram = 0;
   int32_t y_is_psram = 0;
   int32_t used_tmp_size = 0;
-  if (Temp)
-  {
-    tmp_buf1 = (int8_t *)Temp->dptr_;
-  }
-
   if ((1 == X1->mem_.type_ || 3 == X1->mem_.type_) &&
       (Temp))  // need copy psram to share
   {
@@ -48,55 +46,28 @@ int32_t iqadd_luna(tTensor *X1, tTensor *X2, tTensor *Temp, tTensor *Y) {
   }
 
   if (equalShape(&X1->shape_, &X2->shape_) && (X1->dtype_ == X2->dtype_)) {
-    int32_t scale1 = 1;
-    int32_t scale2 = 1;
-    int32_t shift1 = 0;
-    int32_t shift2 = 0;
-
-    if (x1_q > y_q)
-    {
-      shift1 = x1_q - y_q;
-    }
-    else
-    {
-      scale1 = (1 << (y_q - x1_q));
-    }
-    if (x2_q > y_q)
-    {
-      shift2 = x2_q - y_q;
-    }
-    else
-    {
-      scale2 = (1 << (y_q - x2_q));
-    }
+    int32_t shift1 = x1_q - y_q;
+    int32_t shift2 = x2_q - y_q;
     switch (X1->dtype_) {
       case Int8: {
         if (x1_is_psram){
           src1 = (int8_t *)Temp->dptr_;
           memcpy(src1, (void *)X1->dptr_, size * sizeof(int8_t));
-          if (x1_q != y_q){
-            ret = luna_scale_q7_int8((const q7_t *)src1, (scale1), (int8_t *)src1, size, shift1);
-          }
         }
-        else{
-          if (x1_q != y_q){
-            src1 = (int8_t *)Temp->dptr_;
-            ret = luna_scale_q7_int8((const q7_t *)X1->dptr_, (scale1), (int8_t *)src1, size, shift1);
-          }
+
+        if (x1_q != y_q){
+          ret = luna_scale_q7_int8((const q7_t *)src1, (1), (int8_t *)Temp->dptr_, size, shift1);
+          src1 = (int8_t *)Temp->dptr_;
         }
+
         if (x2_is_psram){
-          src2 = (int8_t *)Temp->dptr_ + (x1_is_psram) * size;
+          src2 = (int8_t *)Temp->dptr_ + ((x1_is_psram) || (x1_q != y_q)) * size;
           memcpy(src2, (void *)X2->dptr_, size * sizeof(int8_t));
+        }
         if (x2_q != y_q){
-          ret = luna_scale_q7_int8((const q7_t *)src2, (scale2), (int8_t *)src2, size, shift2);
+          ret = luna_scale_q7_int8((const q7_t *)src2, (1), (int8_t *)Temp->dptr_ + ((x1_is_psram) || (x1_q != y_q)) * size, size, shift2);
+          src2 = (int8_t *)Temp->dptr_ + ((x1_is_psram) || (x1_q != y_q)) * size;
         }
-        }
-        else{
-          if (x2_q != y_q){
-            src2 = (int8_t *)Temp->dptr_ + ((x1_is_psram) || (x1_q != y_q)) * size;
-            ret = luna_scale_q7_int8((const q7_t *)X2->dptr_, (scale2), (int8_t *)src2, size, shift2);
-          }
-        }
         if (y_is_psram){
           dst = (int8_t *)Temp->dptr_;
@@ -107,62 +78,6 @@ int32_t iqadd_luna(tTensor *X1, tTensor *X2, tTensor *Temp, tTensor *Y) {
         if (y_is_psram){
           memcpy((void *)Y->dptr_, dst, size * sizeof(int8_t));
         }
-
-        // if (Temp)
-        // {
-        //   tmp_buf2 = tmp_buf1 + size * sizeof(int8_t);
-        //   if (x1_is_psram)
-        //   {
-        //     memcpy(tmp_buf1, src1, size * sizeof(int8_t));
-        //     src1 = tmp_buf1;
-        //     used_tmp_size += size * sizeof(int8_t);
-        //   }
-        //   if (x2_is_psram)
-        //   {
-        //     memcpy(tmp_buf2, src2, size * sizeof(int8_t));
-        //     src2 = tmp_buf2;
-        //     used_tmp_size += size * sizeof(int8_t);
-        //   }
-        //   if (y_is_psram)
-        //   {
-        //     dst = tmp_buf1;
-        //   }
-        //   ret = luna_scale_q7_int8((const q7_t *)src1, (scale1), (int8_t *)tmp_buf1, size,
-        //                            shift1);
-        //   ret = luna_scale_q7_int8((const q7_t *)src2, (scale2), (int8_t *)tmp_buf2, size,
-        //                            shift2);
-        //   ret = luna_add_q7_int8((const q7_t *)tmp_buf1, (q7_t *)tmp_buf2, (int8_t *)dst,
-        //                          size, 0);
-        // }
-        // else
-        // {
-        //   ret = luna_scale_q7_int8((const q7_t *)src1, (scale1), (int8_t *)dst, size,
-        //                            shift1);
-        //   ret = luna_scale_q7_int8((const q7_t *)src2, (scale2), (int8_t *)src2, size,
-        //                            shift2);
-        //   ret = luna_add_q7_int8((const q7_t *)dst, (q7_t *)src2, (int8_t *)dst,
-        //                          size, 0);
-        // }
-        // if (y_is_psram)
-        // {
-        //   memcpy((void *)Y->dptr_, dst, size * sizeof(int8_t));
-        // }
-      } break;
-      case Int16: {
-        ret = luna_scale_q15_int16((const q15_t *)src1, (scale1), (int16_t *)dst,
-                                   size, shift1);
-        ret = luna_scale_q15_int16((const q15_t *)src2, (scale2), (int16_t *)src2,
-                                   size, shift2);
-        ret = luna_add_q15_int16((const q15_t *)dst, (q15_t *)src2,
-                                 (int16_t *)dst, size, 0);
-      } break;
-      case Int32: {
-        ret = luna_scale_q31_int32((const q31_t *)src1, (scale1), (int32_t *)dst,
-                                   size, shift1);
-        ret = luna_scale_q31_int32((const q31_t *)src2, (scale2), (int32_t *)src2,
-                                   size, shift2);
-        ret = luna_add_q31_int32((const q31_t *)dst, (q31_t *)src2,
-                                 (int32_t *)dst, size, 0);
       } break;
       default:
        break;
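The rewrite drops the old scale-or-shift pair in favor of a plain right shift of `x_q - y_q`, which is exactly why the new early return rejects `x1_q < y_q` and `x2_q < y_q`: aligning an input to a larger output Q would require a left shift. A self-contained toy of the staging-plus-rescale pattern, with plain loops standing in for `luna_scale_q7_int8`/`luna_add_q7_int8` (assumed semantics, not the real kernels):

```c
#include <stdint.h>
#include <stdio.h>

/* Stand-in for luna_scale_q7_int8 with scale == 1: align Q(x_q) data to
 * Q(y_q) by an arithmetic right shift of (x_q - y_q). */
static void rescale_q7(const int8_t *src, int8_t *dst, int n, int shift) {
  for (int i = 0; i < n; ++i) dst[i] = (int8_t)(src[i] >> shift);
}

int main(void) {
  int8_t x1[4] = {64, -32, 16, 8}; /* Q6: 1.0, -0.5, 0.25, 0.125 */
  int8_t x2[4] = {32, 32, 32, 32}; /* Q5: 1.0 each               */
  int8_t tmp[8];                   /* scratch: slot 0 = x1, slot 1 = x2,
                                      mirroring the offset arithmetic above */
  int8_t y[4];                     /* output in Q4 */
  int x1_q = 6, x2_q = 5, y_q = 4;

  rescale_q7(x1, tmp, 4, x1_q - y_q);     /* x1 into the first slot  */
  rescale_q7(x2, tmp + 4, 4, x2_q - y_q); /* x2 into the second slot */
  for (int i = 0; i < 4; ++i) y[i] = (int8_t)(tmp[i] + tmp[4 + i]);

  printf("%d %d %d %d\n", y[0], y[1], y[2], y[3]); /* 32 8 20 18 (Q4) */
  return 0;
}
```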
diff --git a/thinker/executor/core/ops/venus/iqsub.h b/thinker/executor/core/ops/venus/iqsub.h
index 7e97e02..2c9e681 100644
--- a/thinker/executor/core/ops/venus/iqsub.h
+++ b/thinker/executor/core/ops/venus/iqsub.h
@@ -22,18 +22,27 @@ int32_t iqsub_luna(tTensor *X1, tTensor *X2, tTensor *Temp, tTensor *Y) {
     return ret;
   }
 
+  int32_t x1_is_psram = 0;
+  int32_t x2_is_psram = 0;
+  int32_t y_is_psram = 0;
+  int32_t used_tmp_size = 0;
+
   if ((1 == X1->mem_.type_ || 3 == X1->mem_.type_) &&
       (Temp))  // need copy psram to share
   {
-    src1 = (void *)Temp->dptr_;
-    memcpy(src1, (void *)X1->dptr_, size * X1->byte_);
+    x1_is_psram = 1;
   }
 
   if ((1 == X2->mem_.type_ || 3 == X2->mem_.type_) &&
       (Temp))  // need copy psram to share
   {
-    src2 = (void *)Temp->dptr_;
-    memcpy(src2, (void *)X2->dptr_, size * X2->byte_);
+    x2_is_psram = 1;
+  }
+
+  if ((1 == Y->mem_.type_ || 3 == Y->mem_.type_) &&
+      (Temp))  // need copy psram to share
+  {
+    y_is_psram = 1;
   }
 
   if (equalShape(&X1->shape_, &X2->shape_) && (X1->dtype_ == X2->dtype_)) {
@@ -41,28 +50,35 @@ int32_t iqsub_luna(tTensor *X1, tTensor *X2, tTensor *Temp, tTensor *Y) {
    int32_t shift1 = x1_q - y_q;
    int32_t shift2 = x2_q - y_q;
    switch (X1->dtype_) {
      case Int8: {
-        ret = luna_scale_q7_int8((const q7_t *)src1, (1), (int8_t *)dst, size,
-                                 shift1);
-        ret = luna_scale_q7_int8((const q7_t *)src2, (1), (int8_t *)src2, size,
-                                 shift2);
+        if (x1_is_psram){
+          src1 = (int8_t *)Temp->dptr_;
+          memcpy(src1, (void *)X1->dptr_, size * sizeof(int8_t));
+        }
+        if (x1_q != y_q){
+          ret = luna_scale_q7_int8((const q7_t *)src1, (1), (int8_t *)Temp->dptr_, size, shift1);
+          src1 = (int8_t *)Temp->dptr_;
+        }
+
+        if (x2_is_psram){
+          src2 = (int8_t *)Temp->dptr_ + ((x1_is_psram) || (x1_q != y_q)) * size;
+          memcpy(src2, (void *)X2->dptr_, size * sizeof(int8_t));
+        }
+        if (x2_q != y_q){
+          ret = luna_scale_q7_int8((const q7_t *)src2, (1), (int8_t *)Temp->dptr_ + ((x1_is_psram) || (x1_q != y_q)) * size, size, shift2);
+          src2 = (int8_t *)Temp->dptr_ + ((x1_is_psram) || (x1_q != y_q)) * size;
+        }
+
+        if (y_is_psram){
+          dst = (int8_t *)Temp->dptr_;
+        }
+
-        ret = luna_sub_q7_int8((const q7_t *)dst, (q7_t *)src2, (int8_t *)dst, size, 0);
+        ret = luna_sub_q7_int8((const q7_t *)src1, (q7_t *)src2, (int8_t *)dst, size, 0);
-      } break;
-      case Int16: {
-        ret = luna_scale_q15_int16((const q15_t *)src1, (1), (int16_t *)dst,
-                                   size, shift1);
-        ret = luna_scale_q15_int16((const q15_t *)src2, (1), (int16_t *)src2,
-                                   size, shift2);
-        ret = luna_sub_q15_int16((const q15_t *)dst, (q15_t *)src2,
-                                 (int16_t *)dst, size, 0);
-      } break;
-      case Int32: {
-        ret = luna_scale_q31_int32((const q31_t *)src1, (1), (int32_t *)dst,
-                                   size, shift1);
-        ret = luna_scale_q31_int32((const q31_t *)src2, (1), (int32_t *)src2,
-                                   size, shift2);
-        ret = luna_sub_q31_int32((const q31_t *)dst, (q31_t *)src2,
-                                 (int32_t *)dst, size, 0);
+
+        if (y_is_psram){
+          memcpy((void *)Y->dptr_, dst, size * sizeof(int8_t));
+        }
+
      } break;
      default:
        break;
diff --git a/thinker/resource_packer/ops/iqSub.py b/thinker/resource_packer/ops/iqSub.py
index eb70ff5..0f6cb1c 100644
--- a/thinker/resource_packer/ops/iqSub.py
+++ b/thinker/resource_packer/ops/iqSub.py
@@ -1,11 +1,33 @@
 import math
+import numpy as np
+from typing import List
+
+from ...graph import Tensor
-from ...enum_defines import DevType
+from ...enum_defines import DevType, MemType
 from .base import iqBinaryOperator, register_op
 
 
 @register_op
 class iqSub(iqBinaryOperator):
-    pass
+    def get_workspace(self, dev_type: DevType) -> List[Tensor]:
+        x1 = self.inputs[0]
+        x2 = self.inputs[1]
+        size = x1.nbytes
+        Y = self.outputs[0]
+
+        scale_x = self.attrs["scale_x"]
+        scale_y = self.attrs["scale_y"]
+        scale_o = self.attrs["scale_o"]
+
+        workspace_size = 0
+        if (scale_x != scale_o) or x1.mem_type != MemType.SHARE_MEM:
+            workspace_size += size
+        if (scale_y != scale_o) or x2.mem_type != MemType.SHARE_MEM:
+            workspace_size += size
+        if Y.mem_type != MemType.SHARE_MEM:
+            workspace_size = max(workspace_size, size)
+
+        max_workspace = Tensor.from_shape([workspace_size], np.int8, MemType.SHARE_MEM)
+        return [max_workspace]
 
 
 __all__ = ["iqSub"]
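`get_workspace` sizes the shared-memory scratch to match the executor's staging: one `size`-byte slot for each input that must be copied in from PSRAM or rescaled, and at least one slot when the output itself lives outside shared memory, so the result can be assembled there before the final copy back. The same sizing rule, sketched in C with illustrative names (this helper does not exist in the repo):

```c
#include <stddef.h>

/* Illustrative only: scratch bytes needed by the iqSub staging scheme.
 * "needs_slot" means the input is in PSRAM or its Q differs from the
 * output's Q; "y_off_share" means the output is not in shared memory. */
static size_t iqsub_workspace_bytes(size_t size, int x1_needs_slot,
                                    int x2_needs_slot, int y_off_share) {
  size_t ws = 0;
  if (x1_needs_slot) ws += size;
  if (x2_needs_slot) ws += size;
  if (y_off_share && ws < size) ws = size; /* max(workspace_size, size) */
  return ws;
}
```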