diff --git a/README.md b/README.md
index 8535f3f..5a31db3 100644
--- a/README.md
+++ b/README.md
@@ -79,13 +79,14 @@ chmod +x ./bin/test_thinker
 
 ## Capabilities
 * [thinker API](thinker/docs/tutorial/thinker_api.md)
-* [Supported quantized OP list](https://github.com/LISTENAI/linger/blob/main/doc/tutorial/support_quant_ops.md) and [model structure constraints](thinker/docs/tutorial/restrain_of_model.md)
+* [Supported quantized OP list](https://github.com/LISTENAI/linger/blob/main/doc/tutorial/support_quant_ops.md)
+* [Model structure constraints](thinker/docs/tutorial/restrain_of_model.md)
 
 ## Application examples
 * Snoring detection: [snoring_net](https://github.com/mywang44/snoring_net)
 
 ## Release notes
-- See [RELEASE](doc/tutorial/release.md)
+- See [RELEASE](thinker/docs/tutorial/release.md)
 
 ## Communication and feedback
 - Bug reports and suggestions are welcome via GitHub Issues
diff --git a/setup.py b/setup.py
index a817521..4c8da0d 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 setup(
     name="pythinker",
-    version="1.1.0",
+    version="2.0.0",
     description="A DeepLearning inference framework for venus",
     author="listenai",
     author_email="lingerthinker@listenai.com",
diff --git a/thinker/docs/tutorial/install.md b/thinker/docs/tutorial/install.md
index b435744..6c24deb 100644
--- a/thinker/docs/tutorial/install.md
+++ b/thinker/docs/tutorial/install.md
@@ -39,7 +39,7 @@ cd thinker && sh ./script/x86_linux.sh
 ### Installing via pip
 ``` shell
-pip install pythinker==1.1.0
+pip install pythinker
 ```
 
 ### Installing via Docker image
@@ -73,12 +73,12 @@ $ sudo systemctl start docker # usage of the systemctl command
 3. Pull and load the image
 1) Pull the image
 ```shell
-docker pull listenai/thinker:1.1.0
+docker pull listenai/thinker:2.0.0
 ```
 
 2) Run the container
 ```shell
-docker container run -it listenai/thinker:1.1.0 /bin/bash
+docker container run -it listenai/thinker:2.0.0 /bin/bash
 ```
 
 If everything is working, running the command above will drop you at a command-line prompt inside the container.
diff --git a/thinker/docs/tutorial/release.md b/thinker/docs/tutorial/release.md
index 9ed9297..b3336da 100644
--- a/thinker/docs/tutorial/release.md
+++ b/thinker/docs/tutorial/release.md
@@ -1,2 +1,3 @@
+v2.0.0 2023.8.17 Fixed the offline memory-allocation problem of iqAdd and iqSub; the packing tool must stay consistent with the engine code, hence the major-version bump;
 v1.1.0 2023.8.15 The packing tool adds constant folding and the ability to specify where data is placed, and improves generation of the memory-analysis report. The engine executor adds some commonly used interfaces for broader applicability; operators are refined and more new operators are supported;
 V1.0.0 2022.10.24 Initial version
\ No newline at end of file
diff --git a/thinker/docs/tutorial/restrain_of_model.md b/thinker/docs/tutorial/restrain_of_model.md
index d2ff329..e28ce21 100644
--- a/thinker/docs/tutorial/restrain_of_model.md
+++ b/thinker/docs/tutorial/restrain_of_model.md
@@ -3,16 +3,16 @@
 - The total available PSRAM is 8 MB and the built-in FLASH is 8 MB; a single model must not exceed 8 MB overall.
 ## II. Per-operator constraints
 ### 1. Constraints shared by the conv1dint/conv2dint/deconv2dint/pool operators
-  - kernel_size = {1,2,3,4,5}; kernel_h ≠ kernel_w is supported (conv1dint supports kernel_size > 5)
-  - stride_size = {1,2,4}; stride_h ≠ stride_w is supported
-  - pad_size = {0,1,2,3}; the four pad directions can be set independently
+  - kernel_size = (1,2,3,4,5); kernel_h ≠ kernel_w is supported (conv1dint supports kernel_size > 5)
+  - stride_size = (1,2,4); stride_h ≠ stride_w is supported
+  - pad_size = (0,1,2,3); the four pad directions can be set independently
   - in_w >= weight_w, and in_h >= weight_h
   - weight_w >= stride_w, and weight_h >= stride_h
   - pad_h_up < weight_h, and pad_h_down < weight_h
   - pad_w_right < weight_w, and pad_w_left < weight_w
 ### 2. deconv constraints
-  * when stride_h(stride_w) = 2, kernel_h(kernel_w) = {2,3,4,5}
-  * when stirde_h(stride_w) = 4, kernel_h(kernel_w) = {4,5}
+  * when stride_h(stride_w) = 2, kernel_h(kernel_w) = (2,3,4,5)
+  * when stride_h(stride_w) = 4, kernel_h(kernel_w) = (4,5)
 ### 3. linearInt/BmmInt constraints
 * The left input matrix (M*N) must not exceed 64 KB after alignment
 * Alignment for each data type:
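The deconv section above ties each supported stride to a short list of legal kernel sizes. A minimal sketch of such a validity check (a hypothetical helper written for illustration, not code from this repo):

```c
#include <stdbool.h>

/* Hypothetical check mirroring the documented deconv constraints:
 * stride 2 allows kernels 2..5, stride 4 allows kernels 4..5. */
static bool deconv_params_supported(int stride, int kernel) {
  switch (stride) {
    case 2: return kernel >= 2 && kernel <= 5;
    case 4: return kernel == 4 || kernel == 5;
    default: return false;  /* other strides are not listed as supported */
  }
}
```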
diff --git a/thinker/executor/c_api/thinker_define.h b/thinker/executor/c_api/thinker_define.h
index 4e2c686..4516d66 100644
--- a/thinker/executor/c_api/thinker_define.h
+++ b/thinker/executor/c_api/thinker_define.h
@@ -20,8 +20,8 @@
 #define STR_IMP(x) #x
 #define STR(x) STR_IMP(x)
 
-#define THINKER_VERSION_MAJOR 1
-#define THINKER_VERSION_MINOR 1
+#define THINKER_VERSION_MAJOR 2
+#define THINKER_VERSION_MINOR 0
 #define THINKER_VERSION_PATCH 0
 #define THINKER_VERSION \
   STR(THINKER_VERSION_MAJOR) \
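THINKER_VERSION relies on two-level stringification: `STR_IMP` turns its argument into a string literal verbatim, so `STR` must expand the macro first for `THINKER_VERSION_MAJOR` to become `"2"` rather than `"THINKER_VERSION_MAJOR"`. A standalone illustration (the `DEMO_*` names and the dot separator are assumptions for the demo, not the header's exact output format):

```c
#include <stdio.h>

#define DEMO_STR_IMP(x) #x          /* stringifies the token as written  */
#define DEMO_STR(x) DEMO_STR_IMP(x) /* expands x first, then stringifies */

#define DEMO_MAJOR 2
#define DEMO_MINOR 0
#define DEMO_PATCH 0
#define DEMO_VERSION \
  DEMO_STR(DEMO_MAJOR) "." DEMO_STR(DEMO_MINOR) "." DEMO_STR(DEMO_PATCH)

int main(void) {
  printf("%s\n", DEMO_VERSION);             /* prints: 2.0.0      */
  printf("%s\n", DEMO_STR_IMP(DEMO_MAJOR)); /* prints: DEMO_MAJOR */
  return 0;
}
```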
diff --git a/thinker/executor/core/ops/venus/iqadd.h b/thinker/executor/core/ops/venus/iqadd.h
index a3bd25a..a89a06a 100644
--- a/thinker/executor/core/ops/venus/iqadd.h
+++ b/thinker/executor/core/ops/venus/iqadd.h
@@ -16,19 +16,17 @@ int32_t iqadd_luna(tTensor *X1, tTensor *X2, tTensor *Temp, tTensor *Y) {
   void *src1 = (void *)X1->dptr_;
   void *src2 = (void *)X2->dptr_;
   void *dst = (void *)Y->dptr_;
-  int8_t *tmp_buf1 = NULL;
-  int8_t *tmp_buf2 = NULL;
   size_t size = getTensorSize(X1);
+
+  if ((x1_q < y_q) || (x2_q < y_q)) {
+    return ret;
+  }
+
   int32_t x1_is_psram = 0;
   int32_t x2_is_psram = 0;
   int32_t y_is_psram = 0;
   int32_t used_tmp_size = 0;
-  if (Temp)
-  {
-    tmp_buf1 = (int8_t *)Temp->dptr_;
-  }
-
   if ((1 == X1->mem_.type_ || 3 == X1->mem_.type_) &&
       (Temp))  // need copy psram to share
   {
@@ -48,55 +46,28 @@ int32_t iqadd_luna(tTensor *X1, tTensor *X2, tTensor *Temp, tTensor *Y) {
   }
 
   if (equalShape(&X1->shape_, &X2->shape_) && (X1->dtype_ == X2->dtype_)) {
-    int32_t scale1 = 1;
-    int32_t scale2 = 1;
-    int32_t shift1 = 0;
-    int32_t shift2 = 0;
-
-    if (x1_q > y_q)
-    {
-      shift1 = x1_q - y_q;
-    }
-    else
-    {
-      scale1 = (1 << (y_q - x1_q));
-    }
-    if (x2_q > y_q)
-    {
-      shift2 = x2_q - y_q;
-    }
-    else
-    {
-      scale2 = (1 << (y_q - x2_q));
-    }
+    int32_t shift1 = x1_q - y_q;
+    int32_t shift2 = x2_q - y_q;
     switch (X1->dtype_) {
       case Int8: {
         if (x1_is_psram){
           src1 = (int8_t *)Temp->dptr_;
           memcpy(src1, (void *)X1->dptr_, size * sizeof(int8_t));
-          if (x1_q != y_q){
-            ret = luna_scale_q7_int8((const q7_t *)src1, (scale1), (int8_t *)src1, size, shift1);
-          }
         }
-        else{
-          if (x1_q != y_q){
-            src1 = (int8_t *)Temp->dptr_;
-            ret = luna_scale_q7_int8((const q7_t *)X1->dptr_, (scale1), (int8_t *)src1, size, shift1);
-          }
+
+        if (x1_q != y_q){
+          ret = luna_scale_q7_int8((const q7_t *)src1, (1), (int8_t *)Temp->dptr_, size, shift1);
+          src1 = (int8_t *)Temp->dptr_;
         }
+
         if (x2_is_psram){
-          src2 = (int8_t *)Temp->dptr_ + (x1_is_psram) * size;
+          src2 = (int8_t *)Temp->dptr_ + ((x1_is_psram) || (x1_q != y_q)) * size;
           memcpy(src2, (void *)X2->dptr_, size * sizeof(int8_t));
+        }
         if (x2_q != y_q){
-          ret = luna_scale_q7_int8((const q7_t *)src2, (scale2), (int8_t *)src2, size, shift2);
+          ret = luna_scale_q7_int8((const q7_t *)src2, (1), (int8_t *)Temp->dptr_ + ((x1_is_psram) || (x1_q != y_q)) * size, size, shift2);
+          src2 = (int8_t *)Temp->dptr_ + ((x1_is_psram) || (x1_q != y_q)) * size;
         }
-        }
-        else{
-          if (x2_q != y_q){
-            src2 = (int8_t *)Temp->dptr_ + ((x1_is_psram) || (x1_q != y_q)) * size;
-            ret = luna_scale_q7_int8((const q7_t *)X2->dptr_, (scale2), (int8_t *)src2, size, shift2);
-          }
-        }
         if (y_is_psram){
           dst = (int8_t *)Temp->dptr_;
@@ -107,62 +78,6 @@ int32_t iqadd_luna(tTensor *X1, tTensor *X2, tTensor *Temp, tTensor *Y) {
         if (y_is_psram){
           memcpy((void *)Y->dptr_, dst, size * sizeof(int8_t));
         }
-
-        // if (Temp)
-        // {
-        //   tmp_buf2 = tmp_buf1 + size * sizeof(int8_t);
-        //   if (x1_is_psram)
-        //   {
-        //     memcpy(tmp_buf1, src1, size * sizeof(int8_t));
-        //     src1 = tmp_buf1;
-        //     used_tmp_size += size * sizeof(int8_t);
-        //   }
-        //   if (x2_is_psram)
-        //   {
-        //     memcpy(tmp_buf2, src2, size * sizeof(int8_t));
-        //     src2 = tmp_buf2;
-        //     used_tmp_size += size * sizeof(int8_t);
-        //   }
-        //   if (y_is_psram)
-        //   {
-        //     dst = tmp_buf1;
-        //   }
-        //   ret = luna_scale_q7_int8((const q7_t *)src1, (scale1), (int8_t *)tmp_buf1, size,
-        //                            shift1);
-        //   ret = luna_scale_q7_int8((const q7_t *)src2, (scale2), (int8_t *)tmp_buf2, size,
-        //                            shift2);
-        //   ret = luna_add_q7_int8((const q7_t *)tmp_buf1, (q7_t *)tmp_buf2, (int8_t *)dst,
-        //                          size, 0);
-        // }
-        // else
-        // {
-        //   ret = luna_scale_q7_int8((const q7_t *)src1, (scale1), (int8_t *)dst, size,
-        //                            shift1);
-        //   ret = luna_scale_q7_int8((const q7_t *)src2, (scale2), (int8_t *)src2, size,
-        //                            shift2);
-        //   ret = luna_add_q7_int8((const q7_t *)dst, (q7_t *)src2, (int8_t *)dst,
-        //                          size, 0);
-        // }
-        // if (y_is_psram)
-        // {
-        //   memcpy((void *)Y->dptr_, dst, size * sizeof(int8_t));
-        // }
-      } break;
-      case Int16: {
-        ret = luna_scale_q15_int16((const q15_t *)src1, (scale1), (int16_t *)dst,
-                                   size, shift1);
-        ret = luna_scale_q15_int16((const q15_t *)src2, (scale2), (int16_t *)src2,
-                                   size, shift2);
-        ret = luna_add_q15_int16((const q15_t *)dst, (q15_t *)src2,
-                                 (int16_t *)dst, size, 0);
-      } break;
-      case Int32: {
-        ret = luna_scale_q31_int32((const q31_t *)src1, (scale1), (int32_t *)dst,
-                                   size, shift1);
-        ret = luna_scale_q31_int32((const q31_t *)src2, (scale2), (int32_t *)src2,
-                                   size, shift2);
-        ret = luna_add_q31_int32((const q31_t *)dst, (q31_t *)src2,
-                                 (int32_t *)dst, size, 0);
       } break;
       default:
        break;
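The rewrite drops the old scale-or-shift pair in favor of a plain right shift of `x_q - y_q`, which is exactly why the new early return rejects `x1_q < y_q` and `x2_q < y_q`: aligning an input to a larger output Q would require a left shift. A self-contained toy of the staging-plus-rescale pattern, with plain loops standing in for `luna_scale_q7_int8`/`luna_add_q7_int8` (assumed semantics, not the real kernels):

```c
#include <stdint.h>
#include <stdio.h>

/* Stand-in for luna_scale_q7_int8 with scale == 1: align Q(x_q) data to
 * Q(y_q) by an arithmetic right shift of (x_q - y_q). */
static void rescale_q7(const int8_t *src, int8_t *dst, int n, int shift) {
  for (int i = 0; i < n; ++i) dst[i] = (int8_t)(src[i] >> shift);
}

int main(void) {
  int8_t x1[4] = {64, -32, 16, 8}; /* Q6: 1.0, -0.5, 0.25, 0.125 */
  int8_t x2[4] = {32, 32, 32, 32}; /* Q5: 1.0 each               */
  int8_t tmp[8];                   /* scratch: slot 0 = x1, slot 1 = x2,
                                      mirroring the offset arithmetic above */
  int8_t y[4];                     /* output in Q4 */
  int x1_q = 6, x2_q = 5, y_q = 4;

  rescale_q7(x1, tmp, 4, x1_q - y_q);     /* x1 into the first slot  */
  rescale_q7(x2, tmp + 4, 4, x2_q - y_q); /* x2 into the second slot */
  for (int i = 0; i < 4; ++i) y[i] = (int8_t)(tmp[i] + tmp[4 + i]);

  printf("%d %d %d %d\n", y[0], y[1], y[2], y[3]); /* 32 8 20 18 (Q4) */
  return 0;
}
```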
diff --git a/thinker/executor/core/ops/venus/iqsub.h b/thinker/executor/core/ops/venus/iqsub.h
index 7e97e02..2c9e681 100644
--- a/thinker/executor/core/ops/venus/iqsub.h
+++ b/thinker/executor/core/ops/venus/iqsub.h
@@ -22,18 +22,27 @@ int32_t iqsub_luna(tTensor *X1, tTensor *X2, tTensor *Temp, tTensor *Y) {
     return ret;
   }
 
+  int32_t x1_is_psram = 0;
+  int32_t x2_is_psram = 0;
+  int32_t y_is_psram = 0;
+  int32_t used_tmp_size = 0;
+
   if ((1 == X1->mem_.type_ || 3 == X1->mem_.type_) &&
       (Temp))  // need copy psram to share
   {
-    src1 = (void *)Temp->dptr_;
-    memcpy(src1, (void *)X1->dptr_, size * X1->byte_);
+    x1_is_psram = 1;
   }
 
   if ((1 == X2->mem_.type_ || 3 == X2->mem_.type_) &&
       (Temp))  // need copy psram to share
   {
-    src2 = (void *)Temp->dptr_;
-    memcpy(src2, (void *)X2->dptr_, size * X2->byte_);
+    x2_is_psram = 1;
+  }
+
+  if ((1 == Y->mem_.type_ || 3 == Y->mem_.type_) &&
+      (Temp))  // need copy psram to share
+  {
+    y_is_psram = 1;
   }
 
   if (equalShape(&X1->shape_, &X2->shape_) && (X1->dtype_ == X2->dtype_)) {
@@ -41,28 +50,35 @@ int32_t iqsub_luna(tTensor *X1, tTensor *X2, tTensor *Temp, tTensor *Y) {
    int32_t shift1 = x1_q - y_q;
    int32_t shift2 = x2_q - y_q;
    switch (X1->dtype_) {
      case Int8: {
-        ret = luna_scale_q7_int8((const q7_t *)src1, (1), (int8_t *)dst, size,
-                                 shift1);
-        ret = luna_scale_q7_int8((const q7_t *)src2, (1), (int8_t *)src2, size,
-                                 shift2);
+        if (x1_is_psram){
+          src1 = (int8_t *)Temp->dptr_;
+          memcpy(src1, (void *)X1->dptr_, size * sizeof(int8_t));
+        }
+        if (x1_q != y_q){
+          ret = luna_scale_q7_int8((const q7_t *)src1, (1), (int8_t *)Temp->dptr_, size, shift1);
+          src1 = (int8_t *)Temp->dptr_;
+        }
+
+        if (x2_is_psram){
+          src2 = (int8_t *)Temp->dptr_ + ((x1_is_psram) || (x1_q != y_q)) * size;
+          memcpy(src2, (void *)X2->dptr_, size * sizeof(int8_t));
+        }
+        if (x2_q != y_q){
+          ret = luna_scale_q7_int8((const q7_t *)src2, (1), (int8_t *)Temp->dptr_ + ((x1_is_psram) || (x1_q != y_q)) * size, size, shift2);
+          src2 = (int8_t *)Temp->dptr_ + ((x1_is_psram) || (x1_q != y_q)) * size;
+        }
+
+        if (y_is_psram){
+          dst = (int8_t *)Temp->dptr_;
+        }
+
-        ret = luna_sub_q7_int8((const q7_t *)dst, (q7_t *)src2, (int8_t *)dst, size, 0);
+        ret = luna_sub_q7_int8((const q7_t *)src1, (q7_t *)src2, (int8_t *)dst, size, 0);
-      } break;
-      case Int16: {
-        ret = luna_scale_q15_int16((const q15_t *)src1, (1), (int16_t *)dst,
-                                   size, shift1);
-        ret = luna_scale_q15_int16((const q15_t *)src2, (1), (int16_t *)src2,
-                                   size, shift2);
-        ret = luna_sub_q15_int16((const q15_t *)dst, (q15_t *)src2,
-                                 (int16_t *)dst, size, 0);
-      } break;
-      case Int32: {
-        ret = luna_scale_q31_int32((const q31_t *)src1, (1), (int32_t *)dst,
-                                   size, shift1);
-        ret = luna_scale_q31_int32((const q31_t *)src2, (1), (int32_t *)src2,
-                                   size, shift2);
-        ret = luna_sub_q31_int32((const q31_t *)dst, (q31_t *)src2,
-                                 (int32_t *)dst, size, 0);
+
+        if (y_is_psram){
+          memcpy((void *)Y->dptr_, dst, size * sizeof(int8_t));
+        }
+
      } break;
      default:
        break;
diff --git a/thinker/resource_packer/ops/iqSub.py b/thinker/resource_packer/ops/iqSub.py
index eb70ff5..0f6cb1c 100644
--- a/thinker/resource_packer/ops/iqSub.py
+++ b/thinker/resource_packer/ops/iqSub.py
@@ -1,11 +1,33 @@
 import math
+import numpy as np
+from typing import List
+
+from ...graph import Tensor
-from ...enum_defines import DevType
+from ...enum_defines import DevType, MemType
 from .base import iqBinaryOperator, register_op
 
 
 @register_op
 class iqSub(iqBinaryOperator):
-    pass
+    def get_workspace(self, dev_type: DevType) -> List[Tensor]:
+        x1 = self.inputs[0]
+        x2 = self.inputs[1]
+        size = x1.nbytes
+        Y = self.outputs[0]
+
+        scale_x = self.attrs["scale_x"]
+        scale_y = self.attrs["scale_y"]
+        scale_o = self.attrs["scale_o"]
+
+        workspace_size = 0
+        if (scale_x != scale_o) or x1.mem_type != MemType.SHARE_MEM:
+            workspace_size += size
+        if (scale_y != scale_o) or x2.mem_type != MemType.SHARE_MEM:
+            workspace_size += size
+        if Y.mem_type != MemType.SHARE_MEM:
+            workspace_size = max(workspace_size, size)
+
+        max_workspace = Tensor.from_shape([workspace_size], np.int8, MemType.SHARE_MEM)
+        return [max_workspace]
 
 
 __all__ = ["iqSub"]
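`get_workspace` sizes the shared-memory scratch to match the executor's staging: one `size`-byte slot for each input that must be copied in from PSRAM or rescaled, and at least one slot when the output itself lives outside shared memory, so the result can be assembled there before the final copy back. The same sizing rule, sketched in C with illustrative names (this helper does not exist in the repo):

```c
#include <stddef.h>

/* Illustrative only: scratch bytes needed by the iqSub staging scheme.
 * "needs_slot" means the input is in PSRAM or its Q differs from the
 * output's Q; "y_off_share" means the output is not in shared memory. */
static size_t iqsub_workspace_bytes(size_t size, int x1_needs_slot,
                                    int x2_needs_slot, int y_off_share) {
  size_t ws = 0;
  if (x1_needs_slot) ws += size;
  if (x2_needs_slot) ws += size;
  if (y_off_share && ws < size) ws = size; /* max(workspace_size, size) */
  return ws;
}
```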