diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0a19014897..01d19de732 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -285,7 +285,7 @@ add_subdirectory(src)
 add_subdirectory(examples)
 
 if(BUILD_TEST)
-    add_subdirectory(tests)
+    add_subdirectory(tests/csrc)
 endif()
 
 # # Mesaure the compile time
diff --git a/src/turbomind/kernels/logprob_kernels.cu b/src/turbomind/kernels/logprob_kernels.cu
index 20474a7ab2..c94c4f45be 100644
--- a/src/turbomind/kernels/logprob_kernels.cu
+++ b/src/turbomind/kernels/logprob_kernels.cu
@@ -182,29 +182,29 @@ void invokeLogProbFromLogits(float*       cum_log_probs,
         cum_log_probs, log_probs, input_lengths, max_input_length, batch_size, batch_first);
 }
 
-// template void invokeLogProbFromLogits(float*       cum_log_probs,
-//                                       const float* logits,
-//                                       const int*   input_ids,
-//                                       const int*   input_lengths,
-//                                       const size_t max_input_length,
-//                                       const size_t batch_size,
-//                                       const size_t vocab_size,
-//                                       const size_t vocab_size_padded,
-//                                       void*        workspace,
-//                                       const size_t workspace_size,
-//                                       cudaStream_t stream,
-//                                       const bool   batch_first);
-
-// template void invokeLogProbFromLogits(float*       cum_log_probs,
-//                                       const half*  logits,
-//                                       const int*   input_ids,
-//                                       const int*   input_lengths,
-//                                       const size_t max_input_length,
-//                                       const size_t batch_size,
-//                                       const size_t vocab_size,
-//                                       const size_t vocab_size_padded,
-//                                       void*        workspace,
-//                                       const size_t workspace_size,
-//                                       cudaStream_t stream,
-//                                       const bool   batch_first);
+template void invokeLogProbFromLogits(float*       cum_log_probs,
+                                      const float* logits,
+                                      const int*   input_ids,
+                                      const int*   input_lengths,
+                                      const size_t max_input_length,
+                                      const size_t batch_size,
+                                      const size_t vocab_size,
+                                      const size_t vocab_size_padded,
+                                      void*        workspace,
+                                      const size_t workspace_size,
+                                      cudaStream_t stream,
+                                      const bool   batch_first);
+
+template void invokeLogProbFromLogits(float*       cum_log_probs,
+                                      const half*  logits,
+                                      const int*   input_ids,
+                                      const int*   input_lengths,
+                                      const size_t max_input_length,
+                                      const size_t batch_size,
+                                      const size_t vocab_size,
+                                      const size_t vocab_size_padded,
+                                      void*        workspace,
+                                      const size_t workspace_size,
+                                      cudaStream_t stream,
+                                      const bool   batch_first);
 }  // end of namespace turbomind
diff --git a/tests/CMakeLists.txt b/tests/csrc/CMakeLists.txt
similarity index 96%
rename from tests/CMakeLists.txt
rename to tests/csrc/CMakeLists.txt
index 37d971daed..61a9b7383d 100644
--- a/tests/CMakeLists.txt
+++ b/tests/csrc/CMakeLists.txt
@@ -15,6 +15,5 @@
 add_subdirectory(unittests)
 if(BUILD_PYT)
     add_subdirectory(gemm_dequantize)
-    add_subdirectory(moe)
     add_subdirectory(int8_gemm)
 endif()
diff --git a/tests/gemm_dequantize/CMakeLists.txt b/tests/csrc/gemm_dequantize/CMakeLists.txt
similarity index 100%
rename from tests/gemm_dequantize/CMakeLists.txt
rename to tests/csrc/gemm_dequantize/CMakeLists.txt
diff --git a/tests/gemm_dequantize/th_gemm_dequantize.cc b/tests/csrc/gemm_dequantize/th_gemm_dequantize.cc
similarity index 100%
rename from tests/gemm_dequantize/th_gemm_dequantize.cc
rename to tests/csrc/gemm_dequantize/th_gemm_dequantize.cc
diff --git a/tests/gemm_dequantize/th_gemm_dequantize.py b/tests/csrc/gemm_dequantize/th_gemm_dequantize.py
similarity index 100%
rename from tests/gemm_dequantize/th_gemm_dequantize.py
rename to tests/csrc/gemm_dequantize/th_gemm_dequantize.py
diff --git a/tests/int8_gemm/CMakeLists.txt b/tests/csrc/int8_gemm/CMakeLists.txt
similarity index 100%
rename from tests/int8_gemm/CMakeLists.txt
rename to tests/csrc/int8_gemm/CMakeLists.txt
diff --git a/tests/int8_gemm/int8_gemm_test.cu b/tests/csrc/int8_gemm/int8_gemm_test.cu
similarity index 100%
rename from tests/int8_gemm/int8_gemm_test.cu
rename to tests/csrc/int8_gemm/int8_gemm_test.cu
diff --git a/tests/unittests/CMakeLists.txt b/tests/csrc/unittests/CMakeLists.txt
similarity index 87%
rename from tests/unittests/CMakeLists.txt
rename to tests/csrc/unittests/CMakeLists.txt
index 5531ffca6a..101bea0da2 100644
--- a/tests/unittests/CMakeLists.txt
+++ b/tests/csrc/unittests/CMakeLists.txt
@@ -42,7 +42,7 @@ target_compile_features(unittest PRIVATE cxx_std_14)
 target_link_libraries(  # Libs for test_attention_kernels
   unittest PUBLIC
     -lcudart -lcurand
-    gen_relative_pos_bias gpt_kernels gtest memory_utils tensor unfused_attention_kernels cuda_utils logger)
+    gpt_kernels gtest memory_utils tensor unfused_attention_kernels cuda_utils logger)
 target_link_libraries(  # Libs for test_logprob_kernels
   unittest PUBLIC
     -lcudart
@@ -50,7 +50,7 @@ target_link_libraries(  # Libs for test_logprob_kernels
 target_link_libraries(  # Libs for test_penalty_kernels
   unittest PUBLIC
     -lcublas -lcublasLt -lcudart
-    sampling_penalty_kernels beam_search_penalty_kernels memory_utils cuda_utils logger)
+    sampling_penalty_kernels memory_utils cuda_utils logger)
 target_link_libraries(  # Libs for test_sampling_kernel
   unittest PUBLIC
     -lcudart
@@ -71,11 +71,6 @@ add_executable(test_gpt_kernels test_gpt_kernels.cu)
 target_link_libraries(test_gpt_kernels PUBLIC
                       gpt_kernels memory_utils tensor cuda_utils logger)
 
-add_executable(test_activation test_activation.cu)
-target_link_libraries(test_activation PUBLIC
-                    -lcublas -lcublasLt -lcudart
-                    activation_kernels memory_utils cuda_utils logger)
-
 add_executable(test_context_attention_layer test_context_attention_layer.cu)
 target_link_libraries(test_context_attention_layer PUBLIC
                       Llama -lcublas -lcublasLt -lcudart
diff --git a/tests/unittests/gtest_utils.h b/tests/csrc/unittests/gtest_utils.h
similarity index 100%
rename from tests/unittests/gtest_utils.h
rename to tests/csrc/unittests/gtest_utils.h
diff --git a/tests/unittests/test_attention_kernels.cu b/tests/csrc/unittests/test_attention_kernels.cu
similarity index 66%
rename from tests/unittests/test_attention_kernels.cu
rename to tests/csrc/unittests/test_attention_kernels.cu
index c36de79667..f2a869381e 100644
--- a/tests/unittests/test_attention_kernels.cu
+++ b/tests/csrc/unittests/test_attention_kernels.cu
@@ -14,13 +14,13 @@
  * limitations under the License.
  */
 
-#include "src/turbomind/kernels/gen_relative_pos_bias.h"
+
 #include "src/turbomind/kernels/gpt_kernels.h"
 #include "src/turbomind/kernels/unfused_attention_kernels.h"
 #include "src/turbomind/utils/Tensor.h"
 #include "src/turbomind/utils/memory_utils.h"
 #include "src/turbomind/utils/nccl_utils.h"
-#include "tests/unittests/gtest_utils.h"
+#include "gtest_utils.h"
 
 #include <curand.h>
 #include <sstream>
@@ -333,121 +333,6 @@ public:
             EXPECT_TRUE(passed);
         }
     }
-
-    void runTestAlibiMaskedSoftmax(AttentionKernelTestParam param, bool is_benchmark = false)
-    {
-        DataType dtype = getTensorType<T>();
-
-        std::vector<size_t> qk_shape{param.batch_size, param.head_num, param.q_length, param.k_length};
-
-        bool use_fp32_qk = param.use_fp32_qk_buf && dtype != TYPE_FP32;
-
-        Tensor qk           = createTensor(MEMORY_GPU, dtype, qk_shape);
-        Tensor qk_fp32      = use_fp32_qk ? createTensor(MEMORY_GPU, TYPE_FP32, qk_shape) : Tensor();
-        Tensor attn_mask    = randomAttentionMask({param.batch_size, 1, param.q_length, param.k_length});
-        Tensor alibi_slopes = createTensor(MEMORY_GPU, dtype, {param.head_num});
-
-        // Input random initialization
-        if (param.use_fp32_qk_buf && dtype != TYPE_FP32) {
-            utils::normal<float>(curng, qk_fp32);
-        }
-        else {
-            utils::normal<T>(curng, qk);
-        }
-        invokeBuildAlibiSlopes(alibi_slopes.getPtr<T>(), param.head_num, stream);
-        sync_check_cuda_error();
-
-        Tensor h_alibi_slopes = createTensor(MEMORY_CPU, dtype, {param.head_num});
-        Tensor h_alibi_bias =
-            is_benchmark ? Tensor() : createTensor(MEMORY_CPU, dtype, {param.head_num, param.q_length, param.k_length});
-        // The nearest power of 2 equal to / smaller than num_heads followed by HF's implementation.
-        T*  alibi_slope_ptr = h_alibi_slopes.getPtr<T>();
-        int num_heads_pow2  = utils::pow2_rounddown(param.head_num);
-        for (size_t h = 0; h < param.head_num; ++h) {
-            // The slope of linear bias of the attention head
-            if (h < num_heads_pow2) {
-                alibi_slope_ptr[h] = static_cast<T>(powf(powf(0.5f, powf(0.5f, log2f(num_heads_pow2) - 3.f)), h + 1));
-            }
-            else {
-                alibi_slope_ptr[h] = static_cast<T>(
-                    powf(powf(0.5f, powf(0.5f, log2f(num_heads_pow2 << 1) - 3.f)), (h - num_heads_pow2) * 2 + 1));
-            }
-            if (h_alibi_bias.size() > 0) {
-                T* alibi_bias_ptr = h_alibi_bias.getPtr<T>();
-                for (size_t qi = 0; qi < param.q_length; ++qi) {
-                    for (size_t ki = 0; ki < param.k_length; ++ki) {
-                        size_t hqk_idx          = (h * param.q_length + qi) * param.k_length + ki;
-                        alibi_bias_ptr[hqk_idx] = ::math::mul(alibi_slope_ptr[h], T(0.0f + ki - qi));
-                    }
-                }
-            }
-        }
-        EXPECT_TRUE(
-            checkResult("CheckAlibiSlopes", alibi_slopes.getPtr<T>(), h_alibi_slopes.getPtr<T>(), param.head_num));
-
-        // Clone to host for reference computation if needed.
-        Tensor h_qk        = is_benchmark ? Tensor() : toHost<T>(qk);
-        Tensor h_attn_mask = is_benchmark ? Tensor() : toHost<T>(attn_mask);
-        Tensor h_qk_fp32   = is_benchmark ? Tensor() : toHost<float>(qk_fp32);
-
-        T scale = static_cast<T>(1 / sqrtf(param.size_per_head * 1.0f));
-
-        if (param.use_fp32_qk_buf && dtype != TYPE_FP32) {
-            MaskedSoftmaxParam<T, float> softmax_param;
-            softmax_param.attention_score    = qk.getPtr<T>();
-            softmax_param.qk                 = qk_fp32.getPtr<float>();
-            softmax_param.attention_mask     = attn_mask.getPtr<T>();
-            softmax_param.linear_bias_slopes = alibi_slopes.getPtr<T>();
-            softmax_param.batch_size         = param.batch_size;
-            softmax_param.num_heads          = param.head_num;
-            softmax_param.q_length           = param.q_length;
-            softmax_param.k_length           = param.k_length;
-            softmax_param.qk_scale           = scale;
-            invokeMaskedSoftmax(softmax_param, stream);
-            sync_check_cuda_error();
-        }
-        else {
-            MaskedSoftmaxParam<T, T> softmax_param;
-            softmax_param.attention_score    = qk.getPtr<T>();
-            softmax_param.qk                 = qk.getPtr<T>();
-            softmax_param.attention_mask     = attn_mask.getPtr<T>();
-            softmax_param.linear_bias_slopes = alibi_slopes.getPtr<T>();
-            softmax_param.batch_size         = param.batch_size;
-            softmax_param.num_heads          = param.head_num;
-            softmax_param.q_length           = param.q_length;
-            softmax_param.k_length           = param.k_length;
-            softmax_param.qk_scale           = scale;
-            invokeMaskedSoftmax(softmax_param, stream);
-            sync_check_cuda_error();
-        }
-
-        if (!is_benchmark) {
-            if (use_fp32_qk) {
-                computeQkSoftmax(h_qk.getPtr<T>(),
-                                 h_qk_fp32.getPtr<T>(),
-                                 h_attn_mask.getPtr<T>(),
-                                 h_alibi_bias.getPtr<T>(),
-                                 param.batch_size,
-                                 param.head_num,
-                                 param.q_length,
-                                 param.k_length,
-                                 scale);
-            }
-            else {
-                computeQkSoftmax(h_qk.getPtr<T>(),
-                                 h_qk.getPtr<T>(),
-                                 h_attn_mask.getPtr<T>(),
-                                 h_alibi_bias.getPtr<T>(),
-                                 param.batch_size,
-                                 param.head_num,
-                                 param.q_length,
-                                 param.k_length,
-                                 scale);
-            }
-            bool passed = checkResult("AlibiMaskedSoftmax", qk.getPtr<T>(), h_qk.getPtr<T>(), qk.size());
-            EXPECT_TRUE(passed);
-        }
-    }
 };
 
 TYPED_TEST_SUITE(AttentionKernelTest, SupportTypes);
@@ -511,48 +396,4 @@ TYPED_TEST(AttentionKernelTest, Benchmark_MaskedSoftmax_LongSequence4096)
     this->runTestMaskedSoftmax({8, 4096, 4096, 14, 128, false, 0, false, true}, true);
 }
 
-TYPED_TEST(AttentionKernelTest, AlibiMaskedSoftmax_ShortSequence1)
-{
-    this->runTestAlibiMaskedSoftmax({1, 12, 12, 4, 32, false, 0, false});
-}
-
-TYPED_TEST(AttentionKernelTest, AlibiMaskedSoftmax_ShortSequence2)
-{
-    // q_length is not multiple of 4.
-    this->runTestAlibiMaskedSoftmax({1, 11, 11, 4, 32, false, 0, false});
-}
-
-TYPED_TEST(AttentionKernelTest, AlibiMaskedSoftmax_ShortSequence_HasPrompt1)
-{
-    this->runTestAlibiMaskedSoftmax({1, 12, 20, 4, 32, false, 0, false});
-}
-
-TYPED_TEST(AttentionKernelTest, AlibiMaskedSoftmax_ShortSequence_HasPrompt2)
-{
-    // q_length is not multiple of 4.
-    this->runTestAlibiMaskedSoftmax({1, 11, 20, 4, 32, false, 0, false});
-}
-
-// Tests for long sentence generation. Assume the bloom 176B model with 8 TP.
-
-TYPED_TEST(AttentionKernelTest, Benchmark_AlibiMaskedSoftmax_LongSequence1024)
-{
-    this->runTestAlibiMaskedSoftmax({8, 1024, 1024, 14, 128, false, 0, false, true}, true);
-}
-
-TYPED_TEST(AttentionKernelTest, Benchmark_AlibiMaskedSoftmax_LongSequence2048)
-{
-    this->runTestAlibiMaskedSoftmax({8, 2048, 2048, 14, 128, false, 0, false, true}, true);
-}
-
-TYPED_TEST(AttentionKernelTest, Benchmark_AlibiMaskedSoftmax_LongSequence3072)
-{
-    this->runTestAlibiMaskedSoftmax({8, 3072, 3072, 14, 128, false, 0, false, true}, true);
-}
-
-TYPED_TEST(AttentionKernelTest, Benchmark_AlibiMaskedSoftmax_LongSequence4096)
-{
-    this->runTestAlibiMaskedSoftmax({4, 4096, 4096, 14, 128, false, 0, false, true}, true);
-}
-
 }  // end of namespace
diff --git a/tests/unittests/test_context_attention_layer.cu b/tests/csrc/unittests/test_context_attention_layer.cu
similarity index 99%
rename from tests/unittests/test_context_attention_layer.cu
rename to tests/csrc/unittests/test_context_attention_layer.cu
index cf4e30ea33..66ca250019 100644
--- a/tests/unittests/test_context_attention_layer.cu
+++ b/tests/csrc/unittests/test_context_attention_layer.cu
@@ -360,6 +360,7 @@ int main(int argc, const char* argv[])
                                              .out_accum    = accum_buf_ptr,
                                              .cu_seqlens_q = cu_seqlens_ptr,
                                              .cu_seqlens_k = nullptr,
+                                             .group_size   = 1,
                                              .layout_q     = layout_q,
                                              .layout_k     = layout_k,
                                              .layout_v     = layout_v,
diff --git a/tests/unittests/test_gemm.cu b/tests/csrc/unittests/test_gemm.cu
similarity index 100%
rename from tests/unittests/test_gemm.cu
rename to tests/csrc/unittests/test_gemm.cu
diff --git a/tests/unittests/test_gpt_kernels.cu b/tests/csrc/unittests/test_gpt_kernels.cu
similarity index 100%
rename from tests/unittests/test_gpt_kernels.cu
rename to tests/csrc/unittests/test_gpt_kernels.cu
diff --git a/tests/unittests/test_int8.cu b/tests/csrc/unittests/test_int8.cu
similarity index 98%
rename from tests/unittests/test_int8.cu
rename to tests/csrc/unittests/test_int8.cu
index 8813b01be6..96f0daeade 100644
--- a/tests/unittests/test_int8.cu
+++ b/tests/csrc/unittests/test_int8.cu
@@ -14,7 +14,7 @@
 #include <iostream>
 #include <random>
 
-#include "tests/unittests/gtest_utils.h"
+#include "gtest_utils.h"
 
 using namespace turbomind;
 
diff --git a/tests/unittests/test_logprob_kernels.cu b/tests/csrc/unittests/test_logprob_kernels.cu
similarity index 99%
rename from tests/unittests/test_logprob_kernels.cu
rename to tests/csrc/unittests/test_logprob_kernels.cu
index aee090c2ef..242b31ccf8 100644
--- a/tests/unittests/test_logprob_kernels.cu
+++ b/tests/csrc/unittests/test_logprob_kernels.cu
@@ -12,7 +12,7 @@
 #include "src/turbomind/utils/logger.h"
 #include "src/turbomind/utils/memory_utils.h"
 
-#include "tests/unittests/gtest_utils.h"
+#include "gtest_utils.h"
 
 using namespace turbomind;
 
diff --git a/tests/unittests/test_penalty_kernels.cu b/tests/csrc/unittests/test_penalty_kernels.cu
similarity index 99%
rename from tests/unittests/test_penalty_kernels.cu
rename to tests/csrc/unittests/test_penalty_kernels.cu
index 5a075d6028..3ddb1ab9f3 100644
--- a/tests/unittests/test_penalty_kernels.cu
+++ b/tests/csrc/unittests/test_penalty_kernels.cu
@@ -27,14 +27,11 @@
 #include <cublasLt.h>
 #include <cuda_runtime.h>
 
-#include "src/turbomind/kernels/beam_search_penalty_kernels.h"
 #include "src/turbomind/kernels/penalty_types.h"
 #include "src/turbomind/kernels/sampling_penalty_kernels.h"
 #include "src/turbomind/utils/cuda_utils.h"
 #include "src/turbomind/utils/memory_utils.h"
-
-// #include "tests/unittests/unittest_utils.h"
-#include "tests/unittests/gtest_utils.h"
+#include "gtest_utils.h"
 
 using namespace turbomind;
 
diff --git a/tests/unittests/test_sampling.cu b/tests/csrc/unittests/test_sampling.cu
similarity index 100%
rename from tests/unittests/test_sampling.cu
rename to tests/csrc/unittests/test_sampling.cu
diff --git a/tests/unittests/test_sampling_kernels.cu b/tests/csrc/unittests/test_sampling_kernels.cu
similarity index 99%
rename from tests/unittests/test_sampling_kernels.cu
rename to tests/csrc/unittests/test_sampling_kernels.cu
index 370bbcfdf1..041b70d72d 100644
--- a/tests/unittests/test_sampling_kernels.cu
+++ b/tests/csrc/unittests/test_sampling_kernels.cu
@@ -19,7 +19,7 @@
 #include "src/turbomind/utils/cuda_utils.h"
 #include "src/turbomind/utils/memory_utils.h"
 
-#include "tests/unittests/gtest_utils.h"
+#include "gtest_utils.h"
 
 using namespace turbomind;
 
diff --git a/tests/unittests/test_sampling_layer.cu b/tests/csrc/unittests/test_sampling_layer.cu
similarity index 99%
rename from tests/unittests/test_sampling_layer.cu
rename to tests/csrc/unittests/test_sampling_layer.cu
index b95fcb186e..6b8bcce7c5 100644
--- a/tests/unittests/test_sampling_layer.cu
+++ b/tests/csrc/unittests/test_sampling_layer.cu
@@ -17,8 +17,7 @@
 #include "src/turbomind/utils/memory_utils.h"
 #include "src/turbomind/utils/Tensor.h"
 
-// #include "tests/unittests/unittest_utils.h"
-#include "tests/unittests/gtest_utils.h"
+#include "gtest_utils.h"
 
 using namespace turbomind;
 
diff --git a/tests/unittests/test_tensor.cu b/tests/csrc/unittests/test_tensor.cu
similarity index 100%
rename from tests/unittests/test_tensor.cu
rename to tests/csrc/unittests/test_tensor.cu
diff --git a/tests/unittests/unittest_utils.h b/tests/csrc/unittests/unittest_utils.h
similarity index 100%
rename from tests/unittests/unittest_utils.h
rename to tests/csrc/unittests/unittest_utils.h
diff --git a/tests/unittests/test_activation.cu b/tests/unittests/test_activation.cu
deleted file mode 100644
index 767cc1a3b9..0000000000
--- a/tests/unittests/test_activation.cu
+++ /dev/null
@@ -1,154 +0,0 @@
-#include <iostream>   // snprintf
-#include <string>     // std::string
-#include <vector>     // std::vector
-
-#include "src/turbomind/kernels/activation_kernels.h"
-#include "src/turbomind/utils/cuda_utils.h"
-#include "src/turbomind/utils/memory_utils.h"
-#include "src/turbomind/utils/logger.h"
-
-#include "unittest_utils.h"
-
-using namespace turbomind;
-
-#define PRINT_LIMIT 16
-#define EPSILON (1e-20)
-#define EPSILON_FP16 (1e-10)
-
-struct TestCase {
-    std::string name;
-    size_t m;
-    size_t n;
-    size_t ite;
-
-    std::string toString()
-    {
-        char buf[100];
-        snprintf(buf, sizeof(buf), "TestCase[name=%s, m=%ld, n=%ld]", name.c_str(), m, n);
-        return buf;
-    }
-
-    void print()
-    {
-        TM_LOG_INFO(toString());
-    }
-};
-
-template<typename T>
-void testActivationKernel(TestCase tc)
-{
-    const int m = tc.m;
-    const int n = tc.n;
-    cudaStream_t stream;
-    cudaStreamCreate(&stream);
-
-    T *output_baseline, *output_opt1, *bias;
-    deviceMalloc(&output_baseline, m * n);
-    deviceMalloc(&output_opt1, m * n);
-    deviceMalloc(&bias, n);
-    cudaD2Dcpy(output_opt1, output_baseline, m * n);
-    invokeGenericActivation<GeluActivation>(output_baseline,
-                                            (const T*) bias,
-                                            (const T*) nullptr,
-                                            (const T*) nullptr,
-                                            (const int*) nullptr,
-                                            (const T*) nullptr,
-                                            m,
-                                            n,
-                                            0,
-                                            (const float*) nullptr,
-                                            (const float*) nullptr,
-                                            stream);
-    invokeAddBiasGeluV2(output_opt1, bias, (const int*) nullptr, (const T*) nullptr, m, n, stream);
-    bool passed = checkResult(tc.name, output_baseline, output_opt1, m * n, true, true);
-    FT_CHECK(passed);
-
-    const int ite = tc.ite;
-    CudaTimer cuda_timer_baseline(stream);
-    // warmup
-    for (int i = 0; i < ite; i++) {
-        invokeGenericActivation<GeluActivation>(output_baseline,
-                                                (const T*) bias,
-                                                (const T*) nullptr,
-                                                (const T*) nullptr,
-                                                (const int*) nullptr,
-                                                (const T*) nullptr,
-                                                m,
-                                                n,
-                                                0,
-                                                (const float*) nullptr,
-                                                (const float*) nullptr,
-                                                stream);
-    }
-    cuda_timer_baseline.start();
-    for (int i = 0; i < ite; i++) {
-        invokeGenericActivation<GeluActivation>(output_baseline,
-                                                (const T*) bias,
-                                                (const T*) nullptr,
-                                                (const T*) nullptr,
-                                                (const int*) nullptr,
-                                                (const T*) nullptr,
-                                                m,
-                                                n,
-                                                0,
-                                                (const float*) nullptr,
-                                                (const float*) nullptr,
-                                                stream);
-    }
-    float total_time_baseline = cuda_timer_baseline.stop();
-
-    CudaTimer cuda_timer_opt(stream);
-    // warmup
-    for (int i = 0; i < ite; i++) {
-        invokeAddBiasGeluV2(output_baseline, bias, (const int*) nullptr, (const T*) nullptr, m, n, stream);
-    }
-    cuda_timer_opt.start();
-    for (int i = 0; i < ite; i++) {
-        invokeAddBiasGeluV2(output_baseline, bias, (const int*) nullptr, (const T*) nullptr, m, n, stream);
-    }
-    float total_time_opt = cuda_timer_opt.stop();
-    TM_LOG_INFO("%s baseline_time: %f us, opt_time: %f us, speedup: %f (ite: %d)",
-                tc.toString().c_str(),
-                total_time_baseline / ite * 1000.f,
-                total_time_opt / ite * 1000.f,
-                total_time_baseline / total_time_opt,
-                ite);
-
-    deviceFree(output_baseline);
-    deviceFree(output_opt1);
-    deviceFree(bias);
-}
-
-int main()
-{
-    printf("[INFO] Device: %s \n", getDeviceName().c_str());
-    std::vector<TestCase> test_cases{
-        // TC: name / m / n
-        TestCase{"addBiasGelu", 32, 1024, 1000},
-        TestCase{"addBiasGelu", 128, 1024, 1000},
-        TestCase{"addBiasGelu", 2048, 1024, 1000},
-        TestCase{"addBiasGelu", 32, 3072, 1000},
-        TestCase{"addBiasGelu", 128, 3072, 1000},
-        TestCase{"addBiasGelu", 2048, 3072, 1000},
-        TestCase{"addBiasGelu", 32, 4096, 1000},
-        TestCase{"addBiasGelu", 128, 4096, 1000},
-        TestCase{"addBiasGelu", 2048, 4096, 1000},
-        TestCase{"addBiasGelu", 32, 8192, 1000},
-        TestCase{"addBiasGelu", 128, 8192, 1000},
-        TestCase{"addBiasGelu", 2048, 8192, 1000},
-        TestCase{"addBiasGelu", 32, 49152, 1000},
-        TestCase{"addBiasGelu", 128, 49152, 1000},
-        TestCase{"addBiasGelu", 2048, 49152, 1000},
-        TestCase{"addBiasGelu", 32, 81920, 1000},
-        TestCase{"addBiasGelu", 128, 81920, 1000},
-        TestCase{"addBiasGelu", 2048, 81920, 1000},
-    };
-
-    for (auto& tc : test_cases) {
-        // testActivationKernel<float>(tc);
-        testActivationKernel<half>(tc);
-    }
-    TM_LOG_INFO("testActivationKernel done");
-
-    return 0;
-}