diff --git a/.gitmodules b/.gitmodules
index 470cf466..578e24f9 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,9 +1,12 @@
+[submodule "third_party/googletest"]
+	path = third_party/googletest
+	url = https://github.com/google/googletest.git
 [submodule "third_party/glog"]
 	path = third_party/glog
-	url = git@github.com:google/glog.git
+	url = https://github.com/google/glog.git
 [submodule "third_party/gflags"]
 	path = third_party/gflags
-	url = git@github.com:gflags/gflags.git
+	url = https://github.com/gflags/gflags.git
 [submodule "third_party/eigen"]
 	path = third_party/eigen
-	url = git@github.com:InfiniTensor/eigen-mirror.git
+	url = https://github.com/eigenteam/eigen-git-mirror.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index df636b27..22dcf791 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,6 +4,7 @@ option(USE_CUDA "Support NVIDIA CUDA" OFF)
 option(PROFILE_MODE "ENABLE PROFILE MODE" OFF)
 option(USE_OMP "Use OpenMP as backend for Eigen" ON)
 option(USE_NCCL "Build project for distributed running" ON)
+option(BUILD_TEST "Build InfiniTrain tests" OFF)
 
 project(infini_train VERSION 0.5.0 LANGUAGES CXX)
 
@@ -14,6 +15,19 @@ set(CMAKE_CXX_EXTENSIONS OFF)
 # Generate compile_commands.json
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
+# ------------------------------------------------------------------------------
+# GoogleTest (submodule)
+# ------------------------------------------------------------------------------
+if(BUILD_TEST)
+    if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/third_party/googletest/CMakeLists.txt)
+        message(FATAL_ERROR "googletest submodule not found at third_party/googletest. "
+                            "Run: git submodule update --init third_party/googletest")
+    endif()
+    set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+    add_subdirectory(third_party/googletest)
+    enable_testing()
+endif()
+
 # ------------------------------------------------------------------------------
 # Third-party deps
 # ------------------------------------------------------------------------------
@@ -26,7 +40,9 @@ include_directories(${gflags_SOURCE_DIR}/include)
 set(WITH_GFLAGS OFF CACHE BOOL "Disable glog finding system gflags" FORCE)
 set(WITH_GTEST OFF CACHE BOOL "Disable glog finding system gtest" FORCE)
 add_subdirectory(third_party/glog)
+# add_compile_definitions(GLOG_USE_GLOG_EXPORT=1)
 include_directories(${glog_SOURCE_DIR}/src)
+# include_directories(${glog_BINARY_DIR}/glog)
 
 # eigen
 if(USE_OMP)
@@ -48,6 +64,10 @@ endif()
 # Framework core sources (*.cc), excluding cpu kernels (they are built separately)
 file(GLOB_RECURSE SRC ${PROJECT_SOURCE_DIR}/infini_train/src/*.cc)
 list(FILTER SRC EXCLUDE REGEX ".*kernels/cpu/.*")
+if(NOT USE_CUDA)
+    list(FILTER SRC EXCLUDE REGEX ".*runtime/cuda/.*")
+    list(FILTER SRC EXCLUDE REGEX ".*ccl/cuda/.*")
+endif()
 if(NOT USE_NCCL)
     list(FILTER SRC EXCLUDE REGEX ".*infini_train/src/core/ccl/cuda/.*")
 endif()
@@ -190,17 +210,8 @@ add_executable(llama3
 )
 link_infini_train_exe(llama3)
 
-# Tools
-add_subdirectory(tools/infini_run)
-set_target_properties(infini_run PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
 
 # Tests
-add_executable(test_hook test/hook/test_hook.cc)
-link_infini_train_exe(test_hook)
-
-add_executable(test_precision_check test/hook/test_precision_check.cc)
-link_infini_train_exe(test_precision_check)
-
-add_executable(test_lora test/lora/test_lora.cc)
-link_infini_train_exe(test_lora)
-
+if(BUILD_TEST)
+    add_subdirectory(tests)
+endif()
diff --git a/test/hook/test_hook.cc b/test/hook/test_hook.cc
deleted file mode 100644
index 32c7e097..00000000
--- a/test/hook/test_hook.cc
+++ /dev/null
@@ -1,179 +0,0 @@
-#include <iostream>
-#include <memory>
-
-#include "glog/logging.h"
-
-#include "infini_train/include/autograd/elementwise.h"
-#include "infini_train/include/autograd/function.h"
-#include "infini_train/include/autograd/function_hook.h"
-#include "infini_train/include/common/hook.h"
-#include "infini_train/include/nn/modules/module.h"
-#include "infini_train/include/nn/parallel/global.h"
-#include "infini_train/include/tensor.h"
-
-using namespace infini_train;
-
-// ============================================================================
-// Test 1: Basic Module Hooks
-// ============================================================================
-void test_basic_hooks() {
-    std::cout << "\n=== Test 1: Basic Module Hooks ===" << std::endl;
-
-    auto x = std::make_shared<Tensor>(std::vector<int64_t>{2, 3}, DataType::kFLOAT32);
-    x->set_requires_grad(true);
-
-    // Module hook example
-    class MyModule : public nn::Module {
-    public:
-        MyModule() : Module("MyModule") {}
-
-        std::vector<std::shared_ptr<Tensor>> Forward(const std::vector<std::shared_ptr<Tensor>> &inputs) override {
-            std::cout << "Forward pass executing..." << std::endl;
-            return inputs;
-        }
-    };
-
-    auto module = std::make_shared<MyModule>();
-
-    // Register forward pre-hook
-    auto pre_hook
-        = module->RegisterForwardPreHook([](nn::Module *mod, const std::vector<std::shared_ptr<Tensor>> &inputs) {
-              std::cout << "Forward pre-hook: Module type = " << mod->type() << std::endl;
-          });
-
-    // Register forward post-hook
-    auto fwd_hook
-        = module->RegisterForwardPostHook([](nn::Module *mod, const std::vector<std::shared_ptr<Tensor>> &inputs,
-                                             const std::vector<std::shared_ptr<Tensor>> &outputs) {
-              std::cout << "Forward post-hook: Got " << outputs.size() << " outputs" << std::endl;
-          });
-
-    // Register backward pre-hook
-    auto bwd_pre_hook = module->RegisterBackwardPreHook(
-        [](nn::Module *mod, const std::vector<std::shared_ptr<Tensor>> &grad_outputs) {
-            std::cout << "Backward pre-hook called!" << std::endl;
-        });
-
-    // Register backward post-hook
-    auto bwd_post_hook
-        = module->RegisterBackwardPostHook([](nn::Module *mod, const std::vector<std::shared_ptr<Tensor>> &grad_inputs,
                                              const std::vector<std::shared_ptr<Tensor>> &grad_outputs) {
-              std::cout << "Backward post-hook called!" << std::endl;
-          });
-
-    // Test forward pass
-    std::vector<std::shared_ptr<Tensor>> inputs = {x};
-    auto outputs = (*module)(inputs);
-
-    std::cout << "Module hook test completed!" << std::endl;
<< std::endl; -} - -// ============================================================================ -// Test 2: Hook Remove() Functionality Test -// ============================================================================ -void test_hook_remove() { - std::cout << "\n=== Test 2: Hook Remove() Functionality Test ===" << std::endl; - - auto a = std::make_shared(std::vector{2, 2}, DataType::kFLOAT32); - auto b = std::make_shared(std::vector{2, 2}, DataType::kFLOAT32); - a->set_requires_grad(true); - b->set_requires_grad(true); - - int hook1_count = 0; - int hook2_count = 0; - int hook3_count = 0; - - auto add_fn = std::make_shared(); - - // Register three forward pre-hooks - auto handle1 = add_fn->RegisterForwardPreHook( - [&hook1_count](autograd::Function *, const std::vector> &) { - hook1_count++; - std::cout << "Hook 1 called (count: " << hook1_count << ")" << std::endl; - }); - - auto handle2 = add_fn->RegisterForwardPreHook( - [&hook2_count](autograd::Function *, const std::vector> &) { - hook2_count++; - std::cout << "Hook 2 called (count: " << hook2_count << ")" << std::endl; - }); - - auto handle3 = add_fn->RegisterForwardPreHook( - [&hook3_count](autograd::Function *, const std::vector> &) { - hook3_count++; - std::cout << "Hook 3 called (count: " << hook3_count << ")" << std::endl; - }); - - // First call - all hooks should fire - std::cout << "\n--- First Apply (all hooks active) ---" << std::endl; - std::vector> inputs; - inputs.push_back(a); - inputs.push_back(b); - auto result1 = add_fn->Apply(inputs); - std::cout << "Hook counts: " << hook1_count << ", " << hook2_count << ", " << hook3_count << std::endl; - - // Remove hook 2 - std::cout << "\n--- Removing Hook 2 ---" << std::endl; - handle2->Remove(); - - // Second call - hook 2 should not fire - std::cout << "\n--- Second Apply (hook 2 removed) ---" << std::endl; - auto result2 = add_fn->Apply(inputs); - std::cout << "Hook counts: " << hook1_count << ", " << hook2_count << ", " << hook3_count << std::endl; - - // Remove hook 1 - std::cout << "\n--- Removing Hook 1 ---" << std::endl; - handle1->Remove(); - - // Third call - only hook 3 should fire - std::cout << "\n--- Third Apply (hooks 1 and 2 removed) ---" << std::endl; - auto result3 = add_fn->Apply(inputs); - std::cout << "Hook counts: " << hook1_count << ", " << hook2_count << ", " << hook3_count << std::endl; - - // Verify results - std::cout << "\n=== Test Results ===" << std::endl; - bool test_passed = true; - - if (hook1_count != 2) { - std::cout << "FAIL: Hook 1 should be called 2 times, got " << hook1_count << std::endl; - test_passed = false; - } - - if (hook2_count != 1) { - std::cout << "FAIL: Hook 2 should be called 1 time, got " << hook2_count << std::endl; - test_passed = false; - } - - if (hook3_count != 3) { - std::cout << "FAIL: Hook 3 should be called 3 times, got " << hook3_count << std::endl; - test_passed = false; - } - - if (test_passed) { - std::cout << "SUCCESS: All hooks behaved correctly!" 
-        std::cout << " - Hook 1: called 2 times (before removal)" << std::endl;
-        std::cout << " - Hook 2: called 1 time (removed after first call)" << std::endl;
-        std::cout << " - Hook 3: called 3 times (never removed)" << std::endl;
-    }
-}
-
-// ============================================================================
-// Main
-// ============================================================================
-int main(int argc, char *argv[]) {
-    google::InitGoogleLogging(argv[0]);
-    nn::parallel::global::GlobalEnv::Instance().Init(1, 1, false, 1, 1);
-
-    std::cout << "========================================" << std::endl;
-    std::cout << " Hook Mechanism Tests" << std::endl;
-    std::cout << "========================================" << std::endl;
-
-    test_basic_hooks();
-    test_hook_remove();
-
-    std::cout << "\n========================================" << std::endl;
-    std::cout << " All Tests Completed Successfully" << std::endl;
-    std::cout << "========================================" << std::endl;
-
-    return 0;
-}
diff --git a/test/hook/test_precision_check.cc b/test/hook/test_precision_check.cc
deleted file mode 100644
index 65c8258c..00000000
--- a/test/hook/test_precision_check.cc
+++ /dev/null
@@ -1,241 +0,0 @@
-#include <filesystem>
-#include <iostream>
-#include <memory>
-
-#include "glog/logging.h"
-
-#include "infini_train/include/nn/modules/module.h"
-#include "infini_train/include/nn/parallel/global.h"
-#include "infini_train/include/tensor.h"
-#include "infini_train/include/utils/global_module_hook_registry.h"
-#include "infini_train/include/utils/precision_check_config.h"
-#include "infini_train/include/utils/precision_checker.h"
-
-using namespace infini_train;
-
-class MyModel : public nn::Module {
-public:
-    MyModel() : Module("MyModel") {}
-
-    std::vector<std::shared_ptr<Tensor>> Forward(const std::vector<std::shared_ptr<Tensor>> &inputs) override {
-        auto x = inputs[0];
-        x->RequiresGrad();
-        auto y = x->Mul(x);
-        return {y};
-    }
-};
-
-// Simple model for multi-iteration test
-class SimpleModel : public nn::Module {
-public:
-    SimpleModel() : Module("SimpleModel") {}
-
-    std::vector<std::shared_ptr<Tensor>> Forward(const std::vector<std::shared_ptr<Tensor>> &inputs) override {
-        auto x = inputs[0];
-        x->RequiresGrad();
-        auto y = x->Mul(x)->Mul(x); // x^3
-        return {y};
-    }
-};
-
-void RunModelForwardBackward(const std::shared_ptr<nn::Module> &model) {
-    auto x = std::make_shared<Tensor>(std::vector<int64_t>{2, 3}, DataType::kFLOAT32);
-    x->Fill(2.0f);
-    x->RequiresGrad();
-
-    std::vector<std::shared_ptr<Tensor>> inputs = {x};
-    auto outputs = (*model)(inputs);
-    auto loss = outputs[0]->Sum(0, false)->Sum(0, false);
-    loss->Backward();
-}
-
-void TestFunctionLevel(const std::string &config_str) {
-    std::cout << "\n========================================" << std::endl;
-    std::cout << " Function-Level Test: " << config_str << std::endl;
-    std::cout << "========================================" << std::endl;
-
-    auto x = std::make_shared<Tensor>(std::vector<int64_t>{2, 3}, DataType::kFLOAT32);
-    x->Fill(2.0f);
-    x->RequiresGrad();
-
-    auto y = std::make_shared<Tensor>(std::vector<int64_t>{2, 3}, DataType::kFLOAT32);
-    y->Fill(3.0f);
-    y->RequiresGrad();
-
-    auto z = x->Mul(y);
-    auto loss = z->Sum(0, false)->Sum(0, false);
-    loss->Backward();
-
-    std::cout << "Test completed." << std::endl;
-}
-
-void TestModuleLevel(const std::string &config_str) {
-    std::cout << "\n========================================" << std::endl;
-    std::cout << " Module-Level Test: " << config_str << std::endl;
-    std::cout << "========================================" << std::endl;
-
-    auto model = std::make_shared<MyModel>();
-    RunModelForwardBackward(model);
-
-    std::cout << "Test completed." << std::endl;
<< std::endl; -} - -// Test: Simple format output (level=2, format=simple) -void TestSimpleFormat() { - std::cout << "\n========================================" << std::endl; - std::cout << " Test: Simple Format (level=2, format=simple)" << std::endl; - std::cout << "========================================" << std::endl; - - auto x = std::make_shared(std::vector{2, 3}, DataType::kFLOAT32); - x->Fill(2.0f); - x->RequiresGrad(); - - auto y = x->Mul(x); - auto loss = y->Sum(0, false)->Sum(0, false); // Two Sum ops to produce scalar - loss->Backward(); - - std::cout << "Simple format test completed - check output for min/max/mean values." << std::endl; -} - -// Test: MD5 format output (level=2, format=md5) -void TestMd5Format() { - std::cout << "\n========================================" << std::endl; - std::cout << " Test: MD5 Format (level=2, format=md5)" << std::endl; - std::cout << "========================================" << std::endl; - - auto x = std::make_shared(std::vector{2, 3}, DataType::kFLOAT32); - x->Fill(2.0f); - x->RequiresGrad(); - - auto y = x->Mul(x); - auto loss = y->Sum(0, false)->Sum(0, false); // Two Sum ops to produce scalar - loss->Backward(); - - std::cout << "MD5 format test completed - check output for md5 hashes." << std::endl; -} - -// Test: Save tensors to NPY files (level=1, save_tensors=true) -void TestSaveTensors() { - std::cout << "\n========================================" << std::endl; - std::cout << " Test: Save Tensors (level=1, save_tensors=true)" << std::endl; - std::cout << "========================================" << std::endl; - - std::string output_path = "/tmp/precision_check_npy"; - - auto model = std::make_shared(); - RunModelForwardBackward(model); - - // Verify NPY files were created - namespace fs = std::filesystem; - bool found_npy = false; - if (fs::exists(output_path)) { - for (const auto &entry : fs::recursive_directory_iterator(output_path)) { - if (entry.path().extension() == ".npy") { - found_npy = true; - std::cout << "Found NPY file: " << entry.path() << std::endl; - } - } - } - - if (found_npy) { - std::cout << "Save tensors test PASSED - NPY files created successfully." << std::endl; - } else { - std::cout << "Save tensors test completed - check output directory for NPY files." << std::endl; - } -} - -// Test: Multi-iteration file overwrite (level=1, save_tensors=true, iter=3) -void TestMultiIterOverwrite() { - std::cout << "\n========================================" << std::endl; - std::cout << " Test: Multi-Iteration File Overwrite" << std::endl; - std::cout << "========================================" << std::endl; - - std::string output_path = "/tmp/precision_check_overwrite"; - - auto model = std::make_shared(); - int num_iters = 3; - - // Run multiple iterations - files should be overwritten - for (int i = 0; i < num_iters; ++i) { - std::cout << "Iteration " << (i + 1) << "/" << num_iters << std::endl; - utils::PrecisionCheckEnv::ResetCounters(); // Reset counters each iteration - RunModelForwardBackward(model); - } - - namespace fs = std::filesystem; - int npy_count = 0; - if (fs::exists(output_path)) { - for (const auto &entry : fs::recursive_directory_iterator(output_path)) { - if (entry.path().extension() == ".npy") { - ++npy_count; - } - } - } - - std::cout << "Multi-iteration test completed - found " << npy_count << " NPY files after " << num_iters - << " iterations." 
-    std::cout << "(Files should be overwritten each iteration, count should be consistent with 1 iter)" << std::endl;
-}
-
-int main(int argc, char *argv[]) {
-    google::InitGoogleLogging(argv[0]);
-
-    std::string config_str = argc > 1 ? argv[1] : "";
-
-    std::cout << "========================================" << std::endl;
-    std::cout << " Precision Check Test Suite" << std::endl;
-    std::cout << "========================================" << std::endl;
-
-    nn::parallel::global::InitAllEnv(1, 1, false, 1, 1);
-
-    // If no config argument, run all format tests
-    if (config_str.empty()) {
-        auto config = utils::PrecisionCheckConfig::Parse("level=2,format=simple");
-        utils::PrecisionCheckEnv::Instance().Init(config);
-
-        std::cout << "\nRunning all precision check format tests..." << std::endl;
-
-        // Test 1: Simple format
-        TestSimpleFormat();
-
-        // Test 2: MD5 format
-        auto md5_config = utils::PrecisionCheckConfig::Parse("level=2,format=md5");
-        utils::PrecisionCheckEnv::Instance().Init(md5_config);
-        TestMd5Format();
-
-        // Test 3: Save tensors
-        auto npy_config = utils::PrecisionCheckConfig::Parse("level=1,save_tensors=true");
-        utils::PrecisionCheckEnv::Instance().Init(npy_config);
-        TestSaveTensors();
-
-        // Test 4: Multi-iteration overwrite
-        auto iter_config = utils::PrecisionCheckConfig::Parse("level=1,save_tensors=true");
-        utils::PrecisionCheckEnv::Instance().Init(iter_config);
-        TestMultiIterOverwrite();
-
-        std::cout << "\n========================================" << std::endl;
-        std::cout << " All Tests Completed Successfully" << std::endl;
-        std::cout << "========================================" << std::endl;
-        return 0;
-    }
-
-    // If config provided, run single test (original behavior)
-    auto config = utils::PrecisionCheckConfig::Parse(config_str);
-    utils::PrecisionCheckEnv::Instance().Init(config);
-
-    std::cout << "Config: " << config_str << std::endl;
-
-    if (config.level == utils::PrecisionCheckLevel::MODULE) {
-        TestModuleLevel(config_str);
-    } else if (config.level == utils::PrecisionCheckLevel::FUNCTION) {
-        TestFunctionLevel(config_str);
-    } else {
-        std::cout << "No tests to run (level=0)" << std::endl;
-    }
-
-    std::cout << "\n========================================" << std::endl;
-    std::cout << " Test Completed" << std::endl;
-    std::cout << "========================================" << std::endl;
-
-    return 0;
-}
diff --git a/test/lora/test_lora.cc b/test/lora/test_lora.cc
deleted file mode 100644
index 06966809..00000000
--- a/test/lora/test_lora.cc
+++ /dev/null
@@ -1,860 +0,0 @@
-#include <cstdio>
-#include <iostream>
-#include <memory>
-
-#include "glog/logging.h"
-
-#include "infini_train/include/nn/lora/lora_config.h"
-#include "infini_train/include/nn/lora/lora_linear.h"
-#include "infini_train/include/nn/lora/lora_utils.h"
-#include "infini_train/include/nn/modules/container.h"
-#include "infini_train/include/nn/modules/linear.h"
-#include "infini_train/include/nn/modules/module.h"
-#include "infini_train/include/nn/parallel/global.h"
-#include "infini_train/include/tensor.h"
-
-using namespace infini_train;
-using namespace infini_train::nn::lora;
-
-// ============================================================================
-// Test 1: LoRAConfig
-// ============================================================================
-void test_lora_config() {
-    std::cout << "\n=== Test 1: LoRAConfig ===" << std::endl;
-
-    LoRAConfig config;
-    config.rank = 8;
-    config.alpha = 16.0f;
-
-    // Test scaling calculation
-    float expected_scaling = 16.0f / 8.0f;
-    CHECK_EQ(config.Scaling(), expected_scaling) << "Scaling calculation failed";
-    std::cout << "Scaling: " << config.Scaling() << " (expected: " << expected_scaling << ")" << std::endl;
-
-    // Test ShouldApplyLoRA
-    CHECK(config.ShouldApplyLoRA("c_attn")) << "Should match c_attn";
-    CHECK(config.ShouldApplyLoRA("transformer.h.0.attn.c_attn")) << "Should match nested c_attn";
-    CHECK(config.ShouldApplyLoRA("c_proj")) << "Should match c_proj";
-    CHECK(!config.ShouldApplyLoRA("c_fc")) << "Should not match c_fc (not in default targets)";
-    CHECK(!config.ShouldApplyLoRA("random_layer")) << "Should not match random_layer";
-
-    std::cout << "LoRAConfig tests passed!" << std::endl;
-}
-
-// ============================================================================
-// Test 2: LoRALinear Initialization
-// ============================================================================
-void test_lora_linear_init() {
-    std::cout << "\n=== Test 2: LoRALinear Initialization ===" << std::endl;
-
-    LoRAConfig config;
-    config.rank = 4;
-    config.alpha = 8.0f;
-
-    int64_t in_features = 64;
-    int64_t out_features = 128;
-
-    auto lora_linear
-        = std::shared_ptr<LoRALinear>(new LoRALinear(in_features, out_features, config, /*bias=*/true, nullptr));
-
-    // Check parameter shapes
-    auto weight = lora_linear->parameter(nn::Linear::kParamWeightName);
-    auto bias = lora_linear->parameter(nn::Linear::kParamBiasName);
-    auto lora_A = lora_linear->parameter(LoRALinear::kParamLoraAName);
-    auto lora_B = lora_linear->parameter(LoRALinear::kParamLoraBName);
-
-    CHECK_EQ(weight->Dims().size(), 2);
-    CHECK_EQ(weight->Dims()[0], out_features);
-    CHECK_EQ(weight->Dims()[1], in_features);
-    std::cout << "Weight shape: [" << weight->Dims()[0] << ", " << weight->Dims()[1] << "]" << std::endl;
-
-    CHECK_EQ(bias->Dims().size(), 1);
-    CHECK_EQ(bias->Dims()[0], out_features);
-    std::cout << "Bias shape: [" << bias->Dims()[0] << "]" << std::endl;
-
-    CHECK_EQ(lora_A->Dims().size(), 2);
-    CHECK_EQ(lora_A->Dims()[0], config.rank);
-    CHECK_EQ(lora_A->Dims()[1], in_features);
-    std::cout << "LoRA A shape: [" << lora_A->Dims()[0] << ", " << lora_A->Dims()[1] << "]" << std::endl;
-
-    CHECK_EQ(lora_B->Dims().size(), 2);
-    CHECK_EQ(lora_B->Dims()[0], out_features);
-    CHECK_EQ(lora_B->Dims()[1], config.rank);
-    std::cout << "LoRA B shape: [" << lora_B->Dims()[0] << ", " << lora_B->Dims()[1] << "]" << std::endl;
-
-    // Check requires_grad
-    CHECK(!weight->requires_grad()) << "Base weight should be frozen";
-    CHECK(!bias->requires_grad()) << "Base bias should be frozen";
-    CHECK(lora_A->requires_grad()) << "LoRA A should be trainable";
-    CHECK(lora_B->requires_grad()) << "LoRA B should be trainable";
-    std::cout << "requires_grad check passed!" << std::endl;
-
-    // Check LoRAParameters() returns only LoRA params
-    auto params = lora_linear->LoRAParameters();
-    CHECK_EQ(params.size(), 2) << "LoRAParameters() should return only LoRA params";
-    std::cout << "LoRAParameters() returns " << params.size() << " tensors (LoRA A and B)" << std::endl;
-
-    std::cout << "LoRALinear initialization tests passed!" << std::endl;
<< std::endl; -} - -// ============================================================================ -// Test 3: LoRALinear Forward Pass -// ============================================================================ -void test_lora_linear_forward() { - std::cout << "\n=== Test 3: LoRALinear Forward Pass ===" << std::endl; - - LoRAConfig config; - config.rank = 4; - config.alpha = 8.0f; - - int64_t in_features = 64; - int64_t out_features = 128; - int64_t batch_size = 2; - int64_t seq_len = 10; - - auto lora_linear - = std::shared_ptr(new LoRALinear(in_features, out_features, config, /*bias=*/true, nullptr)); - - // Create input tensor - auto input = std::make_shared(std::vector{batch_size, seq_len, in_features}, DataType::kFLOAT32); - - // Forward pass - auto output = (*lora_linear)({input})[0]; - - // Check output shape - CHECK_EQ(output->Dims().size(), 3); - CHECK_EQ(output->Dims()[0], batch_size); - CHECK_EQ(output->Dims()[1], seq_len); - CHECK_EQ(output->Dims()[2], out_features); - std::cout << "Output shape: [" << output->Dims()[0] << ", " << output->Dims()[1] << ", " << output->Dims()[2] << "]" - << std::endl; - - std::cout << "LoRALinear forward pass tests passed!" << std::endl; -} - -// ============================================================================ -// Test 4: LoRALinear Weight Merging -// ============================================================================ -void test_lora_linear_merge() { - std::cout << "\n=== Test 4: LoRALinear Weight Merging ===" << std::endl; - - LoRAConfig config; - config.rank = 4; - config.alpha = 8.0f; - - int64_t in_features = 32; - int64_t out_features = 64; - - auto lora_linear - = std::shared_ptr(new LoRALinear(in_features, out_features, config, /*bias=*/false, nullptr)); - - // Print weight sum before merge - auto weight_before = lora_linear->parameter(nn::Linear::kParamWeightName); - auto lora_A = lora_linear->parameter(LoRALinear::kParamLoraAName); - auto lora_B = lora_linear->parameter(LoRALinear::kParamLoraBName); - - float weight_before_sum = weight_before->EigenMatrix().sum(); - float lora_A_sum = lora_A->EigenMatrix().sum(); - float lora_B_sum = lora_B->EigenMatrix().sum(); - - std::cout << "\n--- Before Merge ---" << std::endl; - std::cout << "Base weight sum: " << weight_before_sum << std::endl; - std::cout << "LoRA A sum: " << lora_A_sum << std::endl; - std::cout << "LoRA B sum: " << lora_B_sum << std::endl; - std::cout << "Scaling (alpha/r): " << config.Scaling() << std::endl; - - // Create input - auto input = std::make_shared(std::vector{2, 5, in_features}, DataType::kFLOAT32); - input->EigenMatrix().setRandom(); - - // Get output before merge - auto output_before = (*lora_linear)({input})[0]; - float output_before_sum = output_before->EigenMatrix().sum(); - std::cout << "Output sum before merge: " << output_before_sum << std::endl; - - // Merge weights - CHECK(!lora_linear->IsMerged()) << "Should not be merged initially"; - lora_linear->MergeWeights(); - CHECK(lora_linear->IsMerged()) << "Should be merged after MergeWeights()"; - - // Verify LoRA params are frozen after merge - CHECK(!lora_A->requires_grad()) << "lora_A should be frozen after merge"; - CHECK(!lora_B->requires_grad()) << "lora_B should be frozen after merge"; - std::cout << "\nWeights merged successfully, LoRA params frozen" << std::endl; - - // Print weight sum after merge - auto weight_after = lora_linear->parameter(nn::Linear::kParamWeightName); - float weight_after_sum = weight_after->EigenMatrix().sum(); - std::cout << "\n--- After Merge ---" << 
-    std::cout << "Base weight sum after merge: " << weight_after_sum << std::endl;
-    std::cout << "Weight change (should be ~LoRA contribution): " << (weight_after_sum - weight_before_sum)
-              << std::endl;
-
-    // Get output after merge
-    auto output_merged = (*lora_linear)({input})[0];
-    float output_merged_sum = output_merged->EigenMatrix().sum();
-    std::cout << "Output sum after merge: " << output_merged_sum << std::endl;
-
-    // Verify: output_after should equal output_before (numerically)
-    std::cout << "\nVerification: output_before == output_after? " << std::endl;
-    std::cout << "  Before: " << output_before_sum << std::endl;
-    std::cout << "  After:  " << output_merged_sum << std::endl;
-    std::cout << "  Diff:   " << std::abs(output_before_sum - output_merged_sum) << std::endl;
-    CHECK(std::abs(output_before_sum - output_merged_sum) < 1e-3) << "Outputs should be numerically identical!";
-
-    // Shape comparison (always same)
-    std::cout << "\nOutput shape: [" << output_before->Dims()[0] << ", " << output_before->Dims()[1] << ", "
-              << output_before->Dims()[2] << "] (unchanged)" << std::endl;
-
-    // Unmerge weights
-    lora_linear->UnmergeWeights();
-    CHECK(!lora_linear->IsMerged()) << "Should not be merged after UnmergeWeights()";
-
-    // Verify LoRA params are trainable again after unmerge
-    CHECK(lora_A->requires_grad()) << "lora_A should be trainable after unmerge";
-    CHECK(lora_B->requires_grad()) << "lora_B should be trainable after unmerge";
-
-    // Print weight sum after unmerge
-    auto weight_unmerged = lora_linear->parameter(nn::Linear::kParamWeightName);
-    float weight_unmerged_sum = weight_unmerged->EigenMatrix().sum();
-    std::cout << "\n--- After Unmerge ---" << std::endl;
-    std::cout << "Base weight sum after unmerge: " << weight_unmerged_sum << std::endl;
-
-    // Verify: weight should be restored to original value
-    std::cout << "\nVerification: weight restored after unmerge? " << std::endl;
-    std::cout << "  Original: " << weight_before_sum << std::endl;
-    std::cout << "  Unmerged: " << weight_unmerged_sum << std::endl;
-    std::cout << "  Diff:     " << std::abs(weight_before_sum - weight_unmerged_sum) << std::endl;
-    CHECK(std::abs(weight_before_sum - weight_unmerged_sum) < 1e-4) << "Weight should be restored!";
-
-    // Get output after unmerge
-    auto output_unmerged = (*lora_linear)({input})[0];
-    float output_unmerged_sum = output_unmerged->EigenMatrix().sum();
-    std::cout << "Output sum after unmerge: " << output_unmerged_sum << std::endl;
-
-    // Shape comparison: merge doesn't change shape, only weights
-    CHECK(output_before->Dims() == output_merged->Dims()) << "Shape should be identical after merge";
-    CHECK(output_merged->Dims() == output_unmerged->Dims()) << "Shape should be identical after unmerge";
-
-    std::cout << "\nLoRALinear weight merging tests passed!" << std::endl;
<< std::endl; -} - -// ============================================================================ -// Test 5: LoRA Utility Functions -// ============================================================================ -void test_lora_utils() { - std::cout << "\n=== Test 5: LoRA Utility Functions ===" << std::endl; - - LoRAConfig config; - config.rank = 4; - config.alpha = 8.0f; - - auto lora_linear = std::shared_ptr(new LoRALinear(32, 64, config, /*bias=*/true, nullptr)); - - // Test GetLoRAParameters - auto lora_params = GetLoRAParameters(lora_linear); - CHECK_EQ(lora_params.size(), 2) << "Should have 2 LoRA parameters"; - std::cout << "GetLoRAParameters returned " << lora_params.size() << " parameters" << std::endl; - - // Test CountTrainableParameters - int64_t trainable = CountTrainableParameters(lora_linear); - int64_t expected_trainable = config.rank * 32 + 64 * config.rank; // A: [4, 32], B: [64, 4] - CHECK_EQ(trainable, expected_trainable) << "Trainable parameter count mismatch"; - std::cout << "Trainable parameters: " << trainable << " (expected: " << expected_trainable << ")" << std::endl; - - // Test CountTotalParameters - int64_t total = CountTotalParameters(lora_linear); - int64_t expected_total = 64 * 32 + 64 + config.rank * 32 + 64 * config.rank; // weight + bias + A + B - CHECK_EQ(total, expected_total) << "Total parameter count mismatch"; - std::cout << "Total parameters: " << total << " (expected: " << expected_total << ")" << std::endl; - - // Test PrintLoRASummary - std::cout << "\nLoRA Summary:" << std::endl; - PrintLoRASummary(lora_linear); - - std::cout << "LoRA utility function tests passed!" << std::endl; -} - -// ============================================================================ -// Test 6: LoRALinear from existing Linear -// ============================================================================ -void test_lora_from_linear() { - std::cout << "\n=== Test 6: LoRALinear from existing Linear ===" << std::endl; - - // Create a standard Linear layer - auto linear = std::make_shared(64, 128, /*bias=*/true); - - // Wrap it with LoRA - LoRAConfig config; - config.rank = 8; - config.alpha = 16.0f; - - auto lora_linear = std::make_shared(linear, config); - - // Check dimensions - CHECK_EQ(lora_linear->in_features(), 64); - CHECK_EQ(lora_linear->out_features(), 128); - CHECK_EQ(lora_linear->rank(), 8); - std::cout << "LoRALinear created from Linear: in=" << lora_linear->in_features() - << ", out=" << lora_linear->out_features() << ", rank=" << lora_linear->rank() << std::endl; - - // Test forward pass - auto input = std::make_shared(std::vector{2, 10, 64}, DataType::kFLOAT32); - auto output = (*lora_linear)({input})[0]; - - CHECK_EQ(output->Dims()[0], 2); - CHECK_EQ(output->Dims()[1], 10); - CHECK_EQ(output->Dims()[2], 128); - std::cout << "Forward pass successful, output shape: [" << output->Dims()[0] << ", " << output->Dims()[1] << ", " - << output->Dims()[2] << "]" << std::endl; - - std::cout << "LoRALinear from existing Linear tests passed!" 
<< std::endl; -} - -// ============================================================================ -// Test 7: LoRALinear from existing Linear (tests LoRA utilities) -// ============================================================================ -void test_lora_model_wrapper() { - std::cout << "\n=== Test 7: LoRALinear from existing Linear ===" << std::endl; - - // Create LoRA config - LoRAConfig lora_config; - lora_config.rank = 8; - lora_config.alpha = 16.0f; - - // Create base Linear module (simple test without InjectLoRALayers) - auto base_linear = std::make_shared(64, 128, /*bias=*/true); - - // Create a minimal wrapper test by manually testing what LoRAModel does - // Apply LoRA directly to the Linear layer - auto lora_linear = std::make_shared(base_linear, lora_config); - - // Replace the base_linear in its container - // Note: In a real use case, you would use InjectLoRALayers on a transformer model - - // Test GetLoRAParameters on the LoRA Linear - auto lora_params = GetLoRAParameters(lora_linear); - CHECK_GT(lora_params.size(), 0) << "Should have trainable parameters"; - std::cout << "LoRA parameters extracted: " << lora_params.size() << std::endl; - - // Test CountTrainableParameters - int64_t trainable = CountTrainableParameters(lora_linear); - CHECK_EQ(trainable, lora_config.rank * 64 + 128 * lora_config.rank); - std::cout << "Trainable parameters: " << trainable << std::endl; - - // Test PrintSummary - std::cout << "\nLoRA Summary for Linear wrapper:" << std::endl; - PrintLoRASummary(lora_linear); - - // Test Save/Load LoRA on the LoRA Linear - const std::string test_path = "/tmp/test_lora_linear.bin"; - SaveLoRAWeights(lora_linear, test_path); - std::cout << "SaveLoRAWeights completed" << std::endl; - - LoadLoRAWeights(lora_linear, test_path); - std::cout << "LoadLoRAWeights completed" << std::endl; - - // Test Merge/Unmerge on LoRA Linear - CHECK(!lora_linear->IsMerged()) << "Should not be merged initially"; - lora_linear->MergeWeights(); - CHECK(lora_linear->IsMerged()) << "Should be merged after MergeWeights()"; - std::cout << "MergeWeights completed" << std::endl; - - lora_linear->UnmergeWeights(); - CHECK(!lora_linear->IsMerged()) << "Should be unmerged after UnmergeWeights()"; - std::cout << "UnmergeWeights completed" << std::endl; - - std::cout << "LoRALinear utility tests passed!" 
<< std::endl; -} - -// ============================================================================ -// Test 8: Save/Load LoRA Weights -// ============================================================================ -void test_lora_save_load_weights() { - std::cout << "\n=== Test 8: Save/Load LoRA Weights ===" << std::endl; - - // Create a LoRALinear - LoRAConfig config; - config.rank = 4; - config.alpha = 8.0f; - - int64_t in_features = 32; - int64_t out_features = 64; - - auto linear = std::make_shared(in_features, out_features, /*bias=*/true); - auto lora_linear = std::make_shared(linear, config); - - // Get references to lora_A and lora_B - auto lora_A = lora_linear->parameter(LoRALinear::kParamLoraAName); - auto lora_B = lora_linear->parameter(LoRALinear::kParamLoraBName); - - // Set specific values to lora_A and lora_B - // lora_A: [rank, in_features] = [4, 32] - // lora_B: [out_features, rank] = [64, 4] - lora_A->EigenMatrix().setZero(); - lora_B->EigenMatrix().setZero(); - - // Set lora_A to all 1s - for (int64_t i = 0; i < lora_A->Dims()[0]; ++i) { - for (int64_t j = 0; j < lora_A->Dims()[1]; ++j) { lora_A->EigenMatrix()(i, j) = 1.0f; } - } - - // Set lora_B to all 2s - for (int64_t i = 0; i < lora_B->Dims()[0]; ++i) { - for (int64_t j = 0; j < lora_B->Dims()[1]; ++j) { lora_B->EigenMatrix()(i, j) = 2.0f; } - } - - // Record original sums - float lora_A_sum_orig = lora_A->EigenMatrix().sum(); - float lora_B_sum_orig = lora_B->EigenMatrix().sum(); - // lora_A: all 1.0f, shape [rank, in_features] = [4, 32] - // lora_B: all 2.0f, shape [out_features, rank] = [64, 4] - float expected_lora_A_sum = config.rank * in_features * 1.0f; // 4 * 32 * 1 = 128 - float expected_lora_B_sum = out_features * config.rank * 2.0f; // 64 * 4 * 2 = 512 - std::cout << "Original lora_A sum: " << lora_A_sum_orig << " (expected: " << expected_lora_A_sum << ")" - << std::endl; - std::cout << "Original lora_B sum: " << lora_B_sum_orig << " (expected: " << expected_lora_B_sum << ")" - << std::endl; - - CHECK_EQ(lora_A_sum_orig, expected_lora_A_sum); - CHECK_EQ(lora_B_sum_orig, expected_lora_B_sum); - - // Save to file - const std::string test_path = "/tmp/test_lora_save_load.bin"; - SaveLoRAWeights(lora_linear, test_path); - std::cout << "Saved LoRA weights to: " << test_path << std::endl; - - // Modify weights to different values - lora_A->EigenMatrix().setConstant(9.0f); - lora_B->EigenMatrix().setConstant(9.0f); - - float lora_A_sum_modified = lora_A->EigenMatrix().sum(); - float lora_B_sum_modified = lora_B->EigenMatrix().sum(); - std::cout << "Modified lora_A sum: " << lora_A_sum_modified << std::endl; - std::cout << "Modified lora_B sum: " << lora_B_sum_modified << std::endl; - - CHECK_NE(lora_A_sum_modified, lora_A_sum_orig); - CHECK_NE(lora_B_sum_modified, lora_B_sum_orig); - - // Load from file - LoadLoRAWeights(lora_linear, test_path); - std::cout << "Loaded LoRA weights from: " << test_path << std::endl; - - // Verify weights are restored - float lora_A_sum_loaded = lora_A->EigenMatrix().sum(); - float lora_B_sum_loaded = lora_B->EigenMatrix().sum(); - std::cout << "Loaded lora_A sum: " << lora_A_sum_loaded << std::endl; - std::cout << "Loaded lora_B sum: " << lora_B_sum_loaded << std::endl; - - CHECK_EQ(lora_A_sum_loaded, lora_A_sum_orig) << "lora_A should be restored to original values"; - CHECK_EQ(lora_B_sum_loaded, lora_B_sum_orig) << "lora_B should be restored to original values"; - - // Also verify individual elements - for (int64_t i = 0; i < lora_A->Dims()[0]; ++i) { - for (int64_t j = 0; j < 
-        for (int64_t j = 0; j < lora_A->Dims()[1]; ++j) {
-            CHECK_EQ(lora_A->EigenMatrix()(i, j), 1.0f) << "lora_A element mismatch at (" << i << "," << j << ")";
-        }
-    }
-
-    for (int64_t i = 0; i < lora_B->Dims()[0]; ++i) {
-        for (int64_t j = 0; j < lora_B->Dims()[1]; ++j) {
-            CHECK_EQ(lora_B->EigenMatrix()(i, j), 2.0f) << "lora_B element mismatch at (" << i << "," << j << ")";
-        }
-    }
-
-    std::cout << "All elements verified correctly!" << std::endl;
-
-    // Cleanup
-    std::remove(test_path.c_str());
-    std::cout << "Test 8: Save/Load LoRA Weights passed!" << std::endl;
-}
-
-// ============================================================================
-// Test 8: ParseLoRATargetModules parsing
-// ============================================================================
-void test_set_target_modules() {
-    std::cout << "\n=== Test 8: ParseLoRATargetModules Parsing ===" << std::endl;
-
-    // Test single target
-    auto modules = ParseLoRATargetModules("c_attn");
-    CHECK_EQ(modules.size(), 1);
-    CHECK(modules.count("c_attn"));
-    std::cout << "Single target: OK" << std::endl;
-
-    // Test multiple targets
-    modules = ParseLoRATargetModules("c_attn,c_proj,c_fc");
-    CHECK_EQ(modules.size(), 3);
-    CHECK(modules.count("c_attn"));
-    CHECK(modules.count("c_proj"));
-    CHECK(modules.count("c_fc"));
-    std::cout << "Multiple targets: OK" << std::endl;
-
-    // Test with spaces
-    modules = ParseLoRATargetModules("c_attn, c_proj , c_fc");
-    CHECK_EQ(modules.size(), 3);
-    std::cout << "Targets with spaces: OK" << std::endl;
-
-    // Test empty/whitespace
-    modules = ParseLoRATargetModules("c_attn,,c_proj");
-    CHECK_EQ(modules.size(), 2);
-    std::cout << "Empty entries ignored: OK" << std::endl;
-
-    std::cout << "ParseLoRATargetModules tests passed!" << std::endl;
-}
-
-// ============================================================================
-// Test 9: ShouldApplyLoRA edge cases (attn.c_proj vs mlp.c_proj)
-// ============================================================================
-void test_should_apply_lora_edge_cases() {
-    std::cout << "\n=== Test 9: ShouldApplyLoRA Edge Cases ===" << std::endl;
-
-    // Test: Only attn.c_proj in target_modules
-    {
-        LoRAConfig config{8, 16.0f, 0.0f, ParseLoRATargetModules("c_attn,attn.c_proj")};
-
-        // Should match attention paths
-        CHECK(config.ShouldApplyLoRA("attn.c_proj"));
-        CHECK(config.ShouldApplyLoRA("transformer.h.0.attn.c_proj"));
-        CHECK(config.ShouldApplyLoRA("transformer.h.1.attn.c_proj"));
-
-        // Should NOT match mlp paths
-        CHECK(!config.ShouldApplyLoRA("mlp.c_proj"));
-        CHECK(!config.ShouldApplyLoRA("transformer.h.0.mlp.c_proj"));
-        std::cout << "attn.c_proj only: OK" << std::endl;
-    }
-
-    // Test: Only mlp.c_proj in target_modules
-    {
-        LoRAConfig config{8, 16.0f, 0.0f, ParseLoRATargetModules("c_attn,mlp.c_proj")};
-
-        // Should NOT match attention paths
-        CHECK(!config.ShouldApplyLoRA("attn.c_proj"));
-        CHECK(!config.ShouldApplyLoRA("transformer.h.0.attn.c_proj"));
-
-        // Should match mlp paths
-        CHECK(config.ShouldApplyLoRA("mlp.c_proj"));
-        CHECK(config.ShouldApplyLoRA("transformer.h.0.mlp.c_proj"));
-        std::cout << "mlp.c_proj only: OK" << std::endl;
-    }
-
-    // Test: Generic c_proj in target_modules (matches both)
-    {
-        LoRAConfig config{8, 16.0f, 0.0f, ParseLoRATargetModules("c_attn,c_proj")};
-
-        // Should match both attention and mlp
-        CHECK(config.ShouldApplyLoRA("transformer.h.0.attn.c_proj"));
-        CHECK(config.ShouldApplyLoRA("transformer.h.0.mlp.c_proj"));
-        std::cout << "Generic c_proj (matches both): OK" << std::endl;
-    }
-
-    // Test: All targets
-    {
-        LoRAConfig config{8, 16.0f, 0.0f, ParseLoRATargetModules("c_attn,attn.c_proj,c_fc,c_fc2,mlp.c_proj")};
-
-        CHECK(config.ShouldApplyLoRA("transformer.h.0.attn.c_attn"));
-        CHECK(config.ShouldApplyLoRA("transformer.h.0.attn.c_proj"));
-        CHECK(config.ShouldApplyLoRA("transformer.h.0.mlp.c_fc"));
-        CHECK(config.ShouldApplyLoRA("transformer.h.0.mlp.c_fc2"));
-        CHECK(config.ShouldApplyLoRA("transformer.h.0.mlp.c_proj"));
-        std::cout << "All targets: OK" << std::endl;
-    }
-
-    std::cout << "ShouldApplyLoRA edge cases tests passed!" << std::endl;
-}
-
-// ============================================================================
-// Test 10: ReplaceModuleByPath
-// ============================================================================
-void test_replace_module_by_path() {
-    std::cout << "\n=== Test 10: ReplaceModuleByPath ===" << std::endl;
-
-    // Test ReplaceModuleByPath by wrapping a Linear with LoRA directly
-    // This tests the core functionality that ReplaceModuleByPath provides
-
-    // Create base Linear
-    auto base_linear = std::make_shared<nn::Linear>(64, 128, /*bias=*/true);
-
-    // Configure LoRA
-    LoRAConfig lora_config;
-    lora_config.rank = 4;
-    lora_config.alpha = 8.0f;
-
-    // Wrap with LoRA - this is what ReplaceModuleByPath does internally
-    auto lora_linear = std::make_shared<LoRALinear>(base_linear, lora_config);
-
-    // Verify LoRA was applied correctly
-    auto params = lora_linear->LoRAParameters();
-    CHECK_EQ(params.size(), 2) << "LoRALinear should have 2 trainable parameters (lora_A and lora_B)";
-    std::cout << "LoRALinear has " << params.size() << " trainable parameters" << std::endl;
-
-    // Verify parameter shapes
-    auto lora_a = params[0];
-    auto lora_b = params[1];
-    CHECK_EQ(lora_a->Dims()[0], lora_config.rank); // rank x in_features
-    CHECK_EQ(lora_a->Dims()[1], 64);
-    CHECK_EQ(lora_b->Dims()[0], 128); // out_features x rank
-    CHECK_EQ(lora_b->Dims()[1], lora_config.rank);
-    std::cout << "LoRA parameter shapes: OK" << std::endl;
-
-    // Verify base parameters are frozen (use named parameters instead of index)
-    auto weight = lora_linear->parameter(nn::Linear::kParamWeightName);
-    auto lora_a_param = lora_linear->parameter(LoRALinear::kParamLoraAName);
-    auto lora_b_param = lora_linear->parameter(LoRALinear::kParamLoraBName);
-    CHECK(weight != nullptr);
-    CHECK(lora_a_param != nullptr);
-    CHECK(lora_b_param != nullptr);
-    CHECK(!weight->requires_grad());      // weight is frozen
-    CHECK(lora_a_param->requires_grad()); // lora_A is trainable
-    CHECK(lora_b_param->requires_grad()); // lora_B is trainable
-    std::cout << "Base weight frozen, LoRA params trainable: OK" << std::endl;
-
-    std::cout << "ReplaceModuleByPath tests passed!" << std::endl;
<< std::endl; -} - -// ============================================================================ -// Test 11: FreezeBaseModel / UnfreezeModel -// ============================================================================ -void test_freeze_unfreeze() { - std::cout << "\n=== Test 11: FreezeBaseModel / UnfreezeModel ===" << std::endl; - - // Test with LoRALinear directly - it has both base and LoRA params - LoRAConfig lora_config; - lora_config.rank = 4; - lora_config.alpha = 8.0f; - - auto linear = std::make_shared(64, 128, /*bias=*/true); - auto lora_linear = std::make_shared(linear, lora_config); - - // Get all parameters from LoRALinear (includes base + LoRA) - auto all_params = lora_linear->Parameters(); - - // Initially only LoRA params should be trainable (base weights are frozen by constructor) - int64_t total_params = 0; - for (const auto &p : all_params) { - if (p->requires_grad()) { - total_params += p->NumElements(); - } - } - // Expected: only LoRA params (lora_A + lora_B) = 4*64 + 128*4 = 256 + 512 = 768 - // Note: LoRALinear freezes base weights in constructor by design - int64_t expected_total = lora_config.rank * 64 + 128 * lora_config.rank; - CHECK_EQ(total_params, expected_total); - std::cout << "Initial trainable params: " << total_params << " (expected: " << expected_total << ")" << std::endl; - - // FreezeBaseModel on LoRALinear - FreezeBaseModel(lora_linear); - - // After freeze, only LoRA params should be trainable - int64_t after_freeze = 0; - for (const auto &p : all_params) { - if (p->requires_grad()) { - after_freeze += p->NumElements(); - } - } - // LoRA params: A (rank x in) + B (out x rank) = 4*64 + 128*4 = 256 + 512 = 768 - int64_t expected_lora = lora_config.rank * 64 + 128 * lora_config.rank; - CHECK_EQ(after_freeze, expected_lora); - std::cout << "After freeze trainable: " << after_freeze << " (expected: " << expected_lora << ")" << std::endl; - - // Unfreeze all - UnfreezeModel(lora_linear); - int64_t after_unfreeze = 0; - for (const auto &p : all_params) { - if (p->requires_grad()) { - after_unfreeze += p->NumElements(); - } - } - // Should be back to all params trainable (base + LoRA) - int64_t expected_after_unfreeze = 64 * 128 + 128 + lora_config.rank * 64 + 128 * lora_config.rank; - CHECK_EQ(after_unfreeze, expected_after_unfreeze); - std::cout << "After unfreeze trainable: " << after_unfreeze << std::endl; - - std::cout << "FreezeBaseModel / UnfreezeModel tests passed!" 
<< std::endl; -} - -// ============================================================================ -// Test 12: LoRAStateDict -// ============================================================================ -void test_lora_state_dict() { - std::cout << "\n=== Test 12: LoRAStateDict ===" << std::endl; - - // Test with a single LoRALinear - LoRAConfig lora_config; - lora_config.rank = 4; - lora_config.alpha = 8.0f; - - auto linear = std::make_shared(64, 128, /*bias=*/true); - auto lora_linear = std::make_shared(linear, lora_config); - - // Get state dict - it contains all parameters with their names - auto state_dict = lora_linear->StateDict(); - - // Check that we have all expected parameters - CHECK(state_dict.count("weight")) << "Should have weight parameter"; - CHECK(state_dict.count("bias")) << "Should have bias parameter"; - CHECK(state_dict.count("lora_A")) << "Should have lora_A parameter"; - CHECK(state_dict.count("lora_B")) << "Should have lora_B parameter"; - std::cout << "State dict contains: weight, bias, lora_A, lora_B" << std::endl; - - // Verify LoRA parameters exist and are trainable - CHECK(state_dict.at("lora_A")->requires_grad()) << "lora_A should be trainable"; - CHECK(state_dict.at("lora_B")->requires_grad()) << "lora_B should be trainable"; - CHECK(!state_dict.at("weight")->requires_grad()) << "weight should be frozen"; - std::cout << "LoRA parameters are trainable, base weight is frozen: OK" << std::endl; - - // Verify shapes - CHECK_EQ(state_dict.at("lora_A")->Dims()[0], lora_config.rank); - CHECK_EQ(state_dict.at("lora_A")->Dims()[1], 64); - CHECK_EQ(state_dict.at("lora_B")->Dims()[0], 128); - CHECK_EQ(state_dict.at("lora_B")->Dims()[1], lora_config.rank); - std::cout << "LoRA parameter shapes: OK" << std::endl; - - std::cout << "LoRAStateDict tests passed!" 
<< std::endl; -} - -// ============================================================================ -// Test 13: GetLoRAModel simplified API -// ============================================================================ -void test_get_lora_model() { - std::cout << "\n=== Test 13: GetLoRAModel Simplified API ===" << std::endl; - - // Test GetLoRAModel with a simple Linear layer - // We'll wrap it with LoRA directly and verify the wrapper works - - // Create base Linear - auto base_linear = std::make_shared(64, 128, /*bias=*/true); - - // Configure LoRA - LoRAConfig config{4, 8.0f, 0.0f, ParseLoRATargetModules("Linear")}; - - // Use GetLoRAModel with the linear as the "model" - // Note: GetLoRAModel returns the modified model (in-place injection) - auto model = GetLoRAModel(base_linear, config); - - CHECK(model != nullptr); - std::cout << "GetLoRAModel returned valid pointer" << std::endl; - - // Test that LoRA was applied - check trainable parameters - auto lora_params = GetLoRAParameters(model); - // GetLoRAParameters returns vector>, size() is the count of tensors - // LoRALinear has 2 trainable tensors: lora_A (rank x in) and lora_B (out x rank) - CHECK_EQ(lora_params.size(), 2); - std::cout << "Trainable parameter tensors: " << lora_params.size() << " (expected: 2)" << std::endl; - - // Also verify total element count - int64_t total_elements = 0; - for (const auto &t : lora_params) { total_elements += t->NumElements(); } - int64_t expected_elements = config.rank * 64 + 128 * config.rank; // 768 - CHECK_EQ(total_elements, expected_elements); - std::cout << "Total trainable elements: " << total_elements << " (expected: " << expected_elements << ")" - << std::endl; - - // Test PrintSummary - std::cout << "\nLoRA Model Summary:" << std::endl; - PrintLoRASummary(model); - - // Test Merge/Unmerge using utility functions - MergeLoRAWeights(model); - // Verify LoRA params frozen after merge - auto *lora_mod = dynamic_cast(model.get()); - CHECK(lora_mod != nullptr); - CHECK(!lora_mod->LoRAParameters()[0]->requires_grad()) << "lora_A should be frozen after merge"; - CHECK(!lora_mod->LoRAParameters()[1]->requires_grad()) << "lora_B should be frozen after merge"; - std::cout << "Merge: OK (LoRA params frozen)" << std::endl; - - UnmergeLoRAWeights(model); - CHECK(lora_mod->LoRAParameters()[0]->requires_grad()) << "lora_A should be trainable after unmerge"; - CHECK(lora_mod->LoRAParameters()[1]->requires_grad()) << "lora_B should be trainable after unmerge"; - std::cout << "Unmerge: OK (LoRA params trainable)" << std::endl; - - std::cout << "GetLoRAModel in-place injection tests passed!" 
<< std::endl; -} - -// ============================================================================ -// Test 14: MergeAndUnload -// ============================================================================ -void test_merge_and_unload() { - std::cout << "\n=== Test 14: MergeAndUnload ===" << std::endl; - - // Create base Linear and apply LoRA - auto base_linear = std::make_shared(64, 128, /*bias=*/true); - LoRAConfig config{4, 8.0f, 0.0f, ParseLoRATargetModules("Linear")}; - auto model = GetLoRAModel(base_linear, config); - - // Verify it's a LoRA module - CHECK(dynamic_cast(model.get()) != nullptr) << "Should be LoRALinear"; - - // Create input and get output before merge_and_unload - auto input = std::make_shared(std::vector{2, 5, 64}, DataType::kFLOAT32); - input->EigenMatrix().setRandom(); - auto output_before = (*model)({input})[0]; - float output_before_sum = output_before->EigenMatrix().sum(); - std::cout << "Output sum before MergeAndUnload: " << output_before_sum << std::endl; - - // MergeAndUnload - auto unloaded_model = MergeAndUnload(model); - CHECK(unloaded_model != nullptr) << "MergeAndUnload should return valid model"; - - // Verify it's no longer a LoRA module - CHECK(dynamic_cast(unloaded_model.get()) == nullptr) << "Should be plain Linear after MergeAndUnload"; - std::cout << "Model is no longer LoRALinear: OK" << std::endl; - - // Verify no LoRA parameters exist (check state dict) - auto state_dict = unloaded_model->StateDict(); - for (const auto &[name, param] : state_dict) { - CHECK(name.find("lora_A") == std::string::npos && name.find("lora_B") == std::string::npos) - << "Should not have LoRA parameters after MergeAndUnload, found: " << name; - } - std::cout << "No LoRA parameters in state dict: OK" << std::endl; - - // Verify forward output matches (merged output should equal unmerged LoRA output) - auto output_after = (*unloaded_model)({input})[0]; - float output_after_sum = output_after->EigenMatrix().sum(); - std::cout << "Output sum after MergeAndUnload: " << output_after_sum << std::endl; - std::cout << "Diff: " << std::abs(output_before_sum - output_after_sum) << std::endl; - CHECK(std::abs(output_before_sum - output_after_sum) < 1e-3) << "Output should match after MergeAndUnload"; - - // Verify all parameters have requires_grad = true (unfrozen) - for (const auto ¶m : unloaded_model->Parameters()) { - CHECK(param->requires_grad()) << "All parameters should be trainable after MergeAndUnload"; - } - std::cout << "All parameters trainable: OK" << std::endl; - - std::cout << "MergeAndUnload tests passed!" 
<< std::endl; -} - -int main(int argc, char **argv) { - google::InitGoogleLogging(argv[0]); - FLAGS_logtostderr = 1; - - // Initialize parallel settings (required for some tensor operations) - // Parameters: nthread_per_process, tensor_parallel_size, sequence_parallel_enabled, - // pipeline_parallel_size, virtual_pipeline_parallel_size - nn::parallel::global::InitAllEnv(1, 1, false, 1, 1); - - std::cout << "========================================" << std::endl; - std::cout << " LoRA Module Unit Tests " << std::endl; - std::cout << "========================================" << std::endl; - - test_lora_config(); - test_lora_linear_init(); - test_lora_linear_forward(); - test_lora_linear_merge(); - test_lora_utils(); - test_lora_from_linear(); - test_lora_model_wrapper(); - test_lora_save_load_weights(); - test_set_target_modules(); - test_should_apply_lora_edge_cases(); - test_replace_module_by_path(); - test_freeze_unfreeze(); - test_lora_state_dict(); - test_get_lora_model(); - test_merge_and_unload(); - - std::cout << "\n========================================" << std::endl; - std::cout << " All LoRA Tests Passed! " << std::endl; - std::cout << "========================================" << std::endl; - - return 0; -} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 00000000..39a44f27 --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,24 @@ +# Tests CMakeLists.txt +# This file manages the test infrastructure for InfiniTrain + +# Include shared test macros (must be before any test subdirectory) +include(${CMAKE_CURRENT_SOURCE_DIR}/common/test_macros.cmake) + +# Common test utilities +add_subdirectory(common) + +# Tensor tests +add_subdirectory(tensor) + +# Optimizer tests +add_subdirectory(optimizer) + +# Autograd operator tests +add_subdirectory(autograd) + +# LoRA tests +add_subdirectory(lora) + +# Hook tests +add_subdirectory(hook) + diff --git a/tests/autograd/CMakeLists.txt b/tests/autograd/CMakeLists.txt new file mode 100644 index 00000000..d321f629 --- /dev/null +++ b/tests/autograd/CMakeLists.txt @@ -0,0 +1,11 @@ +# ============================================================================ +# Autograd tests +# ============================================================================ + +set(AUTOGRAD_TEST_DIR "${CMAKE_CURRENT_SOURCE_DIR}") + +file(GLOB AUTOGRAD_SOURCES ${AUTOGRAD_TEST_DIR}/test_autograd*.cc) + +infini_train_add_test_suite(test_autograd + SOURCES ${AUTOGRAD_SOURCES} +) diff --git a/tests/autograd/test_autograd.cc b/tests/autograd/test_autograd.cc new file mode 100644 index 00000000..6401cc93 --- /dev/null +++ b/tests/autograd/test_autograd.cc @@ -0,0 +1,376 @@ +#include + +#include +#include + +#include "infini_train/include/autograd/activations.h" +#include "infini_train/include/autograd/elementwise.h" +#include "infini_train/include/autograd/function.h" +#include "infini_train/include/autograd/linear.h" +#include "infini_train/include/autograd/matmul.h" +#include "infini_train/include/autograd/misc.h" +#include "infini_train/include/autograd/normalization.h" +#include "infini_train/include/autograd/outer.h" +#include "infini_train/include/autograd/reduction.h" +#include "infini_train/include/autograd/softmax.h" +#include "infini_train/include/autograd/transform.h" +#include "infini_train/include/tensor.h" +#include "test_utils.h" + +using namespace infini_train; + +// ============================================================================ +// Forward / Backward — CPU + CUDA +// 
============================================================================ + +class AutogradForwardTest : public infini_train::test::AutogradTestBaseP {}; +class AutogradBackwardTest : public infini_train::test::AutogradTestBaseP {}; + +TEST_P(AutogradForwardTest, AddForward) { + auto a = createTensor({2, 3}, 1.0f); + auto b = createTensor({2, 3}, 2.0f); + auto result = std::make_shared<autograd::Add>()->Apply({a, b}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{2, 3})); +} + +TEST_P(AutogradForwardTest, SubForward) { + auto a = createTensor({2, 3}, 5.0f); + auto b = createTensor({2, 3}, 3.0f); + auto result = std::make_shared<autograd::Sub>()->Apply({a, b}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, MulForward) { + auto a = createTensor({2, 3}, 2.0f); + auto b = createTensor({2, 3}, 3.0f); + auto result = std::make_shared<autograd::Mul>()->Apply({a, b}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, DivForward) { + auto a = createTensor({2, 3}, 6.0f); + auto b = createTensor({2, 3}, 2.0f); + auto result = std::make_shared<autograd::Div>()->Apply({a, b}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, NegForward) { + auto a = createTensor({2, 3}, 5.0f); + auto result = std::make_shared<autograd::Neg>()->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, SinForward) { + auto a = createTensor({2, 3}, 0.0f); + auto result = std::make_shared<autograd::Sin>()->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, CosForward) { + auto a = createTensor({2, 3}, 0.0f); + auto result = std::make_shared<autograd::Cos>()->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, TanhForward) { + auto a = createTensor({2, 3}, 0.0f); + auto result = std::make_shared<autograd::Tanh>()->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, ExpForward) { + auto a = createTensor({2, 3}, 1.0f); + auto result = std::make_shared<autograd::Exp>()->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, LogForward) { + auto a = createTensor({2, 3}, 2.0f); + auto result = std::make_shared<autograd::Log>()->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, ReciprocalForward) { + auto a = createTensor({2, 3}, 2.0f); + auto result = std::make_shared<autograd::Reciprocal>()->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, PowForward) { + auto a = createTensor({2, 3}, 2.0f); + auto result = std::make_shared<autograd::Pow>(2.0f)->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, RsqrtForward) { + auto a = createTensor({2, 3}, 4.0f); + auto result = std::make_shared<autograd::Rsqrt>()->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, SigmoidForward) { + auto a = createTensor({2, 3}, 0.0f); + auto result = std::make_shared<autograd::Sigmoid>()->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, MatmulForward) { + auto a = createTensor({2, 3}, 1.0f); + auto b = createTensor({3, 4}, 1.0f); + auto result = std::make_shared<autograd::Matmul>()->Apply({a, b}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{2, 4})); +} + +TEST_P(AutogradForwardTest, SumForward) { + auto a = createTensor({2, 3}, 1.0f); + auto result = std::make_shared<autograd::Sum>(1, false)->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, MeanForward) { + auto a = createTensor({2, 3}, 1.0f); + auto result = std::make_shared<autograd::Mean>(1, false)->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, MaxForward) { + auto a = createTensor({2, 3}, 1.0f); + auto result = std::make_shared<autograd::Max>(1, false)->Apply({a}); + 
EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, MinForward) { + auto a = createTensor({2, 3}, 1.0f); + auto result = std::make_shared<autograd::Min>(1, false)->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, SoftmaxForward) { + auto a = createTensor({2, 3}, 1.0f); + auto result = std::make_shared<autograd::Softmax>(1)->Apply({a}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{2, 3})); +} + +TEST_P(AutogradForwardTest, LayerNormForward) { + auto a = createTensor({2, 3, 4}, 1.0f); + auto weight = createTensor({4}, 1.0f); + auto bias = createTensor({4}, 0.0f); + auto result = std::make_shared<autograd::LayerNorm>(1e-5f)->Apply({a, weight, bias}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, LinearForward) { + auto input = createTensor({2, 3}, 1.0f); + auto weight = createTensor({4, 3}, 1.0f); + auto bias = createTensor({4}, 0.0f); + auto result = std::make_shared<autograd::Linear>()->Apply({input, weight, bias}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{2, 4})); +} + +TEST_P(AutogradForwardTest, TransposeForward) { + auto a = createTensor({2, 3}, 1.0f); + auto result = std::make_shared<autograd::Transpose>(0, 1)->Apply({a}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{3, 2})); +} + +TEST_P(AutogradForwardTest, SliceForward) { + auto a = createTensor({4, 4}, 1.0f); + auto result = std::make_shared<autograd::Slice>(std::vector<int64_t>{1, 1}, std::vector<int64_t>{3, 3}, + std::vector<int64_t>{1, 1}) + ->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, SplitForward) { + auto a = createTensor({4, 4}, 1.0f); + auto result = std::make_shared<autograd::Split>(2, 0)->Apply({a}); + EXPECT_EQ(result.size(), 2); +} + +TEST_P(AutogradForwardTest, ConcatForward) { + auto a = createTensor({2, 2}, 1.0f); + auto b = createTensor({2, 2}, 2.0f); + auto result = std::make_shared<autograd::Concat>(0)->Apply({a, b}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{4, 2})); +} + +TEST_P(AutogradForwardTest, StackForward) { + auto a = createTensor({2, 3}, 1.0f); + auto b = createTensor({2, 3}, 2.0f); + auto result = std::make_shared<autograd::Stack>(0)->Apply({a, b}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{2, 2, 3})); +} + +TEST_P(AutogradForwardTest, TrilForward) { + auto a = createTensor({3, 3}, 1.0f); + auto result = std::make_shared<autograd::Tril>(0)->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, TriuForward) { + auto a = createTensor({3, 3}, 1.0f); + auto result = std::make_shared<autograd::Triu>(0)->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, OuterForward) { + auto a = createTensor({3}, 1.0f); + auto b = createTensor({4}, 1.0f); + auto result = std::make_shared<autograd::Outer>()->Apply({a, b}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{3, 4})); +} + +TEST_P(AutogradForwardTest, AddScalarForward) { + auto a = createTensor({2, 3}, 1.0f); + auto result = std::make_shared<autograd::AddScalar>(2.0f)->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, MulScalarForward) { + auto a = createTensor({2, 3}, 2.0f); + auto result = std::make_shared<autograd::MulScalar>(3.0f)->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, LtForward) { + auto a = createTensor({2, 3}, 5.0f); + auto b = createTensor({2, 3}, 3.0f); + auto result = std::make_shared<autograd::Lt>()->Apply({a, b}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, LeForward) { + auto a = createTensor({2, 3}, 3.0f); + auto b = createTensor({2, 3}, 3.0f); + auto result = std::make_shared<autograd::Le>()->Apply({a, b}); + EXPECT_EQ(result.size(), 1); +} + 
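+// The remaining comparison ops (Gt/Ge/Equals) and logical ops (And/Or) below are +// exercised the same way as Lt/Le above: apply the function to two tensors of +// identical shape and assert that exactly one output tensor comes back; element +// values are deliberately not checked in this smoke suite. + 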
+TEST_P(AutogradForwardTest, GtForward) { + auto a = createTensor({2, 3}, 5.0f); + auto b = createTensor({2, 3}, 3.0f); + auto result = std::make_shared<autograd::Gt>()->Apply({a, b}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, GeForward) { + auto a = createTensor({2, 3}, 3.0f); + auto b = createTensor({2, 3}, 3.0f); + auto result = std::make_shared<autograd::Ge>()->Apply({a, b}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, EqualsForward) { + auto a = createTensor({2, 3}, 3.0f); + auto b = createTensor({2, 3}, 3.0f); + auto result = std::make_shared<autograd::Equals>()->Apply({a, b}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, AndForward) { + auto a = createTensor({2, 3}, 1.0f); + auto b = createTensor({2, 3}, 1.0f); + auto result = std::make_shared<autograd::And>()->Apply({a, b}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, OrForward) { + auto a = createTensor({2, 3}, 0.0f); + auto b = createTensor({2, 3}, 1.0f); + auto result = std::make_shared<autograd::Or>()->Apply({a, b}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradForwardTest, NoOpForward) { + auto a = createTensor({2, 3}, 1.0f); + auto result = std::make_shared<autograd::NoOp>(std::vector<int64_t>{2, 3})->Apply({a}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{2, 3})); +} + +TEST_P(AutogradBackwardTest, AddBackward) { + auto a = createTensor({2, 3}, 1.0f); + auto b = createTensor({2, 3}, 2.0f); + auto add_fn = std::make_shared<autograd::Add>(); + auto result = add_fn->Apply({a, b}); + auto grad = createTensor({2, 3}, 1.0f); + auto grad_inputs = add_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 2); +} + +TEST_P(AutogradBackwardTest, MulBackward) { + auto a = createTensor({2, 3}, 2.0f); + auto b = createTensor({2, 3}, 3.0f); + auto mul_fn = std::make_shared<autograd::Mul>(); + auto result = mul_fn->Apply({a, b}); + auto grad = createTensor({2, 3}, 1.0f); + auto grad_inputs = mul_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 2); +} + +INFINI_TRAIN_REGISTER_TEST(AutogradForwardTest); + +INFINI_TRAIN_REGISTER_TEST(AutogradBackwardTest); + +// ============================================================================ +// Distributed — requires NCCL + >=2 GPUs +// ============================================================================ + +class AutogradDistributedTest : public infini_train::test::DistributedInfiniTrainTestP {}; + +TEST_P(AutogradDistributedTest, AllReduce) { + auto a = std::make_shared<Tensor>(std::vector<int64_t>{2, 3}, DataType::kFLOAT32, GetDevice()); + a->set_requires_grad(true); + infini_train::test::FillConstantTensor(a, 1.0f); + EXPECT_TRUE(a->GetDevice().IsCUDA()); + EXPECT_TRUE(a->requires_grad()); +} + +TEST_P(AutogradDistributedTest, AllGather) { + auto a = std::make_shared<Tensor>(std::vector<int64_t>{4, 4}, DataType::kFLOAT32, GetDevice()); + a->set_requires_grad(true); + infini_train::test::FillConstantTensor(a, 1.0f); + EXPECT_TRUE(a->GetDevice().IsCUDA()); + EXPECT_EQ(a->Dims(), (std::vector<int64_t>{4, 4})); +} + +TEST_P(AutogradDistributedTest, ReduceScatter) { + auto a = std::make_shared<Tensor>(std::vector<int64_t>{2, 8}, DataType::kFLOAT32, GetDevice()); + a->set_requires_grad(true); + infini_train::test::FillConstantTensor(a, 1.0f); + EXPECT_TRUE(a->GetDevice().IsCUDA()); + EXPECT_EQ(a->Dims(), (std::vector<int64_t>{2, 8})); +} + +TEST_P(AutogradDistributedTest, DistributedMatmul) { + auto a = std::make_shared<Tensor>(std::vector<int64_t>{2, 4}, DataType::kFLOAT32, GetDevice()); + a->set_requires_grad(true); + auto b = std::make_shared<Tensor>(std::vector<int64_t>{4, 2}, DataType::kFLOAT32, GetDevice()); + b->set_requires_grad(true); + auto result = std::make_shared<autograd::Matmul>()->Apply({a, b}); 
+ EXPECT_EQ(result.size(), 1); + EXPECT_TRUE(result[0]->GetDevice().IsCUDA()); +} + +TEST_P(AutogradDistributedTest, DistributedLinear) { + auto input = std::make_shared<Tensor>(std::vector<int64_t>{2, 3}, DataType::kFLOAT32, GetDevice()); + input->set_requires_grad(true); + auto weight = std::make_shared<Tensor>(std::vector<int64_t>{4, 3}, DataType::kFLOAT32, GetDevice()); + weight->set_requires_grad(true); + auto bias = std::make_shared<Tensor>(std::vector<int64_t>{4}, DataType::kFLOAT32, GetDevice()); + bias->set_requires_grad(true); + auto result = std::make_shared<autograd::Linear>()->Apply({input, weight, bias}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{2, 4})); + EXPECT_TRUE(result[0]->GetDevice().IsCUDA()); +} + +INFINI_TRAIN_REGISTER_TEST_DISTRIBUTED(AutogradDistributedTest); diff --git a/tests/autograd/test_autograd_elementwise_backward.cc b/tests/autograd/test_autograd_elementwise_backward.cc new file mode 100644 index 00000000..65ffd5a8 --- /dev/null +++ b/tests/autograd/test_autograd_elementwise_backward.cc @@ -0,0 +1,136 @@ +#include <gtest/gtest.h> + +#include <memory> +#include <vector> + +#include "infini_train/include/tensor.h" +#include "infini_train/include/nn/parallel/global.h" +#include "infini_train/include/autograd/elementwise.h" +#include "test_utils.h" + +using namespace infini_train; + +class AutogradElementwiseBackwardTest : public infini_train::test::AutogradTestBaseP {}; + +TEST_P(AutogradElementwiseBackwardTest, AddBackward) { + auto a = createTensor({2, 3}, 1.0f); + auto b = createTensor({2, 3}, 2.0f); + auto add_fn = std::make_shared<autograd::Add>(); + auto result = add_fn->Apply({a, b}); + auto grad = createTensor({2, 3}, 1.0f); + auto grad_inputs = add_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 2); +} + +TEST_P(AutogradElementwiseBackwardTest, SubBackward) { + auto a = createTensor({2, 3}, 5.0f); + auto b = createTensor({2, 3}, 3.0f); + auto sub_fn = std::make_shared<autograd::Sub>(); + auto result = sub_fn->Apply({a, b}); + auto grad = createTensor({2, 3}, 1.0f); + auto grad_inputs = sub_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 2); +} + +TEST_P(AutogradElementwiseBackwardTest, MulBackward) { + auto a = createTensor({2, 3}, 2.0f); + auto b = createTensor({2, 3}, 3.0f); + auto mul_fn = std::make_shared<autograd::Mul>(); + auto result = mul_fn->Apply({a, b}); + auto grad = createTensor({2, 3}, 1.0f); + auto grad_inputs = mul_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 2); +} + +TEST_P(AutogradElementwiseBackwardTest, DivBackward) { + auto a = createTensor({2, 3}, 6.0f); + auto b = createTensor({2, 3}, 2.0f); + auto div_fn = std::make_shared<autograd::Div>(); + auto result = div_fn->Apply({a, b}); + auto grad = createTensor({2, 3}, 1.0f); + auto grad_inputs = div_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 2); +} + +TEST_P(AutogradElementwiseBackwardTest, NegBackward) { + auto a = createTensor({2, 3}, 5.0f); + auto neg_fn = std::make_shared<autograd::Neg>(); + auto result = neg_fn->Apply({a}); + auto grad = createTensor({2, 3}, 1.0f); + auto grad_inputs = neg_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 1); +} + +TEST_P(AutogradElementwiseBackwardTest, SinBackward) { + auto a = createTensor({2, 3}, 0.0f); + auto sin_fn = std::make_shared<autograd::Sin>(); + auto result = sin_fn->Apply({a}); + auto grad = createTensor({2, 3}, 1.0f); + auto grad_inputs = sin_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 1); +} + +TEST_P(AutogradElementwiseBackwardTest, CosBackward) { + auto a = createTensor({2, 3}, 0.0f); + auto cos_fn = std::make_shared<autograd::Cos>(); + auto result = cos_fn->Apply({a}); + auto grad = createTensor({2, 3}, 1.0f); + auto grad_inputs = 
cos_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 1); +} + +TEST_P(AutogradElementwiseBackwardTest, TanhBackward) { + auto a = createTensor({2, 3}, 0.0f); + auto tanh_fn = std::make_shared<autograd::Tanh>(); + auto result = tanh_fn->Apply({a}); + auto grad = createTensor({2, 3}, 1.0f); + auto grad_inputs = tanh_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 1); +} + +TEST_P(AutogradElementwiseBackwardTest, ExpBackward) { + auto a = createTensor({2, 3}, 1.0f); + auto exp_fn = std::make_shared<autograd::Exp>(); + auto result = exp_fn->Apply({a}); + auto grad = createTensor({2, 3}, 1.0f); + auto grad_inputs = exp_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 1); +} + +TEST_P(AutogradElementwiseBackwardTest, LogBackward) { + auto a = createTensor({2, 3}, 2.0f); + auto log_fn = std::make_shared<autograd::Log>(); + auto result = log_fn->Apply({a}); + auto grad = createTensor({2, 3}, 1.0f); + auto grad_inputs = log_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 1); +} + +TEST_P(AutogradElementwiseBackwardTest, ReciprocalBackward) { + auto a = createTensor({2, 3}, 2.0f); + auto reciprocal_fn = std::make_shared<autograd::Reciprocal>(); + auto result = reciprocal_fn->Apply({a}); + auto grad = createTensor({2, 3}, 1.0f); + auto grad_inputs = reciprocal_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 1); +} + +TEST_P(AutogradElementwiseBackwardTest, PowBackward) { + auto a = createTensor({2, 3}, 2.0f); + auto pow_fn = std::make_shared<autograd::Pow>(2.0f); + auto result = pow_fn->Apply({a}); + auto grad = createTensor({2, 3}, 1.0f); + auto grad_inputs = pow_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 1); +} + +TEST_P(AutogradElementwiseBackwardTest, RsqrtBackward) { + auto a = createTensor({2, 3}, 4.0f); + auto rsqrt_fn = std::make_shared<autograd::Rsqrt>(); + auto result = rsqrt_fn->Apply({a}); + auto grad = createTensor({2, 3}, 1.0f); + auto grad_inputs = rsqrt_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 1); +} + +INFINI_TRAIN_REGISTER_TEST(AutogradElementwiseBackwardTest); diff --git a/tests/autograd/test_autograd_elementwise_forward.cc b/tests/autograd/test_autograd_elementwise_forward.cc new file mode 100644 index 00000000..20fe658e --- /dev/null +++ b/tests/autograd/test_autograd_elementwise_forward.cc @@ -0,0 +1,189 @@ +#include <gtest/gtest.h> + +#include <memory> +#include <vector> + +#include "infini_train/include/tensor.h" +#include "infini_train/include/nn/parallel/global.h" +#include "infini_train/include/autograd/elementwise.h" +#include "infini_train/include/autograd/activations.h" +#include "test_utils.h" + +using namespace infini_train; + +class AutogradElementwiseForwardTest : public infini_train::test::AutogradTestBaseP {}; + +TEST_P(AutogradElementwiseForwardTest, AddForward) { + auto a = createTensor({2, 3}, 1.0f); + auto b = createTensor({2, 3}, 2.0f); + auto add_fn = std::make_shared<autograd::Add>(); + auto result = add_fn->Apply({a, b}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{2, 3})); +} + +TEST_P(AutogradElementwiseForwardTest, SubForward) { + auto a = createTensor({2, 3}, 5.0f); + auto b = createTensor({2, 3}, 3.0f); + auto sub_fn = std::make_shared<autograd::Sub>(); + auto result = sub_fn->Apply({a, b}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradElementwiseForwardTest, MulForward) { + auto a = createTensor({2, 3}, 2.0f); + auto b = createTensor({2, 3}, 3.0f); + auto mul_fn = std::make_shared<autograd::Mul>(); + auto result = mul_fn->Apply({a, b}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradElementwiseForwardTest, DivForward) { + auto a = createTensor({2, 3}, 6.0f); + auto b = createTensor({2, 3}, 2.0f); + auto div_fn = std::make_shared<autograd::Div>(); + 
auto result = div_fn->Apply({a, b}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradElementwiseForwardTest, NegForward) { + auto a = createTensor({2, 3}, 5.0f); + auto neg_fn = std::make_shared<autograd::Neg>(); + auto result = neg_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradElementwiseForwardTest, SinForward) { + auto a = createTensor({2, 3}, 0.0f); + auto sin_fn = std::make_shared<autograd::Sin>(); + auto result = sin_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradElementwiseForwardTest, CosForward) { + auto a = createTensor({2, 3}, 0.0f); + auto cos_fn = std::make_shared<autograd::Cos>(); + auto result = cos_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradElementwiseForwardTest, TanhForward) { + auto a = createTensor({2, 3}, 0.0f); + auto tanh_fn = std::make_shared<autograd::Tanh>(); + auto result = tanh_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradElementwiseForwardTest, ExpForward) { + auto a = createTensor({2, 3}, 1.0f); + auto exp_fn = std::make_shared<autograd::Exp>(); + auto result = exp_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradElementwiseForwardTest, LogForward) { + auto a = createTensor({2, 3}, 2.0f); + auto log_fn = std::make_shared<autograd::Log>(); + auto result = log_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradElementwiseForwardTest, ReciprocalForward) { + auto a = createTensor({2, 3}, 2.0f); + auto reciprocal_fn = std::make_shared<autograd::Reciprocal>(); + auto result = reciprocal_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradElementwiseForwardTest, PowForward) { + auto a = createTensor({2, 3}, 2.0f); + auto pow_fn = std::make_shared<autograd::Pow>(2.0f); + auto result = pow_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradElementwiseForwardTest, RsqrtForward) { + auto a = createTensor({2, 3}, 4.0f); + auto rsqrt_fn = std::make_shared<autograd::Rsqrt>(); + auto result = rsqrt_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradElementwiseForwardTest, SigmoidForward) { + auto a = createTensor({2, 3}, 0.0f); + auto sigmoid_fn = std::make_shared<autograd::Sigmoid>(); + auto result = sigmoid_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradElementwiseForwardTest, AddScalarForward) { + auto a = createTensor({2, 3}, 1.0f); + auto add_scalar_fn = std::make_shared<autograd::AddScalar>(2.0f); + auto result = add_scalar_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradElementwiseForwardTest, MulScalarForward) { + auto a = createTensor({2, 3}, 2.0f); + auto mul_scalar_fn = std::make_shared<autograd::MulScalar>(3.0f); + auto result = mul_scalar_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradElementwiseForwardTest, LtForward) { + auto a = createTensor({2, 3}, 5.0f); + auto b = createTensor({2, 3}, 3.0f); + auto lt_fn = std::make_shared<autograd::Lt>(); + auto result = lt_fn->Apply({a, b}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradElementwiseForwardTest, LeForward) { + auto a = createTensor({2, 3}, 3.0f); + auto b = createTensor({2, 3}, 3.0f); + auto le_fn = std::make_shared<autograd::Le>(); + auto result = le_fn->Apply({a, b}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradElementwiseForwardTest, GtForward) { + auto a = createTensor({2, 3}, 5.0f); + auto b = createTensor({2, 3}, 3.0f); + auto gt_fn = std::make_shared<autograd::Gt>(); + auto result = gt_fn->Apply({a, b}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradElementwiseForwardTest, GeForward) { + auto a = createTensor({2, 3}, 3.0f); + auto b = createTensor({2, 3}, 3.0f); + auto ge_fn = std::make_shared<autograd::Ge>(); + auto result = ge_fn->Apply({a, b}); + EXPECT_EQ(result.size(), 1); +} + 
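+// The equality and logical ops below follow the same recipe as the ordering +// comparisons above: two same-shaped operands in, a single output tensor out. + 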
+TEST_P(AutogradElementwiseForwardTest, EqualsForward) { + auto a = createTensor({2, 3}, 3.0f); + auto b = createTensor({2, 3}, 3.0f); + auto eq_fn = std::make_shared<autograd::Equals>(); + auto result = eq_fn->Apply({a, b}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradElementwiseForwardTest, AndForward) { + auto a = createTensor({2, 3}, 1.0f); + auto b = createTensor({2, 3}, 1.0f); + auto and_fn = std::make_shared<autograd::And>(); + auto result = and_fn->Apply({a, b}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradElementwiseForwardTest, OrForward) { + auto a = createTensor({2, 3}, 0.0f); + auto b = createTensor({2, 3}, 1.0f); + auto or_fn = std::make_shared<autograd::Or>(); + auto result = or_fn->Apply({a, b}); + EXPECT_EQ(result.size(), 1); +} + +INFINI_TRAIN_REGISTER_TEST(AutogradElementwiseForwardTest); diff --git a/tests/autograd/test_autograd_linear_backward.cc b/tests/autograd/test_autograd_linear_backward.cc new file mode 100644 index 00000000..c62920b4 --- /dev/null +++ b/tests/autograd/test_autograd_linear_backward.cc @@ -0,0 +1,35 @@ +#include <gtest/gtest.h> + +#include <memory> + +#include "infini_train/include/tensor.h" +#include "infini_train/include/nn/parallel/global.h" +#include "infini_train/include/autograd/linear.h" +#include "test_utils.h" + +using namespace infini_train; + +class AutogradLinearBackwardTest : public infini_train::test::AutogradTestBaseP {}; + +TEST_P(AutogradLinearBackwardTest, LinearBackward) { + auto input = createTensor({2, 3}, 1.0f); + auto weight = createTensor({4, 3}, 1.0f); + auto bias = createTensor({4}, 0.0f); + auto linear_fn = std::make_shared<autograd::Linear>(); + auto result = linear_fn->Apply({input, weight, bias}); + auto grad = createTensor({2, 4}, 1.0f); + auto grad_inputs = linear_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 3); +} + +TEST_P(AutogradLinearBackwardTest, LinearBackwardNoBias) { + auto input = createTensor({2, 3}, 1.0f); + auto weight = createTensor({4, 3}, 1.0f); + auto linear_fn = std::make_shared<autograd::Linear>(); + auto result = linear_fn->Apply({input, weight}); + auto grad = createTensor({2, 4}, 1.0f); + auto grad_inputs = linear_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 2); +} + +INFINI_TRAIN_REGISTER_TEST(AutogradLinearBackwardTest); diff --git a/tests/autograd/test_autograd_linear_forward.cc b/tests/autograd/test_autograd_linear_forward.cc new file mode 100644 index 00000000..5fb41546 --- /dev/null +++ b/tests/autograd/test_autograd_linear_forward.cc @@ -0,0 +1,43 @@ +#include <gtest/gtest.h> + +#include <memory> + +#include "infini_train/include/tensor.h" +#include "infini_train/include/nn/parallel/global.h" +#include "infini_train/include/autograd/linear.h" +#include "test_utils.h" + +using namespace infini_train; + +class AutogradLinearForwardTest : public infini_train::test::AutogradTestBaseP {}; + +TEST_P(AutogradLinearForwardTest, LinearForward) { + auto input = createTensor({2, 3}, 1.0f); + auto weight = createTensor({4, 3}, 1.0f); + auto bias = createTensor({4}, 0.0f); + auto linear_fn = std::make_shared<autograd::Linear>(); + auto result = linear_fn->Apply({input, weight, bias}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{2, 4})); +} + +TEST_P(AutogradLinearForwardTest, LinearNoBias) { + auto input = createTensor({2, 3}, 1.0f); + auto weight = createTensor({4, 3}, 1.0f); + auto linear_fn = std::make_shared<autograd::Linear>(); + auto result = linear_fn->Apply({input, weight}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{2, 4})); +} + +TEST_P(AutogradLinearForwardTest, LinearBatch) { + auto input = createTensor({32, 128}, 1.0f); + auto weight = createTensor({64, 
128}, 1.0f); + auto bias = createTensor({64}, 0.0f); + auto linear_fn = std::make_shared<autograd::Linear>(); + auto result = linear_fn->Apply({input, weight, bias}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{32, 64})); +} + +INFINI_TRAIN_REGISTER_TEST(AutogradLinearForwardTest); diff --git a/tests/autograd/test_autograd_matmul_backward.cc b/tests/autograd/test_autograd_matmul_backward.cc new file mode 100644 index 00000000..aeb26a55 --- /dev/null +++ b/tests/autograd/test_autograd_matmul_backward.cc @@ -0,0 +1,44 @@ +#include <gtest/gtest.h> + +#include <memory> + +#include "infini_train/include/tensor.h" +#include "infini_train/include/nn/parallel/global.h" +#include "infini_train/include/autograd/matmul.h" +#include "test_utils.h" + +using namespace infini_train; + +class AutogradMatmulBackwardTest : public infini_train::test::AutogradTestBaseP {}; + +TEST_P(AutogradMatmulBackwardTest, MatmulBackward) { + auto a = createTensor({2, 3}, 1.0f); + auto b = createTensor({3, 4}, 1.0f); + auto matmul_fn = std::make_shared<autograd::Matmul>(); + auto result = matmul_fn->Apply({a, b}); + auto grad = createTensor({2, 4}, 1.0f); + auto grad_inputs = matmul_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 2); +} + +TEST_P(AutogradMatmulBackwardTest, MatmulBackwardSquare) { + auto a = createTensor({3, 3}, 2.0f); + auto b = createTensor({3, 3}, 3.0f); + auto matmul_fn = std::make_shared<autograd::Matmul>(); + auto result = matmul_fn->Apply({a, b}); + auto grad = createTensor({3, 3}, 1.0f); + auto grad_inputs = matmul_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 2); +} + +TEST_P(AutogradMatmulBackwardTest, MatmulBackwardDifferentShapes) { + auto a = createTensor({3, 4}, 1.5f); + auto b = createTensor({4, 2}, 2.5f); + auto matmul_fn = std::make_shared<autograd::Matmul>(); + auto result = matmul_fn->Apply({a, b}); + auto grad = createTensor({3, 2}, 1.0f); + auto grad_inputs = matmul_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 2); +} + +INFINI_TRAIN_REGISTER_TEST(AutogradMatmulBackwardTest); diff --git a/tests/autograd/test_autograd_matmul_forward.cc b/tests/autograd/test_autograd_matmul_forward.cc new file mode 100644 index 00000000..8e325c03 --- /dev/null +++ b/tests/autograd/test_autograd_matmul_forward.cc @@ -0,0 +1,50 @@ +#include <gtest/gtest.h> + +#include <memory> + +#include "infini_train/include/tensor.h" +#include "infini_train/include/nn/parallel/global.h" +#include "infini_train/include/autograd/matmul.h" +#include "test_utils.h" + +using namespace infini_train; + +class AutogradMatmulForwardTest : public infini_train::test::AutogradTestBaseP {}; + +TEST_P(AutogradMatmulForwardTest, MatmulForward) { + auto a = createTensor({2, 3}, 1.0f); + auto b = createTensor({3, 4}, 1.0f); + auto matmul_fn = std::make_shared<autograd::Matmul>(); + auto result = matmul_fn->Apply({a, b}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{2, 4})); +} + +TEST_P(AutogradMatmulForwardTest, MatmulDifferentShapes) { + auto a = createTensor({3, 4}, 1.0f); + auto b = createTensor({4, 2}, 1.0f); + auto matmul_fn = std::make_shared<autograd::Matmul>(); + auto result = matmul_fn->Apply({a, b}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{3, 2})); +} + +TEST_P(AutogradMatmulForwardTest, MatmulBatch) { + auto a = createTensor({2, 3, 4}, 1.0f); + auto b = createTensor({2, 4, 5}, 1.0f); + auto matmul_fn = std::make_shared<autograd::Matmul>(); + auto result = matmul_fn->Apply({a, b}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{2, 3, 5})); +} + +TEST_P(AutogradMatmulForwardTest, MatmulSquare) { + auto a = createTensor({3, 3}, 1.0f); + auto b = 
createTensor({3, 3}, 1.0f); + auto matmul_fn = std::make_shared<autograd::Matmul>(); + auto result = matmul_fn->Apply({a, b}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{3, 3})); +} + +INFINI_TRAIN_REGISTER_TEST(AutogradMatmulForwardTest); diff --git a/tests/autograd/test_autograd_normalization_backward.cc b/tests/autograd/test_autograd_normalization_backward.cc new file mode 100644 index 00000000..b477cf47 --- /dev/null +++ b/tests/autograd/test_autograd_normalization_backward.cc @@ -0,0 +1,36 @@ +#include <gtest/gtest.h> + +#include <memory> + +#include "infini_train/include/tensor.h" +#include "infini_train/include/nn/parallel/global.h" +#include "infini_train/include/autograd/normalization.h" +#include "test_utils.h" + +using namespace infini_train; + +class AutogradNormalizationBackwardTest : public infini_train::test::AutogradTestBaseP {}; + +TEST_P(AutogradNormalizationBackwardTest, LayerNormBackward) { + auto a = createTensor({2, 3, 4}, 1.0f); + auto weight = createTensor({4}, 1.0f); + auto bias = createTensor({4}, 0.0f); + auto layernorm_fn = std::make_shared<autograd::LayerNorm>(1e-5f); + auto result = layernorm_fn->Apply({a, weight, bias}); + auto grad = createTensor({2, 3, 4}, 1.0f); + auto grad_inputs = layernorm_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 3); +} + +TEST_P(AutogradNormalizationBackwardTest, LayerNormBackwardZeroBias) { + auto a = createTensor({2, 3, 4}, 1.0f); + auto weight = createTensor({4}, 1.0f); + auto bias = createTensor({4}, 0.0f); + auto layernorm_fn = std::make_shared<autograd::LayerNorm>(1e-5f); + auto result = layernorm_fn->Apply({a, weight, bias}); + auto grad = createTensor({2, 3, 4}, 1.0f); + auto grad_inputs = layernorm_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 3); +} + +INFINI_TRAIN_REGISTER_TEST(AutogradNormalizationBackwardTest); diff --git a/tests/autograd/test_autograd_normalization_forward.cc b/tests/autograd/test_autograd_normalization_forward.cc new file mode 100644 index 00000000..4ed92296 --- /dev/null +++ b/tests/autograd/test_autograd_normalization_forward.cc @@ -0,0 +1,42 @@ +#include <gtest/gtest.h> + +#include <memory> + +#include "infini_train/include/tensor.h" +#include "infini_train/include/nn/parallel/global.h" +#include "infini_train/include/autograd/normalization.h" +#include "test_utils.h" + +using namespace infini_train; + +class AutogradNormalizationForwardTest : public infini_train::test::AutogradTestBaseP {}; + +TEST_P(AutogradNormalizationForwardTest, LayerNormForward) { + auto a = createTensor({2, 3, 4}, 1.0f); + auto weight = createTensor({4}, 1.0f); + auto bias = createTensor({4}, 0.0f); + auto layernorm_fn = std::make_shared<autograd::LayerNorm>(1e-5f); + auto result = layernorm_fn->Apply({a, weight, bias}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradNormalizationForwardTest, LayerNormZeroBias) { + auto a = createTensor({2, 3, 4}, 1.0f); + auto weight = createTensor({4}, 1.0f); + auto bias = createTensor({4}, 0.0f); + auto layernorm_fn = std::make_shared<autograd::LayerNorm>(1e-5f); + auto result = layernorm_fn->Apply({a, weight, bias}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradNormalizationForwardTest, LayerNormThreeDim) { + auto a = createTensor({2, 1, 4}, 1.0f); + auto weight = createTensor({4}, 1.0f); + auto bias = createTensor({4}, 0.0f); + auto layernorm_fn = std::make_shared<autograd::LayerNorm>(1e-5f); + auto result = layernorm_fn->Apply({a, weight, bias}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{2, 1, 4})); +} + +INFINI_TRAIN_REGISTER_TEST(AutogradNormalizationForwardTest); diff --git a/tests/autograd/test_autograd_reduction_backward.cc 
b/tests/autograd/test_autograd_reduction_backward.cc new file mode 100644 index 00000000..9834ffd6 --- /dev/null +++ b/tests/autograd/test_autograd_reduction_backward.cc @@ -0,0 +1,68 @@ +#include <gtest/gtest.h> + +#include <memory> + +#include "infini_train/include/tensor.h" +#include "infini_train/include/nn/parallel/global.h" +#include "infini_train/include/autograd/reduction.h" +#include "test_utils.h" + +using namespace infini_train; + +class AutogradReductionBackwardTest : public infini_train::test::AutogradTestBaseP {}; + +TEST_P(AutogradReductionBackwardTest, SumBackward) { + auto a = createTensor({2, 3}, 1.0f); + auto sum_fn = std::make_shared<autograd::Sum>(1, false); + auto result = sum_fn->Apply({a}); + auto grad = createTensor({2}, 1.0f); + auto grad_inputs = sum_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 1); +} + +TEST_P(AutogradReductionBackwardTest, MeanBackward) { + auto a = createTensor({2, 3}, 1.0f); + auto mean_fn = std::make_shared<autograd::Mean>(1, false); + auto result = mean_fn->Apply({a}); + auto grad = createTensor({2}, 1.0f); + auto grad_inputs = mean_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 1); +} + +TEST_P(AutogradReductionBackwardTest, MaxBackward) { + auto a = createTensor({2, 3}, 1.0f); + auto max_fn = std::make_shared<autograd::Max>(1, false); + auto result = max_fn->Apply({a}); + auto grad = createTensor({2}, 1.0f); + auto grad_inputs = max_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 1); +} + +TEST_P(AutogradReductionBackwardTest, MinBackward) { + auto a = createTensor({2, 3}, 1.0f); + auto min_fn = std::make_shared<autograd::Min>(1, false); + auto result = min_fn->Apply({a}); + auto grad = createTensor({2}, 1.0f); + auto grad_inputs = min_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 1); +} + +TEST_P(AutogradReductionBackwardTest, SumBackwardKeepDim) { + auto a = createTensor({2, 3}, 1.0f); + auto sum_fn = std::make_shared<autograd::Sum>(1, true); + auto result = sum_fn->Apply({a}); + auto grad = createTensor({2, 1}, 1.0f); + auto grad_inputs = sum_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 1); +} + +TEST_P(AutogradReductionBackwardTest, MeanBackwardKeepDim) { + auto a = createTensor({2, 3}, 1.0f); + auto mean_fn = std::make_shared<autograd::Mean>(1, true); + auto result = mean_fn->Apply({a}); + auto grad = createTensor({2, 1}, 1.0f); + auto grad_inputs = mean_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 1); +} + +INFINI_TRAIN_REGISTER_TEST(AutogradReductionBackwardTest); diff --git a/tests/autograd/test_autograd_reduction_forward.cc b/tests/autograd/test_autograd_reduction_forward.cc new file mode 100644 index 00000000..4d8ed2e4 --- /dev/null +++ b/tests/autograd/test_autograd_reduction_forward.cc @@ -0,0 +1,56 @@ +#include <gtest/gtest.h> + +#include <memory> + +#include "infini_train/include/tensor.h" +#include "infini_train/include/nn/parallel/global.h" +#include "infini_train/include/autograd/reduction.h" +#include "test_utils.h" + +using namespace infini_train; + +class AutogradReductionForwardTest : public infini_train::test::AutogradTestBaseP {}; + +TEST_P(AutogradReductionForwardTest, SumForward) { + auto a = createTensor({2, 3}, 1.0f); + auto sum_fn = std::make_shared<autograd::Sum>(1, false); + auto result = sum_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradReductionForwardTest, MeanForward) { + auto a = createTensor({2, 3}, 1.0f); + auto mean_fn = std::make_shared<autograd::Mean>(1, false); + auto result = mean_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradReductionForwardTest, MaxForward) { + auto a = createTensor({2, 3}, 1.0f); + auto max_fn = std::make_shared<autograd::Max>(1, false); + auto result = 
max_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradReductionForwardTest, MinForward) { + auto a = createTensor({2, 3}, 1.0f); + auto min_fn = std::make_shared<autograd::Min>(1, false); + auto result = min_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradReductionForwardTest, SumKeepDim) { + auto a = createTensor({2, 3}, 1.0f); + auto sum_fn = std::make_shared<autograd::Sum>(1, true); + auto result = sum_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradReductionForwardTest, MeanKeepDim) { + auto a = createTensor({2, 3}, 1.0f); + auto mean_fn = std::make_shared<autograd::Mean>(1, true); + auto result = mean_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +INFINI_TRAIN_REGISTER_TEST(AutogradReductionForwardTest); diff --git a/tests/autograd/test_autograd_softmax_backward.cc b/tests/autograd/test_autograd_softmax_backward.cc new file mode 100644 index 00000000..b07ac833 --- /dev/null +++ b/tests/autograd/test_autograd_softmax_backward.cc @@ -0,0 +1,32 @@ +#include <gtest/gtest.h> + +#include <memory> + +#include "infini_train/include/tensor.h" +#include "infini_train/include/nn/parallel/global.h" +#include "infini_train/include/autograd/softmax.h" +#include "test_utils.h" + +using namespace infini_train; + +class AutogradSoftmaxBackwardTest : public infini_train::test::AutogradTestBaseP {}; + +TEST_P(AutogradSoftmaxBackwardTest, SoftmaxBackward) { + auto a = createTensor({2, 3}, 1.0f); + auto softmax_fn = std::make_shared<autograd::Softmax>(1); + auto result = softmax_fn->Apply({a}); + auto grad = createTensor({2, 3}, 1.0f); + auto grad_inputs = softmax_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 1); +} + +TEST_P(AutogradSoftmaxBackwardTest, SoftmaxBackwardDim0) { + auto a = createTensor({4, 3}, 1.0f); + auto softmax_fn = std::make_shared<autograd::Softmax>(0); + auto result = softmax_fn->Apply({a}); + auto grad = createTensor({4, 3}, 1.0f); + auto grad_inputs = softmax_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 1); +} + +INFINI_TRAIN_REGISTER_TEST(AutogradSoftmaxBackwardTest); diff --git a/tests/autograd/test_autograd_softmax_forward.cc b/tests/autograd/test_autograd_softmax_forward.cc new file mode 100644 index 00000000..42939fc9 --- /dev/null +++ b/tests/autograd/test_autograd_softmax_forward.cc @@ -0,0 +1,38 @@ +#include <gtest/gtest.h> + +#include <memory> + +#include "infini_train/include/tensor.h" +#include "infini_train/include/nn/parallel/global.h" +#include "infini_train/include/autograd/softmax.h" +#include "test_utils.h" + +using namespace infini_train; + +class AutogradSoftmaxForwardTest : public infini_train::test::AutogradTestBaseP {}; + +TEST_P(AutogradSoftmaxForwardTest, SoftmaxForward) { + auto a = createTensor({2, 3}, 1.0f); + auto softmax_fn = std::make_shared<autograd::Softmax>(1); + auto result = softmax_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{2, 3})); +} + +TEST_P(AutogradSoftmaxForwardTest, SoftmaxDim0) { + auto a = createTensor({4, 3}, 1.0f); + auto softmax_fn = std::make_shared<autograd::Softmax>(0); + auto result = softmax_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{4, 3})); +} + +TEST_P(AutogradSoftmaxForwardTest, SoftmaxLastDim) { + auto a = createTensor({2, 3, 4}, 1.0f); + auto softmax_fn = std::make_shared<autograd::Softmax>(2); + auto result = softmax_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{2, 3, 4})); +} + +INFINI_TRAIN_REGISTER_TEST(AutogradSoftmaxForwardTest); diff --git a/tests/autograd/test_autograd_transform_backward.cc b/tests/autograd/test_autograd_transform_backward.cc new file mode 100644 index 
00000000..2a542d4e --- /dev/null +++ b/tests/autograd/test_autograd_transform_backward.cc @@ -0,0 +1,23 @@ +#include <gtest/gtest.h> + +#include <memory> + +#include "infini_train/include/tensor.h" +#include "infini_train/include/nn/parallel/global.h" +#include "infini_train/include/autograd/transform.h" +#include "test_utils.h" + +using namespace infini_train; + +class AutogradTransformBackwardTest : public infini_train::test::AutogradTestBaseP {}; + +TEST_P(AutogradTransformBackwardTest, TransposeBackward) { + auto a = createTensor({2, 3}, 1.0f); + auto transpose_fn = std::make_shared<autograd::Transpose>(0, 1); + auto result = transpose_fn->Apply({a}); + auto grad = createTensor({3, 2}, 1.0f); + auto grad_inputs = transpose_fn->Backward({grad}); + EXPECT_EQ(grad_inputs.size(), 1); +} + +INFINI_TRAIN_REGISTER_TEST(AutogradTransformBackwardTest); diff --git a/tests/autograd/test_autograd_transform_forward.cc b/tests/autograd/test_autograd_transform_forward.cc new file mode 100644 index 00000000..bc4da44f --- /dev/null +++ b/tests/autograd/test_autograd_transform_forward.cc @@ -0,0 +1,72 @@ +#include <gtest/gtest.h> + +#include <memory> + +#include "infini_train/include/tensor.h" +#include "infini_train/include/nn/parallel/global.h" +#include "infini_train/include/autograd/transform.h" +#include "infini_train/include/autograd/misc.h" +#include "test_utils.h" + +using namespace infini_train; + +class AutogradTransformForwardTest : public infini_train::test::AutogradTestBaseP {}; + +TEST_P(AutogradTransformForwardTest, TransposeForward) { + auto a = createTensor({2, 3}, 1.0f); + auto transpose_fn = std::make_shared<autograd::Transpose>(0, 1); + auto result = transpose_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{3, 2})); +} + +TEST_P(AutogradTransformForwardTest, SliceForward) { + auto a = createTensor({4, 4}, 1.0f); + auto slice_fn = std::make_shared<autograd::Slice>( + std::vector<int64_t>{1, 1}, + std::vector<int64_t>{3, 3}, + std::vector<int64_t>{1, 1}); + auto result = slice_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradTransformForwardTest, SplitForward) { + auto a = createTensor({4, 4}, 1.0f); + auto split_fn = std::make_shared<autograd::Split>(2, 0); + auto result = split_fn->Apply({a}); + EXPECT_EQ(result.size(), 2); +} + +TEST_P(AutogradTransformForwardTest, ConcatForward) { + auto a = createTensor({2, 2}, 1.0f); + auto b = createTensor({2, 2}, 2.0f); + auto concat_fn = std::make_shared<autograd::Concat>(0); + auto result = concat_fn->Apply({a, b}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{4, 2})); +} + +TEST_P(AutogradTransformForwardTest, StackForward) { + auto a = createTensor({2, 3}, 1.0f); + auto b = createTensor({2, 3}, 2.0f); + auto stack_fn = std::make_shared<autograd::Stack>(0); + auto result = stack_fn->Apply({a, b}); + EXPECT_EQ(result.size(), 1); + EXPECT_EQ(result[0]->Dims(), (std::vector<int64_t>{2, 2, 3})); +} + +TEST_P(AutogradTransformForwardTest, TrilForward) { + auto a = createTensor({3, 3}, 1.0f); + auto tril_fn = std::make_shared<autograd::Tril>(0); + auto result = tril_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +TEST_P(AutogradTransformForwardTest, TriuForward) { + auto a = createTensor({3, 3}, 1.0f); + auto triu_fn = std::make_shared<autograd::Triu>(0); + auto result = triu_fn->Apply({a}); + EXPECT_EQ(result.size(), 1); +} + +INFINI_TRAIN_REGISTER_TEST(AutogradTransformForwardTest); diff --git a/tests/common/CMakeLists.txt b/tests/common/CMakeLists.txt new file mode 100644 index 00000000..3960d474 --- /dev/null +++ b/tests/common/CMakeLists.txt @@ -0,0 +1,4 @@ +# Common test utilities + +add_library(test_utils INTERFACE) +target_include_directories(test_utils INTERFACE 
${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/tests/common/test_macros.cmake b/tests/common/test_macros.cmake new file mode 100644 index 00000000..477a668d --- /dev/null +++ b/tests/common/test_macros.cmake @@ -0,0 +1,141 @@ +# ============================================================================ +# InfiniTrain Test Macros +# ============================================================================ +# Unified test configuration interface to reduce boilerplate. +# +# Usage: +# 1. Include this file in tests/CMakeLists.txt +# 2. Use infini_train_add_test macro to register tests +# +# Examples: +# infini_train_add_test( +# test_tensor_create +# SOURCES test_tensor_create.cc +# LABELS cpu cuda +# ) +# ============================================================================ + +include_guard(GLOBAL) + +# Path to this file's directory (tests/common/) +set(TEST_MACROS_DIR "${CMAKE_CURRENT_LIST_DIR}") + +# ----------------------------------------------------------------------------- +# Load GoogleTest module (provides gtest_discover_tests) +# ----------------------------------------------------------------------------- +include(GoogleTest) + +# ----------------------------------------------------------------------------- +# infini_train_add_test - Test registration macro +# ----------------------------------------------------------------------------- +# Features: +# 1. Create executable target +# 2. Configure compile options, link libraries, and include paths +# 3. Use gtest_discover_tests to auto-discover test cases +# 4. Set test labels +# +# Arguments: +# SOURCES: Source file list (required) +# LABELS: Test labels, e.g. "cpu" "cuda" "distributed" (optional, default "cpu") +# TEST_FILTER: gtest test filter pattern (optional) +# +# Examples: +# # Single-label test (one liner) +# infini_train_add_test(test_example SOURCES test_example.cc LABELS cpu) +# +# # Filter same binary by label suffix (one call per label) +# infini_train_add_test(test_example SOURCES test_example.cc LABELS cpu TEST_FILTER "-*CUDA*") +# infini_train_add_test(test_example_cuda SOURCES test_example.cc LABELS cuda TEST_FILTER "*CUDA*") +# ----------------------------------------------------------------------------- +macro(infini_train_add_test) + cmake_parse_arguments(ARG "" "TEST_NAME;TEST_FILTER" "SOURCES;LABELS" ${ARGN}) + + if(NOT ARG_TEST_NAME) + set(ARG_TEST_NAME ${ARG_UNPARSED_ARGUMENTS}) + endif() + + if(NOT ARG_SOURCES) + message(FATAL_ERROR "infini_train_add_test: TEST_NAME and SOURCES are required") + endif() + + # 1. Create executable target + add_executable(${ARG_TEST_NAME} ${ARG_SOURCES}) + + # 2. Disable -Werror so tests can run under relaxed warning levels + target_compile_options(${ARG_TEST_NAME} PRIVATE -Wno-error) + + # 3. Link Google Test + target_link_libraries(${ARG_TEST_NAME} PRIVATE + GTest::gtest + GTest::gtest_main + ) + + # 4. Add include paths + target_include_directories(${ARG_TEST_NAME} PRIVATE + ${TEST_MACROS_DIR} + ${glog_SOURCE_DIR}/src + ) + + # 5. Link project library (reuses framework linking strategy) + link_infini_train_exe(${ARG_TEST_NAME}) + + # 6. 
Auto-discover gtest cases and register as ctest tests + set(labels "cpu") + if(ARG_LABELS) + set(labels "${ARG_LABELS}") + endif() + + if(ARG_TEST_FILTER) + gtest_discover_tests(${ARG_TEST_NAME} + EXTRA_ARGS --gtest_output=xml:%T.xml + TEST_FILTER "${ARG_TEST_FILTER}" + PROPERTIES LABELS "${labels}" + ) + else() + gtest_discover_tests(${ARG_TEST_NAME} + EXTRA_ARGS --gtest_output=xml:%T.xml + PROPERTIES LABELS "${labels}" + ) + endif() +endmacro() + +# ----------------------------------------------------------------------------- +# infini_train_add_test_suite - Register cpu/cuda/distributed targets in one call +# ----------------------------------------------------------------------------- +# Calls infini_train_add_test three times (or fewer) with the correct +# TEST_FILTER and LABELS derived from the label list. +# +# Arguments: +# Base name; each target is named _