From f72193d40675050aedecc8f8f3fedf3d03902f61 Mon Sep 17 00:00:00 2001
From: zhuyue <zhuyue@qiyuanlab.com>
Date: Mon, 13 Oct 2025 15:14:07 +0800
Subject: [PATCH 1/3] Add debug function in InfiniCore tensor.

---
 include/infinicore/tensor.hpp      |   4 +
 python/infinicore/tensor.py        |  11 +
 src/infinicore/pybind11/tensor.hpp |   4 +-
 src/infinicore/tensor/debug.cc     | 376 +++++++++++++++++++++++++++
 test/infinicore/op/debug.py        | 398 +++++++++++++++++++++++++++++
 5 files changed, 792 insertions(+), 1 deletion(-)
 create mode 100644 src/infinicore/tensor/debug.cc
 create mode 100644 test/infinicore/op/debug.py
diff --git a/include/infinicore/tensor.hpp b/include/infinicore/tensor.hpp
index b3570e9fe..828034c94 100644
--- a/include/infinicore/tensor.hpp
+++ b/include/infinicore/tensor.hpp
@@ -121,6 +121,10 @@ class TensorImpl : public std::enable_shared_from_this<TensorImpl> {
 
     std::string info() const;
 
+    void debug(const std::string &filename) const;
+
+    void debug() const;
+
     ///
     /// Data Transfer APIs
     ///
diff --git a/python/infinicore/tensor.py b/python/infinicore/tensor.py
index 5095768c5..2df6df681 100644
--- a/python/infinicore/tensor.py
+++ b/python/infinicore/tensor.py
@@ -75,6 +75,17 @@ def permute(self, dims):
     def view(self, shape):
         return Tensor(self._underlying.view(shape))
 
+    def debug(self, filename=None):
+        """Print the tensor's data, or save it to a file, for debugging.
+
+        Args:
+            filename: Optional path of the file to write the tensor data to. If None, prints to stdout.
+        """
+        if filename is None:  # no target file: use the no-argument overload of the binding
+            self._underlying.debug()
+        else:
+            self._underlying.debug(filename)
+
 
 def empty(size, *, dtype=None, device=None, pin_memory=False):
     return Tensor(
diff --git a/src/infinicore/pybind11/tensor.hpp b/src/infinicore/pybind11/tensor.hpp
index 66fa06678..b7e50d561 100644
--- a/src/infinicore/pybind11/tensor.hpp
+++ b/src/infinicore/pybind11/tensor.hpp
@@ -17,7 +17,7 @@ inline void bind(py::module &m) {
         .def_property_readonly("dtype", [](const Tensor &tensor) { return tensor->dtype(); })
         .def_property_readonly("device", [](const Tensor &tensor) { return tensor->device(); })
 
-        .def("data_ptr", [](const Tensor &tensor) { return tensor->data(); })
+        .def("data_ptr", [](const Tensor &tensor) { return reinterpret_cast<uintptr_t>(tensor->data()); })
         .def("size", [](const Tensor &tensor, std::size_t dim) { return tensor->size(dim); })
         .def("stride", [](const Tensor &tensor, std::size_t dim) { return tensor->stride(dim); })
         .def("numel", [](const Tensor &tensor) { return tensor->numel(); })
@@ -25,6 +25,8 @@ inline void bind(py::module &m) {
         .def("is_contiguous", [](const Tensor &tensor) { return tensor->is_contiguous(); })
         .def("is_pinned", [](const Tensor &tensor) { return tensor->is_pinned(); })
         .def("info", [](const Tensor &tensor) { return tensor->info(); })
+        .def("debug", [](const Tensor &tensor) { return tensor->debug(); })
+        .def("debug", [](const Tensor &tensor, const std::string &filename) { return tensor->debug(filename); })
 
         .def("copy_", [](Tensor &tensor, const Tensor &other) { tensor->copy_from(other); })
         .def("to", [](const Tensor &tensor, const Device &device) { return tensor->to(device); })
diff --git a/src/infinicore/tensor/debug.cc b/src/infinicore/tensor/debug.cc
new file mode 100644
index 000000000..6a93cf628
--- /dev/null
+++ b/src/infinicore/tensor/debug.cc
@@ -0,0 +1,376 @@
+#include "infinicore/context/context.hpp"
+#include "infinicore/dtype.hpp"
+#include "infinicore/tensor.hpp"
+
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+
+namespace infinicore {
+
+inline float f16_to_f32(uint16_t h) { // Converts an IEEE 754 half-precision bit pattern `h` to float
+    uint32_t sign = (h & 0x8000) << 16; // sign bit moved from bit 15 to bit 31
+    int32_t exponent = (h >> 10) & 0x1F; // 5-bit biased exponent
+    uint32_t mantissa = h & 0x3FF; // 10-bit fraction
+
+    uint32_t f32; // assembled single-precision bit pattern
+    if (exponent == 31) { // all-ones exponent: infinity or NaN
+        if (mantissa != 0) {
+            f32 = sign | 0x7F800000 | (mantissa << 13); // NaN: keep the payload bits
+        } else {
+            f32 = sign | 0x7F800000; // signed infinity
+        }
+    } else if (exponent == 0) { // zero or subnormal half
+        if (mantissa == 0) {
+            f32 = sign; // signed zero
+        } else {
+            exponent = -14; // starting unbiased exponent for half subnormals
+            while ((mantissa & 0x400) == 0) { // normalize: shift until the leading 1 reaches bit 10
+                mantissa <<= 1;
+                exponent--;
+            }
+            mantissa &= 0x3FF; // drop the leading 1, which is implicit in the float encoding
+            f32 = sign | ((exponent + 127) << 23) | (mantissa << 13);
+        }
+    } else { // normal number: rebias the exponent from 15 to 127
+        f32 = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13);
+    }
+
+    float result;
+    std::memcpy(&result, &f32, sizeof(result)); // copy the bits into a float without a pointer cast
+    return result;
+}
+
+inline float bf16_to_f32(uint16_t val) { // Widens a bfloat16 bit pattern `val` to float
+    uint32_t bits32 = static_cast<uint32_t>(val) << 16; // bf16 bits become the upper 16 bits; the lower 16 are zero
+    float out;
+    std::memcpy(&out, &bits32, sizeof(out)); // copy the bits into a float without a pointer cast
+    return out;
+}
+
+// Recursively prints a strided tensor to stdout, one innermost row per line; `strides` are in elements of T.
+template <typename T>
+void print_data(const T *data, const Shape &shape, const Strides &strides, size_t dim) {
+    if (dim + 1 == shape.size()) { // `dim + 1` form: `shape.size() - 1` wraps around for a rank-0 shape
+        for (size_t i = 0; i < shape[dim]; i++) {
+            std::cout << +data[i * strides[dim]] << " "; // unary + promotes 8-bit integers so they print as numbers
+        }
+        std::cout << std::endl;
+    } else if (dim + 1 < shape.size()) { // false for rank-0 shapes, so shape[0] is never read out of bounds
+        for (size_t i = 0; i < shape[dim]; i++) {
+            print_data(data + i * strides[dim], shape, strides, dim + 1);
+        }
+    }
+}
+
+// Specialization for uint16_t storage: decodes each value as F16. NOTE(review): the U16 dispatch also lands here.
+template <>
+void print_data<uint16_t>(const uint16_t *data, const Shape &shape, const Strides &strides, size_t dim) {
+    if (dim + 1 == shape.size()) { // `dim + 1` form: `shape.size() - 1` wraps around for a rank-0 shape
+        for (size_t i = 0; i < shape[dim]; i++) {
+            std::cout << f16_to_f32(data[i * strides[dim]]) << " ";
+        }
+        std::cout << std::endl;
+    } else if (dim + 1 < shape.size()) { // false for rank-0 shapes, so shape[0] is never read out of bounds
+        for (size_t i = 0; i < shape[dim]; i++) {
+            print_data(data + i * strides[dim], shape, strides, dim + 1);
+        }
+    }
+}
+
+// Recursively prints BF16 data (raw uint16_t bit patterns) to stdout, one innermost row per line.
+void print_data_bf16(const uint16_t *data, const Shape &shape, const Strides &strides, size_t dim) {
+    if (dim + 1 == shape.size()) { // `dim + 1` form: `shape.size() - 1` wraps around for a rank-0 shape
+        for (size_t i = 0; i < shape[dim]; i++) {
+            std::cout << bf16_to_f32(data[i * strides[dim]]) << " ";
+        }
+        std::cout << std::endl;
+    } else if (dim + 1 < shape.size()) { // false for rank-0 shapes, so shape[0] is never read out of bounds
+        for (size_t i = 0; i < shape[dim]; i++) {
+            print_data_bf16(data + i * strides[dim], shape, strides, dim + 1);
+        }
+    }
+}
+
+void TensorImpl::debug(const std::string &filename) const {
+    // Synchronize device if needed
+    context::syncDevice();
+
+    std::cout << info() << std::endl;
+
+    const std::byte *cpu_data = nullptr;
+    std::byte *allocated_memory = nullptr;
+
+    // Copy data to CPU if not already on CPU
+    if (this->device().getType() != Device::Type::CPU) {
+        size_t mem_size = this->numel() * dsize(this->dtype());
+        allocated_memory = new std::byte[mem_size];
+        context::memcpyD2H(allocated_memory, this->data(), mem_size);
+        cpu_data = allocated_memory;
+    } else {
+        cpu_data = this->data();
+    }
+
+    // If filename is provided, save to file
+    if (!filename.empty()) {
+        // Determine file format based on extension
+        bool is_text_format = false;
+        size_t dot_pos = filename.find_last_of('.');
+        if (dot_pos != std::string::npos) {
+            std::string ext = filename.substr(dot_pos);
+            is_text_format = (ext == ".txt");
+        }
+
+        if (is_text_format) {
+            // Save as text format
+            std::ofstream outFile(filename);
+            if (!outFile) {
+                std::cerr << "Error opening file for writing: " << filename << "\n";
+                if (allocated_memory) {
+                    delete[] allocated_memory;
+                }
+                return;
+            }
+
+            // Write header with tensor information
+            outFile << "# Tensor Debug Output\n";
+            outFile << "# Shape: [";
+            for (size_t i = 0; i < this->shape().size(); ++i) {
+                outFile << this->shape()[i];
+                if (i < this->shape().size() - 1) {
+                    outFile << ", ";
+                }
+            }
+            outFile << "]\n";
+            outFile << "# Strides: [";
+            for (size_t i = 0; i < this->strides().size(); ++i) {
+                outFile << this->strides()[i];
+                if (i < this->strides().size() - 1) {
+                    outFile << ", ";
+                }
+            }
+            outFile << "]\n";
+            outFile << "# Dtype: " << toString(this->dtype()) << "\n";
+            outFile << "# Contiguous: " << (this->is_contiguous() ? "Yes" : "No") << "\n";
+            outFile << "# Elements: " << this->numel() << "\n";
+            outFile << "#\n";
+
+            // Helper function to write data recursively
+            std::function<void(const std::byte *, const Shape &, const Strides &, size_t, std::ofstream &)> write_data;
+
+            switch (this->dtype()) {
+            case DataType::F16:
+                write_data = [&write_data](const std::byte *data, const Shape &shape, const Strides &strides, size_t dim, std::ofstream &out) {
+                    const uint16_t *ptr = reinterpret_cast<const uint16_t *>(data);
+                    if (dim == shape.size() - 1) {
+                        for (size_t i = 0; i < shape[dim]; i++) {
+                            out << f16_to_f32(ptr[i * strides[dim]]);
+                            if (i < shape[dim] - 1) {
+                                out << " ";
+                            }
+                        }
+                        out << "\n";
+                    } else {
+                        for (size_t i = 0; i < shape[dim]; i++) {
+                            write_data(data + i * strides[dim] * sizeof(uint16_t), shape, strides, dim + 1, out);
+                        }
+                    }
+                };
+                break;
+            case DataType::F32:
+                write_data = [&write_data](const std::byte *data, const Shape &shape, const Strides &strides, size_t dim, std::ofstream &out) {
+                    const float *ptr = reinterpret_cast<const float *>(data);
+                    if (dim == shape.size() - 1) {
+                        for (size_t i = 0; i < shape[dim]; i++) {
+                            out << ptr[i * strides[dim]];
+                            if (i < shape[dim] - 1) {
+                                out << " ";
+                            }
+                        }
+                        out << "\n";
+                    } else {
+                        for (size_t i = 0; i < shape[dim]; i++) {
+                            write_data(data + i * strides[dim] * sizeof(float), shape, strides, dim + 1, out);
+                        }
+                    }
+                };
+                break;
+            case DataType::F64:
+                write_data = [&write_data](const std::byte *data, const Shape &shape, const Strides &strides, size_t dim, std::ofstream &out) {
+                    const double *ptr = reinterpret_cast<const double *>(data);
+                    if (dim == shape.size() - 1) {
+                        for (size_t i = 0; i < shape[dim]; i++) {
+                            out << ptr[i * strides[dim]];
+                            if (i < shape[dim] - 1) {
+                                out << " ";
+                            }
+                        }
+                        out << "\n";
+                    } else {
+                        for (size_t i = 0; i < shape[dim]; i++) {
+                            write_data(data + i * strides[dim] * sizeof(double), shape, strides, dim + 1, out);
+                        }
+                    }
+                };
+                break;
+            case DataType::I32:
+                write_data = [&write_data](const std::byte *data, const Shape &shape, const Strides &strides, size_t dim, std::ofstream &out) {
+                    const int32_t *ptr = reinterpret_cast<const int32_t *>(data);
+                    if (dim == shape.size() - 1) {
+                        for (size_t i = 0; i < shape[dim]; i++) {
+                            out << ptr[i * strides[dim]];
+                            if (i < shape[dim] - 1) {
+                                out << " ";
+                            }
+                        }
+                        out << "\n";
+                    } else {
+                        for (size_t i = 0; i < shape[dim]; i++) {
+                            write_data(data + i * strides[dim] * sizeof(int32_t), shape, strides, dim + 1, out);
+                        }
+                    }
+                };
+                break;
+            case DataType::I64:
+                write_data = [&write_data](const std::byte *data, const Shape &shape, const Strides &strides, size_t dim, std::ofstream &out) {
+                    const int64_t *ptr = reinterpret_cast<const int64_t *>(data);
+                    if (dim == shape.size() - 1) {
+                        for (size_t i = 0; i < shape[dim]; i++) {
+                            out << ptr[i * strides[dim]];
+                            if (i < shape[dim] - 1) {
+                                out << " ";
+                            }
+                        }
+                        out << "\n";
+                    } else {
+                        for (size_t i = 0; i < shape[dim]; i++) {
+                            write_data(data + i * strides[dim] * sizeof(int64_t), shape, strides, dim + 1, out);
+                        }
+                    }
+                };
+                break;
+            case DataType::BF16:
+                write_data = [&write_data](const std::byte *data, const Shape &shape, const Strides &strides, size_t dim, std::ofstream &out) {
+                    const uint16_t *ptr = reinterpret_cast<const uint16_t *>(data);
+                    if (dim == shape.size() - 1) {
+                        for (size_t i = 0; i < shape[dim]; i++) {
+                            out << bf16_to_f32(ptr[i * strides[dim]]);
+                            if (i < shape[dim] - 1) {
+                                out << " ";
+                            }
+                        }
+                        out << "\n";
+                    } else {
+                        for (size_t i = 0; i < shape[dim]; i++) {
+                            write_data(data + i * strides[dim] * sizeof(uint16_t), shape, strides, dim + 1, out);
+                        }
+                    }
+                };
+                break;
+            default:
+                outFile << "# Unsupported data type for text output\n";
+                outFile.close();
+                if (allocated_memory) {
+                    delete[] allocated_memory;
+                }
+                return;
+            }
+
+            // Write the actual data
+            write_data(cpu_data, this->shape(), this->strides(), 0, outFile);
+
+            outFile.close();
+            std::cout << "Data written to text file: " << filename << "\n";
+        } else {
+            // Save as binary format (default)
+            std::ofstream outFile(filename, std::ios::binary);
+            if (!outFile) {
+                std::cerr << "Error opening file for writing: " << filename << "\n";
+                if (allocated_memory) {
+                    delete[] allocated_memory;
+                }
+                return;
+            }
+            size_t mem_size = this->numel() * dsize(this->dtype());
+            outFile.write(reinterpret_cast<const char *>(cpu_data), mem_size);
+            outFile.close();
+            std::cout << "Data written to binary file: " << filename << "\n";
+        }
+
+        if (allocated_memory) {
+            delete[] allocated_memory;
+        }
+        return;
+    }
+
+    // Print data based on dtype
+    switch (this->dtype()) {
+    case DataType::F16:
+        print_data(reinterpret_cast<const uint16_t *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    case DataType::F32:
+        print_data(reinterpret_cast<const float *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    case DataType::F64:
+        print_data(reinterpret_cast<const double *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    case DataType::U64:
+        print_data(reinterpret_cast<const uint64_t *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    case DataType::I64:
+        print_data(reinterpret_cast<const int64_t *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    case DataType::U32:
+        print_data(reinterpret_cast<const uint32_t *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    case DataType::I32:
+        print_data(reinterpret_cast<const int32_t *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    case DataType::U16:
+        print_data(reinterpret_cast<const uint16_t *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    case DataType::I16:
+        print_data(reinterpret_cast<const int16_t *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    case DataType::U8:
+        print_data(reinterpret_cast<const uint8_t *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    case DataType::I8:
+        print_data(reinterpret_cast<const int8_t *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    case DataType::BF16:
+        print_data_bf16(reinterpret_cast<const uint16_t *>(cpu_data),
+                        this->shape(), this->strides(), 0);
+        break;
+    case DataType::BOOL:
+        print_data(reinterpret_cast<const bool *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    default:
+        std::cout << "Unsupported data type for debug" << std::endl;
+        break;
+    }
+
+    // Clean up allocated memory
+    if (allocated_memory) {
+        delete[] allocated_memory;
+    }
+}
+
+void TensorImpl::debug() const {
+    this->debug("");
+}
+
+} // namespace infinicore
diff --git a/test/infinicore/op/debug.py b/test/infinicore/op/debug.py
new file mode 100644
index 000000000..5db66bc44
--- /dev/null
+++ b/test/infinicore/op/debug.py
@@ -0,0 +1,398 @@
+#!/usr/bin/env python3
+"""
+Tensor Debug 功能测试脚本
+
+简单测试 debug 功能是否正常工作
+"""
+
+import torch
+import infinicore
+import sys
+import os
+import numpy as np
+
+
+def test_basic_debug():
+    """测试基本的 debug 打印功能"""
+    print("\n" + "=" * 80)
+    print("测试 1: 基本 debug 打印")
+    print("=" * 80)
+    
+    device = infinicore.device("cpu", 0)
+    
+    # 测试 float32
+    print("\n--- Float32 张量 (2x3) ---")
+    torch_tensor = torch.tensor([[1.0, 2.0, 3.0],
+                                  [4.0, 5.0, 6.0]], dtype=torch.float32)
+    infini_tensor = infinicore.from_blob(
+        torch_tensor.data_ptr(),
+        list(torch_tensor.shape),
+        dtype=infinicore.float32,
+        device=device
+    )
+    infini_tensor.debug()
+    print("✓ Float32 打印成功")
+    
+    # 测试 int32
+    print("\n--- Int32 张量 (2x2) ---")
+    torch_i32 = torch.tensor([[1, 2], [3, 4]], dtype=torch.int32)
+    infini_i32 = infinicore.from_blob(
+        torch_i32.data_ptr(),
+        list(torch_i32.shape),
+        dtype=infinicore.int32,
+        device=device
+    )
+    infini_i32.debug()
+    print("✓ Int32 打印成功")
+
+
+def test_save_to_file():
+    """测试保存到文件"""
+    print("\n" + "=" * 80)
+    print("测试 2: 保存张量到文件")
+    print("=" * 80)
+    
+    device = infinicore.device("cpu", 0)
+    
+    # 创建张量
+    torch_tensor = torch.arange(1, 13, dtype=torch.float32).reshape(3, 4)
+    print("\n原始张量:")
+    print(torch_tensor)
+    
+    infini_tensor = infinicore.from_blob(
+        torch_tensor.data_ptr(),
+        list(torch_tensor.shape),
+        dtype=infinicore.float32,
+        device=device
+    )
+    
+    # 保存到文件
+    filename = "/tmp/tensor_debug_test.bin"
+    print(f"\n保存到: {filename}")
+    infini_tensor.debug(filename)
+    
+    # 验证文件
+    if os.path.exists(filename):
+        file_size = os.path.getsize(filename)
+        expected_size = 12 * 4  # 12 个 float32
+        assert file_size == expected_size, f"文件大小不匹配: {file_size} vs {expected_size}"
+        
+        # 读取验证
+        loaded = np.fromfile(filename, dtype=np.float32).reshape(3, 4)
+        print("\n从文件读取:")
+        print(loaded)
+        
+        os.remove(filename)
+        print("✓ 文件保存和读取成功")
+    else:
+        raise RuntimeError("文件未创建")
+
+
+def test_multidimensional():
+    """测试多维张量"""
+    print("\n" + "=" * 80)
+    print("测试 3: 多维张量")
+    print("=" * 80)
+    
+    device = infinicore.device("cpu", 0)
+    
+    # 3D 张量
+    print("\n--- 3D 张量 (2x2x3) ---")
+    torch_3d = torch.arange(1, 13, dtype=torch.float32).reshape(2, 2, 3)
+    print("PyTorch 张量:")
+    print(torch_3d)
+    
+    infini_3d = infinicore.from_blob(
+        torch_3d.data_ptr(),
+        list(torch_3d.shape),
+        dtype=infinicore.float32,
+        device=device
+    )
+    
+    print("\nInfiniCore debug 输出:")
+    infini_3d.debug()
+    print("✓ 3D 张量打印成功")
+
+
+def test_infinicore_created():
+    """测试 InfiniCore 创建的张量"""
+    print("\n" + "=" * 80)
+    print("测试 4: InfiniCore 创建的张量")
+    print("=" * 80)
+    
+    device = infinicore.device("cpu", 0)
+    
+    # ones 张量
+    print("\n--- ones 张量 (2x3) ---")
+    ones_tensor = infinicore.ones([2, 3], dtype=infinicore.float32, device=device)
+    ones_tensor.debug()
+    print("✓ ones 张量打印成功")
+    
+    # zeros 张量
+    print("\n--- zeros 张量 (3x2) ---")
+    zeros_tensor = infinicore.zeros([3, 2], dtype=infinicore.float32, device=device)
+    zeros_tensor.debug()
+    print("✓ zeros 张量打印成功")
+
+
+def test_different_dtypes():
+    """测试不同数据类型"""
+    print("\n" + "=" * 80)
+    print("测试 5: 不同数据类型")
+    print("=" * 80)
+    
+    device = infinicore.device("cpu", 0)
+    
+    dtypes = [
+        (infinicore.float32, torch.float32, "Float32"),
+        (infinicore.int32, torch.int32, "Int32"),
+        (infinicore.int64, torch.int64, "Int64"),
+    ]
+    
+    for infini_dtype, torch_dtype, name in dtypes:
+        print(f"\n--- {name} ---")
+        torch_tensor = torch.arange(1, 7, dtype=torch_dtype).reshape(2, 3)
+        infini_tensor = infinicore.from_blob(
+            torch_tensor.data_ptr(),
+            list(torch_tensor.shape),
+            dtype=infini_dtype,
+            device=device
+        )
+        infini_tensor.debug()
+        print(f"✓ {name} 测试通过")
+
+
+def test_text_format():
+    """测试文本格式保存"""
+    print("\n" + "=" * 80)
+    print("测试 6: 文本格式保存 (.txt)")
+    print("=" * 80)
+    
+    device = infinicore.device("cpu", 0)
+    
+    # 创建张量
+    torch_tensor = torch.arange(1, 13, dtype=torch.float32).reshape(3, 4)
+    print("\n原始张量:")
+    print(torch_tensor)
+    
+    infini_tensor = infinicore.from_blob(
+        torch_tensor.data_ptr(),
+        list(torch_tensor.shape),
+        dtype=infinicore.float32,
+        device=device
+    )
+    
+    # 保存为文本文件
+    txt_filename = "/tmp/tensor_debug_test.txt"
+    print(f"\n保存为文本格式: {txt_filename}")
+    infini_tensor.debug(txt_filename)
+    
+    # 验证文本文件
+    if os.path.exists(txt_filename):
+        print("\n文本文件内容:")
+        with open(txt_filename, 'r') as f:
+            content = f.read()
+            print(content)
+        
+        # 1. 验证元数据
+        assert "# Tensor Debug Output" in content, "文本文件缺少标题"
+        assert "# Shape: [3, 4]" in content, "文本文件缺少形状信息"
+        assert "# Dtype: F32" in content, "文本文件缺少类型信息"
+        print("✓ 元数据验证通过")
+        
+        # 2. 提取并验证数值数据
+        lines = content.split('\n')
+        data_lines = [line.strip() for line in lines 
+                      if line.strip() and not line.startswith('#')]
+        
+        print(f"\n提取到 {len(data_lines)} 行数据")
+        
+        # 解析数值
+        loaded_data = []
+        for i, line in enumerate(data_lines):
+            row = [float(x) for x in line.split()]
+            loaded_data.append(row)
+            print(f"  第 {i+1} 行: {row}")
+        
+        # 转换为 numpy 数组
+        loaded_array = np.array(loaded_data, dtype=np.float32)
+        
+        # 3. 与原始数据对比
+        expected = torch_tensor.numpy()
+        assert loaded_array.shape == expected.shape, \
+            f"形状不匹配: {loaded_array.shape} vs {expected.shape}"
+        assert np.allclose(loaded_array, expected), \
+            f"数值不匹配:\n加载的数据:\n{loaded_array}\n期望的数据:\n{expected}"
+        
+        print("✓ 数值验证通过")
+        
+        os.remove(txt_filename)
+        print("✓ 文本格式保存测试通过")
+    else:
+        raise RuntimeError("文本文件未创建")
+
+
+def test_binary_format():
+    """测试二进制格式保存"""
+    print("\n" + "=" * 80)
+    print("测试 7: 二进制格式保存 (.bin)")
+    print("=" * 80)
+    
+    device = infinicore.device("cpu", 0)
+    
+    # 创建张量
+    torch_tensor = torch.arange(1, 13, dtype=torch.float32).reshape(3, 4)
+    print("\n原始张量:")
+    print(torch_tensor)
+    
+    infini_tensor = infinicore.from_blob(
+        torch_tensor.data_ptr(),
+        list(torch_tensor.shape),
+        dtype=infinicore.float32,
+        device=device
+    )
+    
+    # 保存为二进制文件
+    bin_filename = "/tmp/tensor_debug_test.bin"
+    print(f"\n保存为二进制格式: {bin_filename}")
+    infini_tensor.debug(bin_filename)
+    
+    # 验证二进制文件
+    if os.path.exists(bin_filename):
+        file_size = os.path.getsize(bin_filename)
+        expected_size = 12 * 4  # 12 个 float32
+        assert file_size == expected_size, \
+            f"二进制文件大小不匹配: {file_size} vs {expected_size}"
+        
+        # 读取并验证数据
+        loaded = np.fromfile(bin_filename, dtype=np.float32).reshape(3, 4)
+        print("\n从二进制文件读取:")
+        print(loaded)
+        
+        # 验证数据正确性
+        assert np.allclose(loaded, torch_tensor.numpy()), "数据不匹配"
+        
+        os.remove(bin_filename)
+        print("✓ 二进制格式保存测试通过")
+    else:
+        raise RuntimeError("二进制文件未创建")
+
+
+def test_format_comparison():
+    """对比不同格式"""
+    print("\n" + "=" * 80)
+    print("测试 8: 对比不同格式")
+    print("=" * 80)
+    
+    device = infinicore.device("cpu", 0)
+    
+    # 创建小张量用于对比
+    torch_tensor = torch.tensor([[1.5, 2.5], [3.5, 4.5]], dtype=torch.float32)
+    print("\n原始张量:")
+    print(torch_tensor)
+    
+    infini_tensor = infinicore.from_blob(
+        torch_tensor.data_ptr(),
+        list(torch_tensor.shape),
+        dtype=infinicore.float32,
+        device=device
+    )
+    
+    bin_file = "/tmp/compare_test.bin"
+    txt_file = "/tmp/compare_test.txt"
+    
+    # 保存两种格式
+    print("\n保存两种格式...")
+    infini_tensor.debug(bin_file)
+    infini_tensor.debug(txt_file)
+    
+    # 对比文件大小
+    bin_size = os.path.getsize(bin_file)
+    txt_size = os.path.getsize(txt_file)
+    
+    print(f"\n文件大小对比:")
+    print(f"  二进制文件: {bin_size} 字节")
+    print(f"  文本文件: {txt_size} 字节")
+    print(f"  文本/二进制比: {txt_size/bin_size:.2f}x")
+    
+    # ===== 验证二进制文件 =====
+    print("\n验证二进制文件:")
+    bin_data = np.fromfile(bin_file, dtype=np.float32).reshape(2, 2)
+    print(f"  读取的数据:\n{bin_data}")
+    assert np.allclose(bin_data, torch_tensor.numpy()), "二进制数据不匹配"
+    print("  ✓ 二进制文件数值正确")
+    
+    # ===== 验证文本文件 =====
+    print("\n验证文本文件:")
+    with open(txt_file, 'r') as f:
+        txt_content = f.read()
+    
+    # 1. 元数据验证
+    assert "# Tensor Debug Output" in txt_content, "缺少标题"
+    assert "# Shape: [2, 2]" in txt_content, "缺少形状信息"
+    assert "# Dtype: F32" in txt_content, "缺少类型信息"
+    print("  ✓ 元数据正确")
+    
+    # 2. 数值验证
+    lines = txt_content.split('\n')
+    data_lines = [line.strip() for line in lines 
+                  if line.strip() and not line.startswith('#')]
+    
+    txt_data = []
+    for line in data_lines:
+        row = [float(x) for x in line.split()]
+        txt_data.append(row)
+    
+    txt_array = np.array(txt_data, dtype=np.float32)
+    print(f"  读取的数据:\n{txt_array}")
+    
+    assert txt_array.shape == torch_tensor.shape, \
+        f"文本文件形状不匹配: {txt_array.shape} vs {torch_tensor.shape}"
+    assert np.allclose(txt_array, torch_tensor.numpy()), \
+        f"文本文件数值不匹配"
+    print("  ✓ 文本文件数值正确")
+    
+    # ===== 对比两种格式的数据一致性 =====
+    print("\n验证两种格式数据一致性:")
+    assert np.allclose(bin_data, txt_array), \
+        "二进制和文本文件的数据不一致！"
+    print("  ✓ 两种格式数据完全一致")
+    
+    # 清理
+    os.remove(bin_file)
+    os.remove(txt_file)
+    
+    print("\n✓ 格式对比测试通过")
+
+
+def main():
+    """主测试函数"""
+    print("\n" + "=" * 80)
+    print("InfiniCore Tensor Debug 功能测试")
+    print("=" * 80)
+    
+    try:
+        test_basic_debug()
+        test_save_to_file()
+        test_multidimensional()
+        test_infinicore_created()
+        test_different_dtypes()
+        test_text_format()
+        test_binary_format()
+        test_format_comparison()
+        
+        print("\n" + "=" * 80)
+        print("✅ 所有测试通过!")
+        print("=" * 80)
+        return 0
+        
+    except Exception as e:
+        print(f"\n❌ 测试失败: {e}")
+        import traceback
+        traceback.print_exc()
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
+

From c7054b7acfdabee4bee52bc2d6a7c81be7078d93 Mon Sep 17 00:00:00 2001
From: zhuyue <zhuyue@qiyuanlab.com>
Date: Tue, 14 Oct 2025 09:59:43 +0800
Subject: [PATCH 2/3] Refactor test scripts, remove txt writing, and add
 large-scale and non-contiguous tensor I/O tests

---
 src/infinicore/tensor/debug.cc | 274 +++++-------
 test/infinicore/op/debug.py    | 768 ++++++++++++++++++---------------
 2 files changed, 513 insertions(+), 529 deletions(-)

diff --git a/src/infinicore/tensor/debug.cc b/src/infinicore/tensor/debug.cc
index 6a93cf628..8cf31d592 100644
--- a/src/infinicore/tensor/debug.cc
+++ b/src/infinicore/tensor/debug.cc
@@ -5,6 +5,8 @@
 #include <cstring>
 #include <fstream>
 #include <iostream>
+#include <limits>
+#include <memory>
 #include <sstream>
 
 namespace infinicore {
@@ -93,6 +95,22 @@ void print_data_bf16(const uint16_t *data, const Shape &shape, const Strides &st
     }
 }
 
+// Recursively writes a strided (possibly non-contiguous) tensor to `out` in logical element order.
+template <typename T>
+void write_binary_data(std::ofstream &out, const T *data, const Shape &shape, const Strides &strides, size_t dim) {
+    if (dim + 1 == shape.size()) { // `dim + 1` form: `shape.size() - 1` wraps around for a rank-0 shape
+        // Innermost dimension: write one element at a time so the stride is honored
+        for (size_t i = 0; i < shape[dim]; i++) {
+            out.write(reinterpret_cast<const char *>(&data[i * strides[dim]]), sizeof(T));
+        }
+    } else if (dim + 1 < shape.size()) { // false for rank-0 shapes, so shape[0] is never read out of bounds
+        // Recursively process higher dimensions
+        for (size_t i = 0; i < shape[dim]; i++) {
+            write_binary_data(out, data + i * strides[dim], shape, strides, dim + 1);
+        }
+    }
+}
+
 void TensorImpl::debug(const std::string &filename) const {
     // Synchronize device if needed
     context::syncDevice();
@@ -100,207 +118,112 @@ void TensorImpl::debug(const std::string &filename) const {
     std::cout << info() << std::endl;
 
     const std::byte *cpu_data = nullptr;
-    std::byte *allocated_memory = nullptr;
+    std::unique_ptr<std::byte[]> allocated_memory; // RAII: 自动管理内存
 
     // Copy data to CPU if not already on CPU
     if (this->device().getType() != Device::Type::CPU) {
-        size_t mem_size = this->numel() * dsize(this->dtype());
-        allocated_memory = new std::byte[mem_size];
-        context::memcpyD2H(allocated_memory, this->data(), mem_size);
-        cpu_data = allocated_memory;
+        size_t numel = this->numel();
+        size_t element_size = dsize(this->dtype());
+
+        // 检查乘法溢出
+        if (numel > 0 && element_size > std::numeric_limits<size_t>::max() / numel) {
+            std::cerr << "Error: Memory size calculation overflow for tensor with "
+                      << numel << " elements of size " << element_size << "\n";
+            return;
+        }
+
+        size_t mem_size = numel * element_size;
+        allocated_memory = std::make_unique<std::byte[]>(mem_size);
+        context::memcpyD2H(allocated_memory.get(), this->data(), mem_size);
+        cpu_data = allocated_memory.get();
     } else {
         cpu_data = this->data();
     }
 
-    // If filename is provided, save to file
+    // If filename is provided, save to binary file
     if (!filename.empty()) {
-        // Determine file format based on extension
-        bool is_text_format = false;
-        size_t dot_pos = filename.find_last_of('.');
-        if (dot_pos != std::string::npos) {
-            std::string ext = filename.substr(dot_pos);
-            is_text_format = (ext == ".txt");
+        std::ofstream outFile(filename, std::ios::binary);
+        if (!outFile) {
+            std::cerr << "Error opening file for writing: " << filename << "\n";
+            return; // allocated_memory 会自动释放（RAII）
         }
 
-        if (is_text_format) {
-            // Save as text format
-            std::ofstream outFile(filename);
-            if (!outFile) {
-                std::cerr << "Error opening file for writing: " << filename << "\n";
-                if (allocated_memory) {
-                    delete[] allocated_memory;
-                }
-                return;
-            }
-
-            // Write header with tensor information
-            outFile << "# Tensor Debug Output\n";
-            outFile << "# Shape: [";
-            for (size_t i = 0; i < this->shape().size(); ++i) {
-                outFile << this->shape()[i];
-                if (i < this->shape().size() - 1) {
-                    outFile << ", ";
-                }
-            }
-            outFile << "]\n";
-            outFile << "# Strides: [";
-            for (size_t i = 0; i < this->strides().size(); ++i) {
-                outFile << this->strides()[i];
-                if (i < this->strides().size() - 1) {
-                    outFile << ", ";
-                }
-            }
-            outFile << "]\n";
-            outFile << "# Dtype: " << toString(this->dtype()) << "\n";
-            outFile << "# Contiguous: " << (this->is_contiguous() ? "Yes" : "No") << "\n";
-            outFile << "# Elements: " << this->numel() << "\n";
-            outFile << "#\n";
-
-            // Helper function to write data recursively
-            std::function<void(const std::byte *, const Shape &, const Strides &, size_t, std::ofstream &)> write_data;
-
+        // Check if tensor is contiguous - for optimization
+        if (this->is_contiguous()) {
+            // Fast path: contiguous tensor, write in one go
+            size_t mem_size = this->numel() * dsize(this->dtype());
+            outFile.write(reinterpret_cast<const char *>(cpu_data), mem_size);
+        } else {
+            // Slow path: non-contiguous tensor, write element by element using strides
             switch (this->dtype()) {
             case DataType::F16:
-                write_data = [&write_data](const std::byte *data, const Shape &shape, const Strides &strides, size_t dim, std::ofstream &out) {
-                    const uint16_t *ptr = reinterpret_cast<const uint16_t *>(data);
-                    if (dim == shape.size() - 1) {
-                        for (size_t i = 0; i < shape[dim]; i++) {
-                            out << f16_to_f32(ptr[i * strides[dim]]);
-                            if (i < shape[dim] - 1) {
-                                out << " ";
-                            }
-                        }
-                        out << "\n";
-                    } else {
-                        for (size_t i = 0; i < shape[dim]; i++) {
-                            write_data(data + i * strides[dim] * sizeof(uint16_t), shape, strides, dim + 1, out);
-                        }
-                    }
-                };
+            case DataType::BF16:
+                write_binary_data(outFile, reinterpret_cast<const uint16_t *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
                 break;
             case DataType::F32:
-                write_data = [&write_data](const std::byte *data, const Shape &shape, const Strides &strides, size_t dim, std::ofstream &out) {
-                    const float *ptr = reinterpret_cast<const float *>(data);
-                    if (dim == shape.size() - 1) {
-                        for (size_t i = 0; i < shape[dim]; i++) {
-                            out << ptr[i * strides[dim]];
-                            if (i < shape[dim] - 1) {
-                                out << " ";
-                            }
-                        }
-                        out << "\n";
-                    } else {
-                        for (size_t i = 0; i < shape[dim]; i++) {
-                            write_data(data + i * strides[dim] * sizeof(float), shape, strides, dim + 1, out);
-                        }
-                    }
-                };
+                write_binary_data(outFile, reinterpret_cast<const float *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
                 break;
             case DataType::F64:
-                write_data = [&write_data](const std::byte *data, const Shape &shape, const Strides &strides, size_t dim, std::ofstream &out) {
-                    const double *ptr = reinterpret_cast<const double *>(data);
-                    if (dim == shape.size() - 1) {
-                        for (size_t i = 0; i < shape[dim]; i++) {
-                            out << ptr[i * strides[dim]];
-                            if (i < shape[dim] - 1) {
-                                out << " ";
-                            }
-                        }
-                        out << "\n";
-                    } else {
-                        for (size_t i = 0; i < shape[dim]; i++) {
-                            write_data(data + i * strides[dim] * sizeof(double), shape, strides, dim + 1, out);
-                        }
-                    }
-                };
+                write_binary_data(outFile, reinterpret_cast<const double *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
                 break;
-            case DataType::I32:
-                write_data = [&write_data](const std::byte *data, const Shape &shape, const Strides &strides, size_t dim, std::ofstream &out) {
-                    const int32_t *ptr = reinterpret_cast<const int32_t *>(data);
-                    if (dim == shape.size() - 1) {
-                        for (size_t i = 0; i < shape[dim]; i++) {
-                            out << ptr[i * strides[dim]];
-                            if (i < shape[dim] - 1) {
-                                out << " ";
-                            }
-                        }
-                        out << "\n";
-                    } else {
-                        for (size_t i = 0; i < shape[dim]; i++) {
-                            write_data(data + i * strides[dim] * sizeof(int32_t), shape, strides, dim + 1, out);
-                        }
-                    }
-                };
+            case DataType::U64:
+                write_binary_data(outFile, reinterpret_cast<const uint64_t *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
                 break;
             case DataType::I64:
-                write_data = [&write_data](const std::byte *data, const Shape &shape, const Strides &strides, size_t dim, std::ofstream &out) {
-                    const int64_t *ptr = reinterpret_cast<const int64_t *>(data);
-                    if (dim == shape.size() - 1) {
-                        for (size_t i = 0; i < shape[dim]; i++) {
-                            out << ptr[i * strides[dim]];
-                            if (i < shape[dim] - 1) {
-                                out << " ";
-                            }
-                        }
-                        out << "\n";
-                    } else {
-                        for (size_t i = 0; i < shape[dim]; i++) {
-                            write_data(data + i * strides[dim] * sizeof(int64_t), shape, strides, dim + 1, out);
-                        }
-                    }
-                };
+                write_binary_data(outFile, reinterpret_cast<const int64_t *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
                 break;
-            case DataType::BF16:
-                write_data = [&write_data](const std::byte *data, const Shape &shape, const Strides &strides, size_t dim, std::ofstream &out) {
-                    const uint16_t *ptr = reinterpret_cast<const uint16_t *>(data);
-                    if (dim == shape.size() - 1) {
-                        for (size_t i = 0; i < shape[dim]; i++) {
-                            out << bf16_to_f32(ptr[i * strides[dim]]);
-                            if (i < shape[dim] - 1) {
-                                out << " ";
-                            }
-                        }
-                        out << "\n";
-                    } else {
-                        for (size_t i = 0; i < shape[dim]; i++) {
-                            write_data(data + i * strides[dim] * sizeof(uint16_t), shape, strides, dim + 1, out);
-                        }
-                    }
-                };
+            case DataType::U32:
+                write_binary_data(outFile, reinterpret_cast<const uint32_t *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
+                break;
+            case DataType::I32:
+                write_binary_data(outFile, reinterpret_cast<const int32_t *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
+                break;
+            case DataType::U16:
+                write_binary_data(outFile, reinterpret_cast<const uint16_t *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
+                break;
+            case DataType::I16:
+                write_binary_data(outFile, reinterpret_cast<const int16_t *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
+                break;
+            case DataType::U8:
+                write_binary_data(outFile, reinterpret_cast<const uint8_t *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
+                break;
+            case DataType::I8:
+                write_binary_data(outFile, reinterpret_cast<const int8_t *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
+                break;
+            case DataType::BOOL:
+                // BOOL is written through a uint8_t view, one byte per element (assumes 1-byte bool storage)
+                write_binary_data(outFile, reinterpret_cast<const uint8_t *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
                 break;
             default:
-                outFile << "# Unsupported data type for text output\n";
-                outFile.close();
-                if (allocated_memory) {
-                    delete[] allocated_memory;
-                }
+                std::cerr << "Unsupported data type for binary output\n";
                 return;
             }
+        }
 
-            // Write the actual data
-            write_data(cpu_data, this->shape(), this->strides(), 0, outFile);
-
-            outFile.close();
-            std::cout << "Data written to text file: " << filename << "\n";
-        } else {
-            // Save as binary format (default)
-            std::ofstream outFile(filename, std::ios::binary);
-            if (!outFile) {
-                std::cerr << "Error opening file for writing: " << filename << "\n";
-                if (allocated_memory) {
-                    delete[] allocated_memory;
-                }
-                return;
-            }
-            size_t mem_size = this->numel() * dsize(this->dtype());
-            outFile.write(reinterpret_cast<const char *>(cpu_data), mem_size);
-            outFile.close();
-            std::cout << "Data written to binary file: " << filename << "\n";
+        // 显式关闭文件并检查是否成功
+        outFile.close();
+        if (!outFile) {
+            std::cerr << "Error: Failed to write data to file: " << filename << "\n";
+            return;
         }
 
-        if (allocated_memory) {
-            delete[] allocated_memory;
+        std::cout << "Data written to binary file: " << filename;
+        if (!this->is_contiguous()) {
+            std::cout << " (non-contiguous tensor, wrote " << this->numel() << " elements)";
         }
+        std::cout << "\n";
         return;
     }
 
@@ -362,11 +285,6 @@ void TensorImpl::debug(const std::string &filename) const {
         std::cout << "Unsupported data type for debug" << std::endl;
         break;
     }
-
-    // Clean up allocated memory
-    if (allocated_memory) {
-        delete[] allocated_memory;
-    }
 }
 
 void TensorImpl::debug() const {
diff --git a/test/infinicore/op/debug.py b/test/infinicore/op/debug.py
index 5db66bc44..e65db29d8 100644
--- a/test/infinicore/op/debug.py
+++ b/test/infinicore/op/debug.py
@@ -2,7 +2,7 @@
 """
 Tensor Debug 功能测试脚本
 
-简单测试 debug 功能是否正常工作
+测试 debug 功能在不同设备和数据类型下的正确性
 """
 
 import torch
@@ -10,389 +10,455 @@
 import sys
 import os
 import numpy as np
+import time
 
+# Framework path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
 
-def test_basic_debug():
-    """测试基本的 debug 打印功能"""
-    print("\n" + "=" * 80)
-    print("测试 1: 基本 debug 打印")
-    print("=" * 80)
-    
-    device = infinicore.device("cpu", 0)
-    
-    # 测试 float32
-    print("\n--- Float32 张量 (2x3) ---")
-    torch_tensor = torch.tensor([[1.0, 2.0, 3.0],
-                                  [4.0, 5.0, 6.0]], dtype=torch.float32)
-    infini_tensor = infinicore.from_blob(
-        torch_tensor.data_ptr(),
-        list(torch_tensor.shape),
-        dtype=infinicore.float32,
-        device=device
-    )
-    infini_tensor.debug()
-    print("✓ Float32 打印成功")
-    
-    # 测试 int32
-    print("\n--- Int32 张量 (2x2) ---")
-    torch_i32 = torch.tensor([[1, 2], [3, 4]], dtype=torch.int32)
-    infini_i32 = infinicore.from_blob(
-        torch_i32.data_ptr(),
-        list(torch_i32.shape),
-        dtype=infinicore.int32,
-        device=device
-    )
-    infini_i32.debug()
-    print("✓ Int32 打印成功")
+from framework import (
+    TestConfig,
+    TestRunner,
+    TestCase,
+    create_infinicore_tensor,
+    get_args,
+    get_test_devices,
+    to_torch_dtype,
+    InfiniDeviceNames,
+    torch_device_map,
+)
 
+# ==============================================================================
+# Test Setup
+# ==============================================================================
 
-def test_save_to_file():
-    """测试保存到文件"""
-    print("\n" + "=" * 80)
-    print("测试 2: 保存张量到文件")
-    print("=" * 80)
-    
-    device = infinicore.device("cpu", 0)
-    
-    # 创建张量
-    torch_tensor = torch.arange(1, 13, dtype=torch.float32).reshape(3, 4)
-    print("\n原始张量:")
-    print(torch_tensor)
-    
-    infini_tensor = infinicore.from_blob(
-        torch_tensor.data_ptr(),
-        list(torch_tensor.shape),
-        dtype=infinicore.float32,
-        device=device
-    )
-    
-    # 保存到文件
-    filename = "/tmp/tensor_debug_test.bin"
-    print(f"\n保存到: {filename}")
-    infini_tensor.debug(filename)
-    
-    # 验证文件
-    if os.path.exists(filename):
-        file_size = os.path.getsize(filename)
-        expected_size = 12 * 4  # 12 个 float32
-        assert file_size == expected_size, f"文件大小不匹配: {file_size} vs {expected_size}"
-        
-        # 读取验证
-        loaded = np.fromfile(filename, dtype=np.float32).reshape(3, 4)
-        print("\n从文件读取:")
-        print(loaded)
-        
-        os.remove(filename)
-        print("✓ 文件保存和读取成功")
-    else:
-        raise RuntimeError("文件未创建")
+# Test cases - 定义不同的测试场景
+_TEST_CASES = [
+    TestCase("basic_print", (2, 3)),           # 基本打印
+    TestCase("binary_save", (3, 4)),           # 二进制保存
+    TestCase("multidimensional", (2, 2, 3)),   # 多维张量
+]
 
+# 非连续内存布局测试用例 (is_contiguous=False)
+_NON_CONTIGUOUS_TEST_CASES = [
+    TestCase("non_contiguous", (3, 4)),        # 测试 transpose 等导致的非连续内存布局
+]
 
-def test_multidimensional():
-    """测试多维张量"""
-    print("\n" + "=" * 80)
-    print("测试 3: 多维张量")
-    print("=" * 80)
-    
-    device = infinicore.device("cpu", 0)
-    
-    # 3D 张量
-    print("\n--- 3D 张量 (2x2x3) ---")
-    torch_3d = torch.arange(1, 13, dtype=torch.float32).reshape(2, 2, 3)
-    print("PyTorch 张量:")
-    print(torch_3d)
-    
-    infini_3d = infinicore.from_blob(
-        torch_3d.data_ptr(),
-        list(torch_3d.shape),
-        dtype=infinicore.float32,
-        device=device
-    )
-    
-    print("\nInfiniCore debug 输出:")
-    infini_3d.debug()
-    print("✓ 3D 张量打印成功")
+# 大规模性能测试用例 - 一千万个数据
+_LARGE_SCALE_TEST_CASES = [
+    TestCase("large_scale_binary", (10000000,)),  # 1D: 一千万个元素
+]
 
+# Data types - 包含所有需要测试的数据类型
+_TENSOR_DTYPES = [
+    infinicore.float32,
+    infinicore.float16,
+    infinicore.bfloat16,
+]
 
-def test_infinicore_created():
-    """测试 InfiniCore 创建的张量"""
-    print("\n" + "=" * 80)
-    print("测试 4: InfiniCore 创建的张量")
-    print("=" * 80)
-    
-    device = infinicore.device("cpu", 0)
-    
-    # ones 张量
-    print("\n--- ones 张量 (2x3) ---")
-    ones_tensor = infinicore.ones([2, 3], dtype=infinicore.float32, device=device)
-    ones_tensor.debug()
-    print("✓ ones 张量打印成功")
-    
-    # zeros 张量
-    print("\n--- zeros 张量 (3x2) ---")
-    zeros_tensor = infinicore.zeros([3, 2], dtype=infinicore.float32, device=device)
-    zeros_tensor.debug()
-    print("✓ zeros 张量打印成功")
+# Tolerance map - 用于数值验证时的容差
+_TOLERANCE_MAP = {
+    infinicore.float16: {"atol": 0, "rtol": 1e-3},
+    infinicore.float32: {"atol": 0, "rtol": 1e-5},
+    infinicore.bfloat16: {"atol": 0, "rtol": 1e-2},
+    infinicore.int32: {"atol": 0, "rtol": 0},
+    infinicore.int64: {"atol": 0, "rtol": 0},
+}
 
+# ==============================================================================
+# Helper Functions
+# ==============================================================================
 
-def test_different_dtypes():
-    """测试不同数据类型"""
-    print("\n" + "=" * 80)
-    print("测试 5: 不同数据类型")
-    print("=" * 80)
-    
-    device = infinicore.device("cpu", 0)
-    
-    dtypes = [
-        (infinicore.float32, torch.float32, "Float32"),
-        (infinicore.int32, torch.int32, "Int32"),
-        (infinicore.int64, torch.int64, "Int64"),
-    ]
-    
-    for infini_dtype, torch_dtype, name in dtypes:
-        print(f"\n--- {name} ---")
-        torch_tensor = torch.arange(1, 7, dtype=torch_dtype).reshape(2, 3)
-        infini_tensor = infinicore.from_blob(
-            torch_tensor.data_ptr(),
-            list(torch_tensor.shape),
-            dtype=infini_dtype,
-            device=device
-        )
-        infini_tensor.debug()
-        print(f"✓ {name} 测试通过")
+def load_binary_with_torch(filename, dtype, shape):
+    """Read a raw binary file into a torch tensor (dtype mapped via to_torch_dtype) and reshape it."""
+    torch_dtype = to_torch_dtype(dtype)
+    with open(filename, 'rb') as f:
+        data = bytearray(f.read())  # writable copy: torch.frombuffer warns on read-only bytes
+    return torch.frombuffer(data, dtype=torch_dtype).reshape(shape)
 
 
-def test_text_format():
-    """测试文本格式保存"""
-    print("\n" + "=" * 80)
-    print("测试 6: 文本格式保存 (.txt)")
-    print("=" * 80)
-    
-    device = infinicore.device("cpu", 0)
-    
-    # 创建张量
-    torch_tensor = torch.arange(1, 13, dtype=torch.float32).reshape(3, 4)
-    print("\n原始张量:")
-    print(torch_tensor)
-    
-    infini_tensor = infinicore.from_blob(
-        torch_tensor.data_ptr(),
-        list(torch_tensor.shape),
-        dtype=infinicore.float32,
-        device=device
-    )
+# ==============================================================================
+# Test Methods
+# ==============================================================================
+
+def test_basic_print(device, test_case, dtype, config):
+    """测试基本的 debug 打印功能"""
+    test_name, shape = test_case.args
     
-    # 保存为文本文件
-    txt_filename = "/tmp/tensor_debug_test.txt"
-    print(f"\n保存为文本格式: {txt_filename}")
-    infini_tensor.debug(txt_filename)
-    
-    # 验证文本文件
-    if os.path.exists(txt_filename):
-        print("\n文本文件内容:")
-        with open(txt_filename, 'r') as f:
-            content = f.read()
-            print(content)
-        
-        # 1. 验证元数据
-        assert "# Tensor Debug Output" in content, "文本文件缺少标题"
-        assert "# Shape: [3, 4]" in content, "文本文件缺少形状信息"
-        assert "# Dtype: F32" in content, "文本文件缺少类型信息"
-        print("✓ 元数据验证通过")
-        
-        # 2. 提取并验证数值数据
-        lines = content.split('\n')
-        data_lines = [line.strip() for line in lines 
-                      if line.strip() and not line.startswith('#')]
-        
-        print(f"\n提取到 {len(data_lines)} 行数据")
-        
-        # 解析数值
-        loaded_data = []
-        for i, line in enumerate(data_lines):
-            row = [float(x) for x in line.split()]
-            loaded_data.append(row)
-            print(f"  第 {i+1} 行: {row}")
-        
-        # 转换为 numpy 数组
-        loaded_array = np.array(loaded_data, dtype=np.float32)
-        
-        # 3. 与原始数据对比
-        expected = torch_tensor.numpy()
-        assert loaded_array.shape == expected.shape, \
-            f"形状不匹配: {loaded_array.shape} vs {expected.shape}"
-        assert np.allclose(loaded_array, expected), \
-            f"数值不匹配:\n加载的数据:\n{loaded_array}\n期望的数据:\n{expected}"
-        
-        print("✓ 数值验证通过")
-        
-        os.remove(txt_filename)
-        print("✓ 文本格式保存测试通过")
-    else:
-        raise RuntimeError("文本文件未创建")
+    print(f"Testing Basic Print on {InfiniDeviceNames[device]} with "
+          f"shape:{shape}, dtype:{dtype}")
+    
+    device_str = torch_device_map[device]
+    torch_dtype = to_torch_dtype(dtype)
+    
+    # 创建测试张量
+    torch_tensor = torch.arange(1, int(np.prod(shape)) + 1, 
+                                dtype=torch_dtype, device=device_str).reshape(shape)
+    
+    infini_tensor = create_infinicore_tensor(torch_tensor, device_str)
+    
+    # 测试 debug 打印（不保存文件）
+    infini_tensor.debug()
+    
+    print(f"✓ Basic print test passed")
 
 
-def test_binary_format():
+def test_binary_save(device, test_case, dtype, config):
     """测试二进制格式保存"""
-    print("\n" + "=" * 80)
-    print("测试 7: 二进制格式保存 (.bin)")
-    print("=" * 80)
-    
-    device = infinicore.device("cpu", 0)
-    
-    # 创建张量
-    torch_tensor = torch.arange(1, 13, dtype=torch.float32).reshape(3, 4)
-    print("\n原始张量:")
-    print(torch_tensor)
-    
-    infini_tensor = infinicore.from_blob(
-        torch_tensor.data_ptr(),
-        list(torch_tensor.shape),
-        dtype=infinicore.float32,
-        device=device
-    )
+    test_name, shape = test_case.args
+    
+    print(f"Testing Binary Save on {InfiniDeviceNames[device]} with "
+          f"shape:{shape}, dtype:{dtype}")
+    
+    device_str = torch_device_map[device]
+    torch_dtype = to_torch_dtype(dtype)
+    
+    # 创建测试张量
+    torch_tensor = torch.arange(1, int(np.prod(shape)) + 1, 
+                                dtype=torch_dtype, device=device_str).reshape(shape)
+    
+    infini_tensor = create_infinicore_tensor(torch_tensor, device_str)
     
     # 保存为二进制文件
-    bin_filename = "/tmp/tensor_debug_test.bin"
-    print(f"\n保存为二进制格式: {bin_filename}")
-    infini_tensor.debug(bin_filename)
+    bin_file = f"/tmp/debug_test_{device}_{dtype}_binary.bin"
+    infini_tensor.debug(bin_file)
     
-    # 验证二进制文件
-    if os.path.exists(bin_filename):
-        file_size = os.path.getsize(bin_filename)
-        expected_size = 12 * 4  # 12 个 float32
-        assert file_size == expected_size, \
-            f"二进制文件大小不匹配: {file_size} vs {expected_size}"
-        
-        # 读取并验证数据
-        loaded = np.fromfile(bin_filename, dtype=np.float32).reshape(3, 4)
-        print("\n从二进制文件读取:")
-        print(loaded)
-        
-        # 验证数据正确性
-        assert np.allclose(loaded, torch_tensor.numpy()), "数据不匹配"
-        
-        os.remove(bin_filename)
-        print("✓ 二进制格式保存测试通过")
-    else:
-        raise RuntimeError("二进制文件未创建")
+    # 验证文件存在
+    assert os.path.exists(bin_file), f"Binary file not created: {bin_file}"
+    
+    # 验证文件大小
+    expected_size = int(np.prod(shape)) * torch_tensor.element_size()
+    actual_size = os.path.getsize(bin_file)
+    assert actual_size == expected_size, \
+        f"Binary file size mismatch: {actual_size} vs {expected_size}"
+    
+    # 使用 torch.frombuffer 读取并验证
+    loaded_tensor = load_binary_with_torch(bin_file, dtype, shape)
+    
+    # 将两个张量都移到 CPU 进行比较
+    torch_tensor_cpu = torch_tensor.cpu()
+    loaded_tensor_cpu = loaded_tensor.cpu()
+    
+    tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-5})
+    assert torch.allclose(loaded_tensor_cpu, torch_tensor_cpu, 
+                         atol=tolerance["atol"], rtol=tolerance["rtol"]), \
+        f"Binary data mismatch"
+    
+    # 清理
+    os.remove(bin_file)
+    print(f"✓ Binary save test passed")
+
+
+def test_multidimensional(device, test_case, dtype, config):
+    """Print a multi-dimensional tensor via debug() and verify that its binary dump round-trips."""
+    test_name, shape = test_case.args
+    
+    print(f"Testing Multidimensional on {InfiniDeviceNames[device]} with "
+          f"shape:{shape}, dtype:{dtype}")
+    
+    device_str = torch_device_map[device]
+    torch_dtype = to_torch_dtype(dtype)
+    
+    # Build a multi-dimensional tensor filled with 1..N
+    torch_tensor = torch.arange(1, int(np.prod(shape)) + 1, 
+                                dtype=torch_dtype, device=device_str).reshape(shape)
+    
+    infini_tensor = create_infinicore_tensor(torch_tensor, device_str)
+    
+    # Exercise the print path (no filename)
+    infini_tensor.debug()
+    
+    # Exercise saving to a binary file and reading it back
+    bin_file = f"/tmp/debug_test_multidim_{device}_{dtype}.bin"
+    infini_tensor.debug(bin_file)
+    
+    assert os.path.exists(bin_file), "Multidimensional binary file not created"
+    
+    # Compare the reloaded data with the source tensor on CPU
+    loaded_tensor = load_binary_with_torch(bin_file, dtype, shape)
+    torch_tensor_cpu = torch_tensor.cpu()
+    loaded_tensor_cpu = loaded_tensor.cpu()
+    
+    tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-5})
+    assert torch.allclose(loaded_tensor_cpu, torch_tensor_cpu,
+                         atol=tolerance["atol"], rtol=tolerance["rtol"]), \
+        f"Multidimensional data mismatch"
+    
+    # Remove the temporary file
+    os.remove(bin_file)
+    print(f"✓ Multidimensional test passed")
 
 
-def test_format_comparison():
-    """对比不同格式"""
-    print("\n" + "=" * 80)
-    print("测试 8: 对比不同格式")
-    print("=" * 80)
-    
-    device = infinicore.device("cpu", 0)
-    
-    # 创建小张量用于对比
-    torch_tensor = torch.tensor([[1.5, 2.5], [3.5, 4.5]], dtype=torch.float32)
-    print("\n原始张量:")
-    print(torch_tensor)
-    
-    infini_tensor = infinicore.from_blob(
-        torch_tensor.data_ptr(),
-        list(torch_tensor.shape),
-        dtype=infinicore.float32,
-        device=device
+def test_non_contiguous_stride(device, test_case, dtype, config):
+    """测试非连续内存布局的情况（is_contiguous=False，例如 transpose 后的张量）"""
+    test_name, shape = test_case.args
+    
+    print(f"\n{'='*70}")
+    print(f"Testing Non-Contiguous Memory Layout on {InfiniDeviceNames[device]}")
+    print(f"  Shape: {shape}, Dtype: {dtype}")
+    print(f"{'='*70}")
+    
+    device_str = torch_device_map[device]
+    torch_dtype = to_torch_dtype(dtype)
+    
+    # 创建连续张量
+    print(f"\nStep 1: Creating contiguous tensor...")
+    torch_tensor_orig = torch.arange(1, int(np.prod(shape)) + 1, 
+                                     dtype=torch_dtype, device=device_str).reshape(shape)
+    print(f"  Original shape: {torch_tensor_orig.shape}")
+    print(f"  Original stride: {torch_tensor_orig.stride()}")
+    print(f"  Is contiguous: {torch_tensor_orig.is_contiguous()}")
+    print(f"  Data:\n{torch_tensor_orig}")
+    
+    # 进行 transpose 操作，创建非连续张量
+    print(f"\nStep 2: Transposing to create non-contiguous tensor...")
+    torch_tensor_t = torch_tensor_orig.t()  # transpose
+    print(f"  Transposed shape: {torch_tensor_t.shape}")
+    print(f"  Transposed stride: {torch_tensor_t.stride()}")
+    print(f"  Is contiguous: {torch_tensor_t.is_contiguous()}")
+    print(f"  Data:\n{torch_tensor_t}")
+    
+    # Build the non-contiguous InfiniCore tensor.
+    # as_strided re-views the contiguous tensor with the shape and strides of the
+    # transposed torch tensor, giving the same layout a transpose/permute would.
+    infini_tensor_orig = create_infinicore_tensor(torch_tensor_orig, device_str)
+    infini_tensor_t = infini_tensor_orig.as_strided(
+        list(torch_tensor_t.shape),
+        list(torch_tensor_t.stride())
     )
+
+    print(f"\nStep 3: InfiniCore tensor after permute:")
+    print(f"  Shape: {infini_tensor_t.shape}")
+    print(f"  Stride: {infini_tensor_t.stride()}")
+    print(f"  Is contiguous: {infini_tensor_t.is_contiguous()}")
+    
+    # ===== 测试二进制格式 =====
+    print(f"\n{'='*70}")
+    print(f"Testing Binary Format (.bin) with Non-Contiguous Memory Layout")
+    print(f"{'='*70}")
+    print(f"Note: Binary format now SUPPORTS non-contiguous memory layout!")
+    print(f"      It automatically detects and handles stride correctly.")
+    
+    bin_file = f"/tmp/debug_non_contiguous_{device}_{dtype}.bin"
+    infini_tensor_t.debug(bin_file)
+    
+    # 验证二进制文件
+    assert os.path.exists(bin_file), f"Binary file not created: {bin_file}"
+    
+    # 检查文件大小
+    actual_size = os.path.getsize(bin_file)
+    expected_size = int(np.prod(torch_tensor_t.shape)) * torch_tensor_t.element_size()
+    
+    print(f"\nFile size check:")
+    print(f"  Expected: {expected_size} bytes ({int(np.prod(torch_tensor_t.shape))} elements)")
+    print(f"  Actual: {actual_size} bytes")
+    
+    assert actual_size == expected_size, \
+        f"File size mismatch: {actual_size} vs {expected_size}"
+    print(f"  ✓ File size is correct")
+    
+    # 读取并验证数据
+    loaded_tensor = load_binary_with_torch(bin_file, dtype, torch_tensor_t.shape)
+    torch_tensor_cpu = torch_tensor_t.cpu()
+    loaded_tensor_cpu = loaded_tensor.cpu()
+    
+    tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-5})
     
-    bin_file = "/tmp/compare_test.bin"
-    txt_file = "/tmp/compare_test.txt"
+    print(f"\nData verification:")
+    print(f"  Expected (first 2 rows):\n{torch_tensor_cpu[:2]}")
+    print(f"  Got (first 2 rows):\n{loaded_tensor_cpu[:2]}")
     
-    # 保存两种格式
-    print("\n保存两种格式...")
+    assert torch.allclose(loaded_tensor_cpu, torch_tensor_cpu,
+                         atol=tolerance["atol"], rtol=tolerance["rtol"]), \
+        f"Data verification failed: loaded data doesn't match expected"
+    
+    print(f"\n✓ Binary format: Data matches perfectly!")
+    print(f"  Binary format correctly handles non-contiguous memory layout using stride")
+    
+    # 清理
+    os.remove(bin_file)
+    
+    print(f"\n{'='*70}")
+    print(f"Non-Contiguous Memory Layout Test Summary:")
+    print(f"  ✅ Binary format (.bin): NOW supports non-contiguous memory!")
+    print(f"  Performance: Contiguous tensors use fast path, non-contiguous use stride-based writing")
+    print(f"{'='*70}\n")
+
+
+def test_large_scale_binary_performance(device, test_case, dtype, config):
+    """Time debug(filename) binary saves of a large tensor (ten-million-element scale), then read back and spot-check the data."""
+    test_name, shape = test_case.args
+    
+    num_elements = int(np.prod(shape))
+    element_size_bytes = {
+        infinicore.float32: 4,
+        infinicore.float16: 2,
+        infinicore.bfloat16: 2,
+        infinicore.int32: 4,
+        infinicore.int64: 8,
+    }
+    
+    total_size_mb = (num_elements * element_size_bytes.get(dtype, 4)) / (1024 * 1024)  # dtypes missing from the table are sized as 4 bytes
+    
+    print(f"\n{'='*70}")
+    print(f"Performance Test: Large Scale Binary Save")
+    print(f"  Device: {InfiniDeviceNames[device]}")
+    print(f"  Shape: {shape}")
+    print(f"  Elements: {num_elements:,}")
+    print(f"  Dtype: {dtype}")
+    print(f"  Expected file size: {total_size_mb:.2f} MB")
+    print(f"{'='*70}")
+    
+    device_str = torch_device_map[device]
+    torch_dtype = to_torch_dtype(dtype)
+    
+    # Create the large tensor (perf_counter: monotonic, high-resolution interval clock)
+    print(f"Creating tensor with {num_elements:,} elements...")
+    create_start = time.perf_counter()
+    torch_tensor = torch.randn(shape, dtype=torch_dtype, device=device_str)
+    create_time = time.perf_counter() - create_start
+    print(f"  Tensor creation time: {create_time:.4f} seconds")
+    
+    infini_tensor = create_infinicore_tensor(torch_tensor, device_str)
+    
+    # Time the binary save
+    bin_file = f"/tmp/debug_large_scale_{device}_{dtype}.bin"
+    
+    print(f"\n{'='*70}")
+    print(f"[1/2] Writing Binary File")
+    print(f"{'='*70}")
+    print(f"File: {bin_file}")
+    save_start = time.perf_counter()
     infini_tensor.debug(bin_file)
-    infini_tensor.debug(txt_file)
-    
-    # 对比文件大小
-    bin_size = os.path.getsize(bin_file)
-    txt_size = os.path.getsize(txt_file)
-    
-    print(f"\n文件大小对比:")
-    print(f"  二进制文件: {bin_size} 字节")
-    print(f"  文本文件: {txt_size} 字节")
-    print(f"  文本/二进制比: {txt_size/bin_size:.2f}x")
-    
-    # ===== 验证二进制文件 =====
-    print("\n验证二进制文件:")
-    bin_data = np.fromfile(bin_file, dtype=np.float32).reshape(2, 2)
-    print(f"  读取的数据:\n{bin_data}")
-    assert np.allclose(bin_data, torch_tensor.numpy()), "二进制数据不匹配"
-    print("  ✓ 二进制文件数值正确")
-    
-    # ===== 验证文本文件 =====
-    print("\n验证文本文件:")
-    with open(txt_file, 'r') as f:
-        txt_content = f.read()
-    
-    # 1. 元数据验证
-    assert "# Tensor Debug Output" in txt_content, "缺少标题"
-    assert "# Shape: [2, 2]" in txt_content, "缺少形状信息"
-    assert "# Dtype: F32" in txt_content, "缺少类型信息"
-    print("  ✓ 元数据正确")
-    
-    # 2. 数值验证
-    lines = txt_content.split('\n')
-    data_lines = [line.strip() for line in lines 
-                  if line.strip() and not line.startswith('#')]
-    
-    txt_data = []
-    for line in data_lines:
-        row = [float(x) for x in line.split()]
-        txt_data.append(row)
-    
-    txt_array = np.array(txt_data, dtype=np.float32)
-    print(f"  读取的数据:\n{txt_array}")
-    
-    assert txt_array.shape == torch_tensor.shape, \
-        f"文本文件形状不匹配: {txt_array.shape} vs {torch_tensor.shape}"
-    assert np.allclose(txt_array, torch_tensor.numpy()), \
-        f"文本文件数值不匹配"
-    print("  ✓ 文本文件数值正确")
-    
-    # ===== 对比两种格式的数据一致性 =====
-    print("\n验证两种格式数据一致性:")
-    assert np.allclose(bin_data, txt_array), \
-        "二进制和文本文件的数据不一致！"
-    print("  ✓ 两种格式数据完全一致")
+    save_time = time.perf_counter() - save_start
+    
+    # Verify the file exists
+    assert os.path.exists(bin_file), f"Binary file not created: {bin_file}"
+    
+    # Get the actual file size
+    actual_size = os.path.getsize(bin_file)
+    actual_size_mb = actual_size / (1024 * 1024)
+    
+    # Compute write throughput (0 when the elapsed time is not positive)
+    write_throughput_mbps = actual_size_mb / save_time if save_time > 0 else 0
+    
+    # Print write performance results
+    print(f"\n✓ Write Performance:")
+    print(f"  File size: {actual_size_mb:.2f} MB ({actual_size:,} bytes)")
+    print(f"  Write time: {save_time:.4f} seconds")
+    print(f"  Write throughput: {write_throughput_mbps:.2f} MB/s")
+    print(f"  Elements written/sec: {(num_elements / save_time if save_time > 0 else 0):,.0f}")
+    
+    # Time the read-back
+    print(f"\n{'='*70}")
+    print(f"[2/2] Reading Binary File (for verification)")
+    print(f"{'='*70}")
+    read_start = time.perf_counter()
+    loaded_tensor = load_binary_with_torch(bin_file, dtype, shape)
+    read_time = time.perf_counter() - read_start
+    read_throughput_mbps = actual_size_mb / read_time if read_time > 0 else 0
+    
+    print(f"\n✓ Read Performance:")
+    print(f"  Read time: {read_time:.4f} seconds")
+    print(f"  Read throughput: {read_throughput_mbps:.2f} MB/s")
+    print(f"  Elements read/sec: {(num_elements / read_time if read_time > 0 else 0):,.0f}")
+    
+    # Spot-check only the first few elements (full verification is skipped to save time)
+    torch_tensor_cpu = torch_tensor.cpu()
+    loaded_tensor_cpu = loaded_tensor.cpu()
+    
+    sample_size = min(1000, num_elements)
+    tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-5})
+    assert torch.allclose(loaded_tensor_cpu.flatten()[:sample_size], 
+                         torch_tensor_cpu.flatten()[:sample_size],
+                         atol=tolerance["atol"], rtol=tolerance["rtol"]), \
+        f"Data verification failed (sampled first {sample_size} elements)"
+    
+    print(f"  Data verification: ✓ (sampled first {sample_size} elements)")
+    
+    # Print the performance summary
+    print(f"\n{'='*70}")
+    print(f"Performance Summary")
+    print(f"{'='*70}")
+    print(f"  Elements: {num_elements:,}")
+    print(f"  File size: {actual_size_mb:.2f} MB")
+    print(f"  Write time: {save_time:.4f} sec  →  {write_throughput_mbps:.2f} MB/s")
+    print(f"  Read time:  {read_time:.4f} sec  →  {read_throughput_mbps:.2f} MB/s")
+    print(f"  Speed ratio (Read/Write): {(read_throughput_mbps / write_throughput_mbps if write_throughput_mbps > 0 else 0):.2f}x")
+    print(f"{'='*70}")
     
     # 清理
     os.remove(bin_file)
-    os.remove(txt_file)
+    print(f"\n✓ Large scale performance test passed\n")
     
-    print("\n✓ 格式对比测试通过")
 
+# ==============================================================================
+# Main Execution Function
+# ==============================================================================
 
 def main():
-    """主测试函数"""
-    print("\n" + "=" * 80)
-    print("InfiniCore Tensor Debug 功能测试")
-    print("=" * 80)
-    
-    try:
-        test_basic_debug()
-        test_save_to_file()
-        test_multidimensional()
-        test_infinicore_created()
-        test_different_dtypes()
-        test_text_format()
-        test_binary_format()
-        test_format_comparison()
-        
-        print("\n" + "=" * 80)
-        print("✅ 所有测试通过!")
-        print("=" * 80)
-        return 0
+    args = get_args()  # entry point: runs every debug test group below, then exits with 0/1
+    
+    # Build the test configuration
+    config = TestConfig(
+        tensor_dtypes=_TENSOR_DTYPES,
+        tolerance_map=_TOLERANCE_MAP,
+        debug=args.debug,
+        bench=False,  # debug tests do not need benchmarking
+    )
+    
+    # Resolve the devices to test on
+    devices = get_test_devices(args)
+    
+    print("Starting debug tests...")
+    
+    all_passed = True
+    
+    # Run each test type against its own single test case
+    test_funcs = [
+        ("Basic Print", test_basic_print, [_TEST_CASES[0]]),
+        ("Binary Save", test_binary_save, [_TEST_CASES[1]]),
+        ("Multidimensional", test_multidimensional, [_TEST_CASES[2]]),
+    ]
+    
+    for test_name, test_func, test_cases in test_funcs:
+        print(f"\n{'='*60}")
+        print(f"Testing {test_name}")
+        print(f"{'='*60}")
         
-    except Exception as e:
-        print(f"\n❌ 测试失败: {e}")
-        import traceback
-        traceback.print_exc()
-        return 1
+        runner = TestRunner(test_cases, config)
+        passed = runner.run_tests(devices, test_func)
+        all_passed = all_passed and passed
+    
+    # Run the non-contiguous memory layout test
+    print(f"\n{'='*60}")
+    print(f"Testing Non-Contiguous Memory Layout (is_contiguous=False)")
+    print(f"{'='*60}")
+    
+    non_contiguous_runner = TestRunner(_NON_CONTIGUOUS_TEST_CASES, config)
+    non_contiguous_passed = non_contiguous_runner.run_tests(devices, test_non_contiguous_stride)
+    all_passed = all_passed and non_contiguous_passed
+    
+    # Run the large-scale performance test
+    print(f"\n{'='*60}")
+    print(f"Testing Large Scale Performance (10M elements)")
+    print(f"{'='*60}")
+    
+    large_scale_runner = TestRunner(_LARGE_SCALE_TEST_CASES, config)
+    large_scale_passed = large_scale_runner.run_tests(devices, test_large_scale_binary_performance)
+    all_passed = all_passed and large_scale_passed
+    
+    # Print the summary
+    print(f"\n{'='*60}")
+    print("Test Summary")
+    print(f"{'='*60}")
+    
+    if all_passed:
+        print("\033[92m✅ All debug tests passed!\033[0m")
+    else:
+        print("\033[91m❌ Some tests failed!\033[0m")
+    
+    sys.exit(0 if all_passed else 1)
 
 
 if __name__ == "__main__":
-    sys.exit(main())
-
+    main()  # main() ends the process itself via sys.exit(), so no exit-code wrapping is needed here

From 6c0bd2c615910eb662114112183e15d639aa4746 Mon Sep 17 00:00:00 2001
From: zhuyue <zhuyue@qiyuanlab.com>
Date: Fri, 17 Oct 2025 15:37:34 +0800
Subject: [PATCH 3/3] Move debug.py out of the op operator test folder.

---
 test/infinicore/{op => }/debug.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename test/infinicore/{op => }/debug.py (100%)

diff --git a/test/infinicore/op/debug.py b/test/infinicore/debug.py
similarity index 100%
rename from test/infinicore/op/debug.py
rename to test/infinicore/debug.py