diff --git a/include/infinicore/tensor.hpp b/include/infinicore/tensor.hpp
index b3570e9fe..828034c94 100644
--- a/include/infinicore/tensor.hpp
+++ b/include/infinicore/tensor.hpp
@@ -121,6 +121,10 @@ class TensorImpl : public std::enable_shared_from_this<TensorImpl> {
 
     std::string info() const;
 
+    void debug(const std::string &filename) const;
+
+    void debug() const;
+
     ///
     /// Data Transfer APIs
     ///
diff --git a/python/infinicore/tensor.py b/python/infinicore/tensor.py
index 5095768c5..2df6df681 100644
--- a/python/infinicore/tensor.py
+++ b/python/infinicore/tensor.py
@@ -75,6 +75,17 @@ def permute(self, dims):
     def view(self, shape):
         return Tensor(self._underlying.view(shape))
 
+    def debug(self, filename=None):
+        """Print tensor data or save it to a file for debugging.
+
+        Args:
+            filename: Optional filename to save raw binary data. If None, prints to stdout.
+        """
+        if filename is None:
+            self._underlying.debug()
+        else:
+            self._underlying.debug(filename)
+
 
 def empty(size, *, dtype=None, device=None, pin_memory=False):
     return Tensor(
diff --git a/src/infinicore/pybind11/tensor.hpp b/src/infinicore/pybind11/tensor.hpp
index 66fa06678..b7e50d561 100644
--- a/src/infinicore/pybind11/tensor.hpp
+++ b/src/infinicore/pybind11/tensor.hpp
@@ -17,7 +17,7 @@ inline void bind(py::module &m) {
         .def_property_readonly("dtype", [](const Tensor &tensor) { return tensor->dtype(); })
         .def_property_readonly("device", [](const Tensor &tensor) { return tensor->device(); })
-        .def("data_ptr", [](const Tensor &tensor) { return tensor->data(); })
+        .def("data_ptr", [](const Tensor &tensor) { return reinterpret_cast<uintptr_t>(tensor->data()); })
         .def("size", [](const Tensor &tensor, std::size_t dim) { return tensor->size(dim); })
         .def("stride", [](const Tensor &tensor, std::size_t dim) { return tensor->stride(dim); })
         .def("numel", [](const Tensor &tensor) { return tensor->numel(); })
@@ -25,6 +25,8 @@ inline void bind(py::module &m) {
         .def("is_contiguous", [](const Tensor &tensor) { return tensor->is_contiguous(); })
         .def("is_pinned", [](const Tensor &tensor) { return tensor->is_pinned(); })
         .def("info", [](const Tensor &tensor) { return tensor->info(); })
+        .def("debug", [](const Tensor &tensor) { return tensor->debug(); })
+        .def("debug", [](const Tensor &tensor, const std::string &filename) { return tensor->debug(filename); })
         .def("copy_", [](Tensor &tensor, const Tensor &other) { tensor->copy_from(other); })
         .def("to", [](const Tensor &tensor, const Device &device) { return tensor->to(device); })
diff --git a/src/infinicore/tensor/debug.cc b/src/infinicore/tensor/debug.cc
new file mode 100644
index 000000000..8cf31d592
--- /dev/null
+++ b/src/infinicore/tensor/debug.cc
@@ -0,0 +1,294 @@
+#include "infinicore/context/context.hpp"
+#include "infinicore/dtype.hpp"
+#include "infinicore/tensor.hpp"
+
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <limits>
+#include <memory>
+
+namespace infinicore {
+
+inline float f16_to_f32(uint16_t h) {
+    uint32_t sign = (h & 0x8000) << 16;
+    int32_t exponent = (h >> 10) & 0x1F;
+    uint32_t mantissa = h & 0x3FF;
+
+    uint32_t f32;
+    if (exponent == 31) {
+        if (mantissa != 0) {
+            f32 = sign | 0x7F800000 | (mantissa << 13); // NaN
+        } else {
+            f32 = sign | 0x7F800000; // +/- infinity
+        }
+    } else if (exponent == 0) {
+        if (mantissa == 0) {
+            f32 = sign; // +/- zero
+        } else {
+            // Subnormal half: renormalize into the float32 format.
+            exponent = -14;
+            while ((mantissa & 0x400) == 0) {
+                mantissa <<= 1;
+                exponent--;
+            }
+            mantissa &= 0x3FF;
+            f32 = sign | ((exponent + 127) << 23) | (mantissa << 13);
+        }
+    } else {
+        f32 = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13);
+    }
+
+    float result;
+    std::memcpy(&result, &f32, sizeof(result));
+    return result;
+}
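
A quick way to sanity-check these bit-level decoders is to compare them against NumPy's native float16 handling. A minimal Python sketch; f16_bits_to_f32 and bf16_bits_to_f32 are hypothetical reference helpers, not part of this patch:

    import struct

    import numpy as np

    def f16_bits_to_f32(h: int) -> float:
        # Reinterpret the 16-bit pattern as IEEE half, then widen; this is
        # the behavior f16_to_f32 above should reproduce.
        return float(np.frombuffer(struct.pack("<H", h), dtype=np.float16)[0])

    def bf16_bits_to_f32(b: int) -> float:
        # BF16 is the upper 16 bits of a float32, exactly as bf16_to_f32 does.
        return struct.unpack("<f", struct.pack("<I", b << 16))[0]

    assert f16_bits_to_f32(0x3C00) == 1.0          # 1.0 in FP16
    assert f16_bits_to_f32(0x0001) == 2.0 ** -24   # smallest FP16 subnormal
    assert bf16_bits_to_f32(0x3F80) == 1.0         # 1.0f in BF16
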
+inline float bf16_to_f32(uint16_t val) {
+    uint32_t bits32 = static_cast<uint32_t>(val) << 16;
+    float out;
+    std::memcpy(&out, &bits32, sizeof(out));
+    return out;
+}
+
+// Template function for printing data recursively
+template <typename T>
+void print_data(const T *data, const Shape &shape, const Strides &strides, size_t dim) {
+    if (dim == shape.size() - 1) {
+        for (size_t i = 0; i < shape[dim]; i++) {
+            std::cout << data[i * strides[dim]] << " ";
+        }
+        std::cout << std::endl;
+    } else if (dim < shape.size() - 1) {
+        for (size_t i = 0; i < shape[dim]; i++) {
+            print_data(data + i * strides[dim], shape, strides, dim + 1);
+        }
+    }
+}
+
+// Specialization for F16 (uint16_t)
+template <>
+void print_data(const uint16_t *data, const Shape &shape, const Strides &strides, size_t dim) {
+    if (dim == shape.size() - 1) {
+        for (size_t i = 0; i < shape[dim]; i++) {
+            std::cout << f16_to_f32(data[i * strides[dim]]) << " ";
+        }
+        std::cout << std::endl;
+    } else if (dim < shape.size() - 1) {
+        for (size_t i = 0; i < shape[dim]; i++) {
+            print_data(data + i * strides[dim], shape, strides, dim + 1);
+        }
+    }
+}
+
+// Function for printing BF16 data
+void print_data_bf16(const uint16_t *data, const Shape &shape, const Strides &strides, size_t dim) {
+    if (dim == shape.size() - 1) {
+        for (size_t i = 0; i < shape[dim]; i++) {
+            std::cout << bf16_to_f32(data[i * strides[dim]]) << " ";
+        }
+        std::cout << std::endl;
+    } else if (dim < shape.size() - 1) {
+        for (size_t i = 0; i < shape[dim]; i++) {
+            print_data_bf16(data + i * strides[dim], shape, strides, dim + 1);
+        }
+    }
+}
+
+// Template function for writing data recursively to a binary file (handles non-contiguous tensors)
+template <typename T>
+void write_binary_data(std::ofstream &out, const T *data, const Shape &shape, const Strides &strides, size_t dim) {
+    if (dim == shape.size() - 1) {
+        // Write the innermost dimension
+        for (size_t i = 0; i < shape[dim]; i++) {
+            out.write(reinterpret_cast<const char *>(&data[i * strides[dim]]), sizeof(T));
+        }
+    } else {
+        // Recursively process higher dimensions
+        for (size_t i = 0; i < shape[dim]; i++) {
+            write_binary_data(out, data + i * strides[dim], shape, strides, dim + 1);
+        }
+    }
+}
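
The recursion visits dimensions outer-to-inner and emits elements in logical (row-major) order, so the file contents are independent of the in-memory layout. A minimal Python sketch of the same traversal; write_recursive is an illustrative name, not part of the patch:

    def write_recursive(out, data, shape, strides, offset=0, dim=0):
        # Mirror of write_binary_data: recurse over the outer dimensions,
        # emit the innermost one element by element.
        if dim == len(shape) - 1:
            for i in range(shape[dim]):
                out.append(data[offset + i * strides[dim]])
        else:
            for i in range(shape[dim]):
                write_recursive(out, data, shape, strides,
                                offset + i * strides[dim], dim + 1)

    # A 2x3 transposed view over the buffer [0..5] (strides swapped):
    buf = []
    write_recursive(buf, list(range(6)), (2, 3), (1, 2))
    assert buf == [0, 2, 4, 1, 3, 5]  # logical order, not storage order
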
+void TensorImpl::debug(const std::string &filename) const {
+    // Synchronize the device if needed
+    context::syncDevice();
+
+    std::cout << info() << std::endl;
+
+    const std::byte *cpu_data = nullptr;
+    std::unique_ptr<std::byte[]> allocated_memory; // RAII: released automatically
+
+    // Copy data to CPU if not already on CPU
+    if (this->device().getType() != Device::Type::CPU) {
+        size_t numel = this->numel();
+        size_t element_size = dsize(this->dtype());
+
+        // Guard against overflow in the size computation
+        if (numel > 0 && element_size > std::numeric_limits<size_t>::max() / numel) {
+            std::cerr << "Error: Memory size calculation overflow for tensor with "
+                      << numel << " elements of size " << element_size << "\n";
+            return;
+        }
+
+        size_t mem_size = numel * element_size;
+        allocated_memory = std::make_unique<std::byte[]>(mem_size);
+        context::memcpyD2H(allocated_memory.get(), this->data(), mem_size);
+        cpu_data = allocated_memory.get();
+    } else {
+        cpu_data = this->data();
+    }
+
+    // If a filename is provided, save to a binary file
+    if (!filename.empty()) {
+        std::ofstream outFile(filename, std::ios::binary);
+        if (!outFile) {
+            std::cerr << "Error opening file for writing: " << filename << "\n";
+            return; // allocated_memory is released automatically (RAII)
+        }
+
+        // Check whether the tensor is contiguous - fast-path optimization
+        if (this->is_contiguous()) {
+            // Fast path: contiguous tensor, write in one go
+            size_t mem_size = this->numel() * dsize(this->dtype());
+            outFile.write(reinterpret_cast<const char *>(cpu_data), mem_size);
+        } else {
+            // Slow path: non-contiguous tensor, write element by element using strides
+            switch (this->dtype()) {
+            case DataType::F16:
+            case DataType::BF16:
+                write_binary_data(outFile, reinterpret_cast<const uint16_t *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
+                break;
+            case DataType::F32:
+                write_binary_data(outFile, reinterpret_cast<const float *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
+                break;
+            case DataType::F64:
+                write_binary_data(outFile, reinterpret_cast<const double *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
+                break;
+            case DataType::U64:
+                write_binary_data(outFile, reinterpret_cast<const uint64_t *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
+                break;
+            case DataType::I64:
+                write_binary_data(outFile, reinterpret_cast<const int64_t *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
+                break;
+            case DataType::U32:
+                write_binary_data(outFile, reinterpret_cast<const uint32_t *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
+                break;
+            case DataType::I32:
+                write_binary_data(outFile, reinterpret_cast<const int32_t *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
+                break;
+            case DataType::U16:
+                write_binary_data(outFile, reinterpret_cast<const uint16_t *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
+                break;
+            case DataType::I16:
+                write_binary_data(outFile, reinterpret_cast<const int16_t *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
+                break;
+            case DataType::U8:
+                write_binary_data(outFile, reinterpret_cast<const uint8_t *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
+                break;
+            case DataType::I8:
+                write_binary_data(outFile, reinterpret_cast<const int8_t *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
+                break;
+            case DataType::BOOL:
+                // Special-case bool: write as uint8_t for cross-platform consistency
+                write_binary_data(outFile, reinterpret_cast<const uint8_t *>(cpu_data),
+                                  this->shape(), this->strides(), 0);
+                break;
+            default:
+                std::cerr << "Unsupported data type for binary output\n";
+                return;
+            }
+        }
+
+        // Close the file explicitly and verify the write succeeded
+        outFile.close();
+        if (!outFile) {
+            std::cerr << "Error: Failed to write data to file: " << filename << "\n";
+            return;
+        }
+
+        std::cout << "Data written to binary file: " << filename;
+        if (!this->is_contiguous()) {
+            std::cout << " (non-contiguous tensor, wrote " << this->numel() << " elements)";
+        }
+        std::cout << "\n";
+        return;
+    }
+
+    // Print data based on dtype
+    switch (this->dtype()) {
+    case DataType::F16:
+        print_data(reinterpret_cast<const uint16_t *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    case DataType::F32:
+        print_data(reinterpret_cast<const float *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    case DataType::F64:
+        print_data(reinterpret_cast<const double *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    case DataType::U64:
+        print_data(reinterpret_cast<const uint64_t *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    case DataType::I64:
+        print_data(reinterpret_cast<const int64_t *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    case DataType::U32:
+        print_data(reinterpret_cast<const uint32_t *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    case DataType::I32:
+        print_data(reinterpret_cast<const int32_t *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    case DataType::U16:
+        print_data(reinterpret_cast<const uint16_t *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    case DataType::I16:
+        print_data(reinterpret_cast<const int16_t *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    case DataType::U8:
+        print_data(reinterpret_cast<const uint8_t *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    case DataType::I8:
+        print_data(reinterpret_cast<const int8_t *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    case DataType::BF16:
+        print_data_bf16(reinterpret_cast<const uint16_t *>(cpu_data),
+                        this->shape(), this->strides(), 0);
+        break;
+    case DataType::BOOL:
+        print_data(reinterpret_cast<const bool *>(cpu_data),
+                   this->shape(), this->strides(), 0);
+        break;
+    default:
+        std::cout << "Unsupported data type for debug" << std::endl;
+        break;
+    }
+}
+
+void TensorImpl::debug() const {
+    this->debug("");
+}
+
+} // namespace infinicore
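
From Python, the two overloads surface through the optional filename parameter. A minimal usage sketch; the shape, dtype, and /tmp path are illustrative, and infinicore.empty and Tensor.debug come from the diffs above:

    import infinicore
    import torch

    t = infinicore.empty((2, 3), dtype=infinicore.float32)
    t.debug()               # prints info() plus the element values to stdout
    t.debug("/tmp/t.bin")   # dumps the raw element bytes in logical order

    with open("/tmp/t.bin", "rb") as f:
        back = torch.frombuffer(f.read(), dtype=torch.float32).reshape(2, 3)
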
diff --git a/test/infinicore/debug.py b/test/infinicore/debug.py
new file mode 100644
index 000000000..e65db29d8
--- /dev/null
+++ b/test/infinicore/debug.py
@@ -0,0 +1,464 @@
+#!/usr/bin/env python3
+"""
+Test script for the tensor debug feature.
+
+Verifies that debug behaves correctly across devices and data types.
+"""
+
+import torch
+import infinicore
+import sys
+import os
+import numpy as np
+import time
+
+# Framework path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
+
+from framework import (
+    TestConfig,
+    TestRunner,
+    TestCase,
+    create_infinicore_tensor,
+    get_args,
+    get_test_devices,
+    to_torch_dtype,
+    InfiniDeviceNames,
+    torch_device_map,
+)
+
+# ==============================================================================
+# Test Setup
+# ==============================================================================
+
+# Test cases covering the different scenarios
+_TEST_CASES = [
+    TestCase("basic_print", (2, 3)),  # basic printing
+    TestCase("binary_save", (3, 4)),  # binary save
+    TestCase("multidimensional", (2, 2, 3)),  # multidimensional tensor
+]
+
+# Non-contiguous layout test cases (is_contiguous=False)
+_NON_CONTIGUOUS_TEST_CASES = [
+    TestCase("non_contiguous", (3, 4)),  # layouts produced by e.g. transpose
+]
+
+# Large-scale performance test case: ten million elements
+_LARGE_SCALE_TEST_CASES = [
+    TestCase("large_scale_binary", (10000000,)),  # 1-D, ten million elements
+]
+
+# Data types to exercise
+_TENSOR_DTYPES = [
+    infinicore.float32,
+    infinicore.float16,
+    infinicore.bfloat16,
+]
+
+# Tolerances used for numerical verification
+_TOLERANCE_MAP = {
+    infinicore.float16: {"atol": 0, "rtol": 1e-3},
+    infinicore.float32: {"atol": 0, "rtol": 1e-5},
+    infinicore.bfloat16: {"atol": 0, "rtol": 1e-2},
+    infinicore.int32: {"atol": 0, "rtol": 0},
+    infinicore.int64: {"atol": 0, "rtol": 0},
+}
+
+# ==============================================================================
+# Helper Functions
+# ==============================================================================
+
+def load_binary_with_torch(filename, dtype, shape):
+    """Load a binary file with torch.frombuffer."""
+    torch_dtype = to_torch_dtype(dtype)
+    with open(filename, 'rb') as f:
+        data = f.read()
+    return torch.frombuffer(data, dtype=torch_dtype).reshape(shape)
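
One caveat: torch.frombuffer aliases the bytes object it is given, so the returned tensor is read-only and PyTorch emits a UserWarning on construction. If a writable copy is ever needed, clone it; a minimal variant of the helper above:

    def load_binary_copy(filename, torch_dtype, shape):
        # .clone() detaches the result from the read-only bytes buffer so it
        # owns writable storage of its own.
        with open(filename, "rb") as f:
            return torch.frombuffer(f.read(), dtype=torch_dtype).reshape(shape).clone()
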
+# ==============================================================================
+# Test Methods
+# ==============================================================================
+
+def test_basic_print(device, test_case, dtype, config):
+    """Test the basic debug printing path."""
+    test_name, shape = test_case.args
+
+    print(f"Testing Basic Print on {InfiniDeviceNames[device]} with "
+          f"shape:{shape}, dtype:{dtype}")
+
+    device_str = torch_device_map[device]
+    torch_dtype = to_torch_dtype(dtype)
+
+    # Create the test tensor
+    torch_tensor = torch.arange(1, int(np.prod(shape)) + 1,
+                                dtype=torch_dtype, device=device_str).reshape(shape)
+
+    infini_tensor = create_infinicore_tensor(torch_tensor, device_str)
+
+    # Exercise debug printing (no file output)
+    infini_tensor.debug()
+
+    print("✓ Basic print test passed")
+
+
+def test_binary_save(device, test_case, dtype, config):
+    """Test saving in binary format."""
+    test_name, shape = test_case.args
+
+    print(f"Testing Binary Save on {InfiniDeviceNames[device]} with "
+          f"shape:{shape}, dtype:{dtype}")
+
+    device_str = torch_device_map[device]
+    torch_dtype = to_torch_dtype(dtype)
+
+    # Create the test tensor
+    torch_tensor = torch.arange(1, int(np.prod(shape)) + 1,
+                                dtype=torch_dtype, device=device_str).reshape(shape)
+
+    infini_tensor = create_infinicore_tensor(torch_tensor, device_str)
+
+    # Save to a binary file
+    bin_file = f"/tmp/debug_test_{device}_{dtype}_binary.bin"
+    infini_tensor.debug(bin_file)
+
+    # Verify the file exists
+    assert os.path.exists(bin_file), f"Binary file not created: {bin_file}"
+
+    # Verify the file size
+    expected_size = int(np.prod(shape)) * torch_tensor.element_size()
+    actual_size = os.path.getsize(bin_file)
+    assert actual_size == expected_size, \
+        f"Binary file size mismatch: {actual_size} vs {expected_size}"
+
+    # Read back with torch.frombuffer and verify
+    loaded_tensor = load_binary_with_torch(bin_file, dtype, shape)
+
+    # Move both tensors to CPU for comparison
+    torch_tensor_cpu = torch_tensor.cpu()
+    loaded_tensor_cpu = loaded_tensor.cpu()
+
+    tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-5})
+    assert torch.allclose(loaded_tensor_cpu, torch_tensor_cpu,
+                          atol=tolerance["atol"], rtol=tolerance["rtol"]), \
+        "Binary data mismatch"
+
+    # Clean up
+    os.remove(bin_file)
+    print("✓ Binary save test passed")
+
+
+def test_multidimensional(device, test_case, dtype, config):
+    """Test multidimensional tensors."""
+    test_name, shape = test_case.args
+
+    print(f"Testing Multidimensional on {InfiniDeviceNames[device]} with "
+          f"shape:{shape}, dtype:{dtype}")
+
+    device_str = torch_device_map[device]
+    torch_dtype = to_torch_dtype(dtype)
+
+    # Create a multidimensional tensor
+    torch_tensor = torch.arange(1, int(np.prod(shape)) + 1,
+                                dtype=torch_dtype, device=device_str).reshape(shape)
+
+    infini_tensor = create_infinicore_tensor(torch_tensor, device_str)
+
+    # Exercise printing
+    infini_tensor.debug()
+
+    # Save and read back
+    bin_file = f"/tmp/debug_test_multidim_{device}_{dtype}.bin"
+    infini_tensor.debug(bin_file)
+
+    assert os.path.exists(bin_file), "Multidimensional binary file not created"
+
+    # Verify
+    loaded_tensor = load_binary_with_torch(bin_file, dtype, shape)
+    torch_tensor_cpu = torch_tensor.cpu()
+    loaded_tensor_cpu = loaded_tensor.cpu()
+
+    tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-5})
+    assert torch.allclose(loaded_tensor_cpu, torch_tensor_cpu,
+                          atol=tolerance["atol"], rtol=tolerance["rtol"]), \
+        "Multidimensional data mismatch"
+
+    # Clean up
+    os.remove(bin_file)
+    print("✓ Multidimensional test passed")
+def test_non_contiguous_stride(device, test_case, dtype, config):
+    """Test non-contiguous memory layouts (is_contiguous=False, e.g. a transposed tensor)."""
+    test_name, shape = test_case.args
+
+    print(f"\n{'='*70}")
+    print(f"Testing Non-Contiguous Memory Layout on {InfiniDeviceNames[device]}")
+    print(f"  Shape: {shape}, Dtype: {dtype}")
+    print(f"{'='*70}")
+
+    device_str = torch_device_map[device]
+    torch_dtype = to_torch_dtype(dtype)
+
+    # Create a contiguous tensor
+    print(f"\nStep 1: Creating contiguous tensor...")
+    torch_tensor_orig = torch.arange(1, int(np.prod(shape)) + 1,
+                                     dtype=torch_dtype, device=device_str).reshape(shape)
+    print(f"  Original shape: {torch_tensor_orig.shape}")
+    print(f"  Original stride: {torch_tensor_orig.stride()}")
+    print(f"  Is contiguous: {torch_tensor_orig.is_contiguous()}")
+    print(f"  Data:\n{torch_tensor_orig}")
+
+    # Transpose to create a non-contiguous tensor
+    print(f"\nStep 2: Transposing to create non-contiguous tensor...")
+    torch_tensor_t = torch_tensor_orig.t()  # transpose
+    print(f"  Transposed shape: {torch_tensor_t.shape}")
+    print(f"  Transposed stride: {torch_tensor_t.stride()}")
+    print(f"  Is contiguous: {torch_tensor_t.is_contiguous()}")
+    print(f"  Data:\n{torch_tensor_t}")
+
+    # Create the non-contiguous InfiniCore tensor.
+    # Note: from_blob does not accept strides, so we build the non-contiguous
+    # view with as_strided, which here is equivalent to a transpose.
+    infini_tensor_orig = create_infinicore_tensor(torch_tensor_orig, device_str)
+    infini_tensor_t = infini_tensor_orig.as_strided(
+        list(torch_tensor_t.shape),
+        list(torch_tensor_t.stride())
+    )
+
+    print(f"\nStep 3: InfiniCore tensor after as_strided:")
+    print(f"  Shape: {infini_tensor_t.shape}")
+    print(f"  Stride: {infini_tensor_t.stride()}")
+    print(f"  Is contiguous: {infini_tensor_t.is_contiguous()}")
+
+    # ===== Binary format =====
+    print(f"\n{'='*70}")
+    print(f"Testing Binary Format (.bin) with Non-Contiguous Memory Layout")
+    print(f"{'='*70}")
+    print(f"Note: the binary format now SUPPORTS non-contiguous memory layouts!")
+    print(f"      Strides are detected and handled automatically.")
+
+    bin_file = f"/tmp/debug_non_contiguous_{device}_{dtype}.bin"
+    infini_tensor_t.debug(bin_file)
+
+    # Verify the binary file
+    assert os.path.exists(bin_file), f"Binary file not created: {bin_file}"
+
+    # Check the file size
+    actual_size = os.path.getsize(bin_file)
+    expected_size = int(np.prod(torch_tensor_t.shape)) * torch_tensor_t.element_size()
+
+    print(f"\nFile size check:")
+    print(f"  Expected: {expected_size} bytes ({int(np.prod(torch_tensor_t.shape))} elements)")
+    print(f"  Actual:   {actual_size} bytes")
+
+    assert actual_size == expected_size, \
+        f"File size mismatch: {actual_size} vs {expected_size}"
+    print(f"  ✓ File size is correct")
+
+    # Read back and verify the data
+    loaded_tensor = load_binary_with_torch(bin_file, dtype, torch_tensor_t.shape)
+    torch_tensor_cpu = torch_tensor_t.cpu()
+    loaded_tensor_cpu = loaded_tensor.cpu()
+
+    tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-5})
+
+    print(f"\nData verification:")
+    print(f"  Expected (first 2 rows):\n{torch_tensor_cpu[:2]}")
+    print(f"  Got (first 2 rows):\n{loaded_tensor_cpu[:2]}")
+
+    assert torch.allclose(loaded_tensor_cpu, torch_tensor_cpu,
+                          atol=tolerance["atol"], rtol=tolerance["rtol"]), \
+        "Data verification failed: loaded data doesn't match expected"
+
+    print(f"\n✓ Binary format: data matches perfectly!")
+    print(f"  The binary writer handles non-contiguous layouts correctly using strides")
+
+    # Clean up
+    os.remove(bin_file)
+
+    print(f"\n{'='*70}")
+    print(f"Non-Contiguous Memory Layout Test Summary:")
+    print(f"  ✅ Binary format (.bin): now supports non-contiguous memory!")
+    print(f"  Performance: contiguous tensors take the fast path; non-contiguous tensors use stride-based writing")
+    print(f"{'='*70}\n")
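
Behaviorally, the stride-aware slow path writes the same byte stream you would get by materializing a contiguous copy first. A reference comparison in PyTorch terms, assuming a CPU tensor and matching dtype; the /tmp path is illustrative:

    import torch

    t = torch.arange(12, dtype=torch.float32).reshape(3, 4).t()  # non-contiguous view
    # .contiguous() reorders storage into logical order, so tofile() emits
    # the same bytes the stride-aware writer produces for the view.
    t.contiguous().numpy().tofile("/tmp/ref.bin")
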
+def test_large_scale_binary_performance(device, test_case, dtype, config):
+    """Benchmark binary saving with large tensors (ten million elements)."""
+    test_name, shape = test_case.args
+
+    num_elements = int(np.prod(shape))
+    element_size_bytes = {
+        infinicore.float32: 4,
+        infinicore.float16: 2,
+        infinicore.bfloat16: 2,
+        infinicore.int32: 4,
+        infinicore.int64: 8,
+    }
+
+    total_size_mb = (num_elements * element_size_bytes.get(dtype, 4)) / (1024 * 1024)
+
+    print(f"\n{'='*70}")
+    print(f"Performance Test: Large Scale Binary Save")
+    print(f"  Device: {InfiniDeviceNames[device]}")
+    print(f"  Shape: {shape}")
+    print(f"  Elements: {num_elements:,}")
+    print(f"  Dtype: {dtype}")
+    print(f"  Expected file size: {total_size_mb:.2f} MB")
+    print(f"{'='*70}")
+
+    device_str = torch_device_map[device]
+    torch_dtype = to_torch_dtype(dtype)
+
+    # Create the large tensor
+    print(f"Creating tensor with {num_elements:,} elements...")
+    create_start = time.time()
+    torch_tensor = torch.randn(shape, dtype=torch_dtype, device=device_str)
+    create_time = time.time() - create_start
+    print(f"  Tensor creation time: {create_time:.4f} seconds")
+
+    infini_tensor = create_infinicore_tensor(torch_tensor, device_str)
+
+    # Measure save performance
+    bin_file = f"/tmp/debug_large_scale_{device}_{dtype}.bin"
+
+    print(f"\n{'='*70}")
+    print(f"[1/2] Writing Binary File")
+    print(f"{'='*70}")
+    print(f"File: {bin_file}")
+    save_start = time.time()
+    infini_tensor.debug(bin_file)
+    save_time = time.time() - save_start
+
+    # Verify the file exists
+    assert os.path.exists(bin_file), f"Binary file not created: {bin_file}"
+
+    # Actual file size
+    actual_size = os.path.getsize(bin_file)
+    actual_size_mb = actual_size / (1024 * 1024)
+
+    # Write throughput
+    write_throughput_mbps = actual_size_mb / save_time if save_time > 0 else 0
+
+    # Report write performance
+    print(f"\n✓ Write Performance:")
+    print(f"  File size: {actual_size_mb:.2f} MB ({actual_size:,} bytes)")
+    print(f"  Write time: {save_time:.4f} seconds")
+    print(f"  Write throughput: {write_throughput_mbps:.2f} MB/s")
+    print(f"  Elements written/sec: {num_elements/save_time:,.0f}")
+
+    # Measure read performance
+    print(f"\n{'='*70}")
+    print(f"[2/2] Reading Binary File (for verification)")
+    print(f"{'='*70}")
+    read_start = time.time()
+    loaded_tensor = load_binary_with_torch(bin_file, dtype, shape)
+    read_time = time.time() - read_start
+    read_throughput_mbps = actual_size_mb / read_time if read_time > 0 else 0
+
+    print(f"\n✓ Read Performance:")
+    print(f"  Read time: {read_time:.4f} seconds")
+    print(f"  Read throughput: {read_throughput_mbps:.2f} MB/s")
+    print(f"  Elements read/sec: {num_elements/read_time:,.0f}")
+
+    # Spot-check the first elements (skip full verification to save time)
+    torch_tensor_cpu = torch_tensor.cpu()
+    loaded_tensor_cpu = loaded_tensor.cpu()
+
+    sample_size = min(1000, num_elements)
+    tolerance = _TOLERANCE_MAP.get(dtype, {"atol": 0, "rtol": 1e-5})
+    assert torch.allclose(loaded_tensor_cpu.flatten()[:sample_size],
+                          torch_tensor_cpu.flatten()[:sample_size],
+                          atol=tolerance["atol"], rtol=tolerance["rtol"]), \
+        f"Data verification failed (sampled first {sample_size} elements)"
+
+    print(f"  Data verification: ✓ (sampled first {sample_size} elements)")
+
+    # Performance summary
+    print(f"\n{'='*70}")
+    print(f"Performance Summary")
+    print(f"{'='*70}")
+    print(f"  Elements: {num_elements:,}")
+    print(f"  File size: {actual_size_mb:.2f} MB")
+    print(f"  Write time: {save_time:.4f} sec → {write_throughput_mbps:.2f} MB/s")
+    print(f"  Read time: {read_time:.4f} sec → {read_throughput_mbps:.2f} MB/s")
+    print(f"  Speed ratio (Read/Write): {read_throughput_mbps/write_throughput_mbps:.2f}x")
+    print(f"{'='*70}")
+
+    # Clean up
+    os.remove(bin_file)
+    print(f"\n✓ Large scale performance test passed\n")
+
+
+# ==============================================================================
+# Main Execution Function
+# ==============================================================================
+
+def main():
+    args = get_args()
+
+    # Build the test configuration
+    config = TestConfig(
+        tensor_dtypes=_TENSOR_DTYPES,
+        tolerance_map=_TOLERANCE_MAP,
+        debug=args.debug,
+        bench=False,  # the debug tests do not need benchmarking
+    )
+
+    # Devices to test
+    devices = get_test_devices(args)
+
+    print("Starting debug tests...")
+
+    all_passed = True
+
+    # Run each test group
+    test_funcs = [
+        ("Basic Print", test_basic_print, [_TEST_CASES[0]]),
+        ("Binary Save", test_binary_save, [_TEST_CASES[1]]),
+        ("Multidimensional", test_multidimensional, [_TEST_CASES[2]]),
+    ]
+
+    for test_name, test_func, test_cases in test_funcs:
+        print(f"\n{'='*60}")
+        print(f"Testing {test_name}")
+        print(f"{'='*60}")
+
+        runner = TestRunner(test_cases, config)
+        passed = runner.run_tests(devices, test_func)
+        all_passed = all_passed and passed
+
+    # Run the non-contiguous layout tests
+    print(f"\n{'='*60}")
+    print(f"Testing Non-Contiguous Memory Layout (is_contiguous=False)")
+    print(f"{'='*60}")
+    non_contiguous_runner = TestRunner(_NON_CONTIGUOUS_TEST_CASES, config)
+    non_contiguous_passed = non_contiguous_runner.run_tests(devices, test_non_contiguous_stride)
+    all_passed = all_passed and non_contiguous_passed
+
+    # Run the large-scale performance test
+    print(f"\n{'='*60}")
+    print(f"Testing Large Scale Performance (10M elements)")
+    print(f"{'='*60}")
+
+    large_scale_runner = TestRunner(_LARGE_SCALE_TEST_CASES, config)
+    large_scale_passed = large_scale_runner.run_tests(devices, test_large_scale_binary_performance)
+    all_passed = all_passed and large_scale_passed
+
+    # Summary
+    print(f"\n{'='*60}")
+    print("Test Summary")
+    print(f"{'='*60}")
+
+    if all_passed:
+        print("\033[92m✅ All debug tests passed!\033[0m")
+    else:
+        print("\033[91m❌ Some tests failed!\033[0m")
+
+    sys.exit(0 if all_passed else 1)
+
+
+if __name__ == "__main__":
+    main()
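
Since the dump is a headerless raw array in logical order, any flat-binary reader works, not just torch.frombuffer. A NumPy alternative; the path and shape are illustrative:

    import numpy as np

    # np.fromfile reads the raw element stream; reshape restores the logical shape.
    arr = np.fromfile("/tmp/tensor.bin", dtype=np.float32).reshape(3, 4)
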