diff --git a/README.md b/README.md
index 10ec103ae..7bef3ba08 100644
--- a/README.md
+++ b/README.md
@@ -182,9 +182,9 @@ pip install . -e
 
 ```bash
 # 测试单算子
-python test/infinicore/ops/[operator].py [--bench | --debug] [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon]
+python test/infinicore/ops/[operator].py [--bench | --debug | --verbose] [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun | --Hygon]
 # 测试全部算子
-python test/infinicore/run.py [--bench | --debug] [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun]
+python test/infinicore/run.py [--bench | --debug | --verbose] [--cpu | --nvidia | --cambricon | --ascend | --iluvatar | --metax | --moore | --kunlun]
 ```
 
 使用 -h 查看更多参数。
diff --git a/test/infinicore/framework/base.py b/test/infinicore/framework/base.py
index 95d4a55f2..ebb889244 100644
--- a/test/infinicore/framework/base.py
+++ b/test/infinicore/framework/base.py
@@ -1,8 +1,9 @@
 import torch
 import infinicore
-
+import traceback
+from dataclasses import dataclass
 from abc import ABC, abstractmethod
-from typing import List, Dict, Any, Optional
+from typing import List, Dict, Any, Optional, Tuple
 
 from .datatypes import to_torch_dtype, to_infinicore_dtype
 from .devices import InfiniDeviceNames, torch_device_map
@@ -11,11 +12,21 @@
     create_test_comparator,
     infinicore_tensor_from_torch,
     profile_operation,
-    synchronize_device,
-    convert_infinicore_to_torch,
 )
 
 
+@dataclass
+class TestResult:
+    """Test result data structure"""
+    success: bool
+    return_code: int  # 0: success, -1: failure, -2: skipped, -3: partial
+    torch_time: float = 0.0
+    infini_time: float = 0.0
+    error_message: str = ""
+    test_case: Any = None
+    device: Any = None
+
+
 class TestCase:
     """Test case with all configuration included"""
 
@@ -24,11 +35,11 @@ def __init__(
         inputs,
         kwargs=None,
         output_spec=None,
+        output_specs=None,
         comparison_target=None,
         description="",
         tolerance=None,
         output_count=1,
-        output_specs=None,
     ):
         """
         Initialize a test case with complete configuration
@@ -216,14 +227,19 @@ def __str__(self):
 class TestConfig:
     """Test configuration"""
 
-    def __init__(self, debug=False, bench=False, num_prerun=10, num_iterations=1000):
+    def __init__(
+        self,
+        debug=False,
+        bench=False,
+        num_prerun=10,
+        num_iterations=1000,
+        verbose=False,
+    ):
         self.debug = debug
         self.bench = bench
         self.num_prerun = num_prerun
         self.num_iterations = num_iterations
-
-
-# In base.py - update the TestRunner class
+        self.verbose = verbose
 
 
 class TestRunner:
@@ -238,6 +254,14 @@ def __init__(self, test_cases, test_config):
         self.passed_tests = (
             []
         )  # Track passed tests (both operators implemented and passed)
+        # Add benchmark timing statistics
+        self.benchmark_times = {
+            "torch_total": 0.0,
+            "infinicore_total": 0.0,
+            "per_test_case": {},  # Store timing per test case
+        }
+        # Store test results
+        self.test_results = []
 
     def run_tests(self, devices, test_func, test_type="Test"):
         """
@@ -260,30 +284,30 @@ def run_tests(self, devices, test_func, test_type="Test"):
                 try:
                     print(f"{test_case}")
 
-                    # Execute test and get result status
-                    success, status = test_func(device, test_case, self.config)
+                    # Execute test and get TestResult object
+                    test_result = test_func(device, test_case, self.config)
+                    self.test_results.append(test_result)
 
-                    # Handle different test statuses
-                    if status == "passed":
+                    # Handle different test statuses based on return_code
+                    if test_result.return_code == 0:  # Success
                         self.passed_tests.append(
                             f"{test_case} - {InfiniDeviceNames[device]}"
                         )
                         print(f"\033[92m✓\033[0m Passed")
-                    elif status == "skipped":
-                        # Test was skipped due to both operators not being implemented
+                    elif test_result.return_code == -1:
+                        fail_msg = f"{test_case} - {InfiniDeviceNames[device]} - Test terminated in verbose mode."
+                        self.failed_tests.append(fail_msg)
+                    elif test_result.return_code == -2:  # Skipped
                         skip_msg = f"{test_case} - {InfiniDeviceNames[device]} - Both operators not implemented"
                         self.skipped_tests.append(skip_msg)
-                        print(
-                            f"\033[93m⚠\033[0m Skipped - both operators not implemented"
-                        )
-                    elif status == "partial":
-                        # Test was partially executed (one operator not implemented)
+                        print(f"\033[93m⚠\033[0m Both operators not implemented - test skipped")
+                    elif test_result.return_code == -3:  # Partial
                         partial_msg = f"{test_case} - {InfiniDeviceNames[device]} - One operator not implemented"
                         self.partial_tests.append(partial_msg)
-                        print(
-                            f"\033[93m⚠\033[0m Partial - one operator not implemented"
-                        )
-                    # Failed tests are handled in the exception handler below
+                        print(f"\033[93m⚠\033[0m One operator not implemented - running single operator without comparison")
+
+                    if self.config.verbose and test_result.return_code != 0:
+                        return False
 
                 except Exception as e:
                     error_msg = (
@@ -291,11 +315,25 @@ def run_tests(self, devices, test_func, test_type="Test"):
                     )
                     print(f"\033[91m✗\033[0m {error_msg}")
                     self.failed_tests.append(error_msg)
+                    
+                    # Create a failed TestResult
+                    failed_result = TestResult(
+                        success=False,
+                        return_code=-1,
+                        error_message=str(e),
+                        test_case=test_case,
+                        device=device
+                    )
+                    self.test_results.append(failed_result)
+                    # In verbose mode, print full traceback and stop execution
+                    if self.config.verbose:
+                        traceback.print_exc()
+                        return False  # Stop test execution immediately
+
                     if self.config.debug:
                         raise
 
-        # Return True if no tests failed (skipped/partial tests don't count as failures)
-        return len(self.failed_tests) == 0
+        return len(self.failed_tests) == 0 and len(self.skipped_tests) == 0 and len(self.partial_tests) == 0
 
     def print_summary(self):
         """
@@ -312,34 +350,16 @@ def print_summary(self):
 
         print(f"\n{'='*60}")
         print("TEST SUMMARY")
-        print(f"{'='*60}")
         print(f"Total tests: {total_tests}")
         print(f"\033[92mPassed: {passed_count}\033[0m")
 
-        # Display partial tests (one operator not implemented)
-        if self.partial_tests:
-            print(
-                f"\033[93mPartial (one operator not implemented): {partial_count}\033[0m"
-            )
-            for test in self.partial_tests:
-                print(f"  - {test}")
-
-        # Display skipped tests (both operators not implemented)
-        if self.skipped_tests:
-            print(
-                f"\033[93mSkipped (both operators not implemented): {skipped_count}\033[0m"
-            )
-            for test in self.skipped_tests:
-                print(f"  - {test}")
-
+        result = True
         # Display failed tests
         if self.failed_tests:
             print(f"\033[91mFailed: {failed_count}\033[0m")
-            for failure in self.failed_tests:
-                print(f"  - {failure}")
 
             # Return False only if there are actual test failures
-            return False
+            result = False
         else:
             # Calculate success rate based on actual executed tests
             executed_tests = passed_count + partial_count + failed_count
@@ -352,10 +372,41 @@ def print_summary(self):
                 print(
                     f"\n\033[93mTests completed with some implementations missing\033[0m"
                 )
-                return True  # Skipped/partial tests don't count as failures
             else:
                 print(f"\n\033[92mAll tests passed!\033[0m")
-                return True
+
+        # Print benchmark summary if benchmarking was enabled
+        if self.config.bench and (
+            self.benchmark_times["torch_total"] > 0
+            or self.benchmark_times["infinicore_total"] > 0
+        ):
+            self._print_benchmark_summary()
+
+        print(f"{'='*60}")
+        return result
+
+    def _print_benchmark_summary(self):
+        """Print benchmark timing summary"""
+        print(f"{'-'*60}")
+        print("BENCHMARK SUMMARY")
+
+        torch_total = self.benchmark_times["torch_total"]
+        infinicore_total = self.benchmark_times["infinicore_total"]
+
+        if torch_total > 0:
+            print(f"PyTorch Total Time: {torch_total * 1000:.3f} ms")
+        if infinicore_total > 0:
+            print(f"InfiniCore Total Time: {infinicore_total * 1000:.3f} ms")
+
+        if torch_total > 0 and infinicore_total > 0:
+            speedup = (
+                torch_total / infinicore_total if infinicore_total > 0 else float("inf")
+            )
+            print(f"Speedup (PyTorch/InfiniCore): {speedup:.2f}x")
+
+    def get_test_results(self):
+        """Get all test results"""
+        return self.test_results
 
 
 class BaseOperatorTest(ABC):
@@ -460,11 +511,17 @@ def run_test(self, device, test_case, config):
             config: Test configuration
 
         Returns:
-            tuple: (success, status) where:
-                success: bool indicating if test passed
-                status: str describing test status ("passed", "skipped", "partial")
+            TestResult: Test result object containing status and timing information
         """
         device_str = torch_device_map[device]
+        
+        # Initialize test result
+        test_result = TestResult(
+            success=False,
+            return_code=-1,  # Default to failure
+            test_case=test_case,
+            device=device
+        )
 
         # Prepare inputs and kwargs with actual tensors
         inputs, kwargs = self.prepare_inputs_and_kwargs(test_case, device)
@@ -537,6 +594,12 @@ def run_test(self, device, test_case, config):
             if torch_result is None:
                 torch_implemented = False
         except NotImplementedError:
+            if config.verbose:
+                traceback.print_exc()
+                # Return test result immediately in verbose mode
+                test_result.return_code = -1
+                test_result.error_message = "torch_operator not implemented"
+                return test_result
             torch_implemented = False
             torch_result = None
 
@@ -545,25 +608,26 @@ def run_test(self, device, test_case, config):
             if infini_result is None:
                 infini_implemented = False
         except NotImplementedError:
+            if config.verbose:
+                traceback.print_exc()
+                # Return test result immediately in verbose mode
+                test_result.return_code = -1
+                test_result.error_message = "infinicore_operator not implemented"
+                return test_result
             infini_implemented = False
             infini_result = None
 
         # Skip if neither operator is implemented
         if not torch_implemented and not infini_implemented:
-            print(f"\033[93m⚠\033[0m Both operators not implemented - test skipped")
-            return False, "skipped"
+            test_result.return_code = -2  # Skipped
+            return test_result
 
         # Single operator execution without comparison
         if not torch_implemented or not infini_implemented:
-            missing_op = (
-                "torch_operator" if not torch_implemented else "infinicore_operator"
-            )
-            print(
-                f"\033[93m⚠\033[0m {missing_op} not implemented - running single operator without comparison"
-            )
-
+            test_result.return_code = -3  # Partial
+            # Run benchmarking for partial tests if enabled
             if config.bench:
-                self._run_benchmarking(
+                torch_time, infini_time = self._run_benchmarking(
                     config,
                     device_str,
                     torch_implemented,
@@ -575,8 +639,9 @@ def run_test(self, device, test_case, config):
                     test_case.output_count,
                     comparison_target,
                 )
-            return False, "partial"
-
+                test_result.torch_time = torch_time
+                test_result.infini_time = infini_time
+            return test_result
         # ==========================================================================
         # MULTIPLE OUTPUTS COMPARISON LOGIC
         # ==========================================================================
@@ -685,7 +750,7 @@ def run_test(self, device, test_case, config):
         # UNIFIED BENCHMARKING LOGIC
         # ==========================================================================
         if config.bench:
-            self._run_benchmarking(
+            torch_time, infini_time = self._run_benchmarking(
                 config,
                 device_str,
                 True,
@@ -697,9 +762,13 @@ def run_test(self, device, test_case, config):
                 test_case.output_count,
                 comparison_target,
             )
+            test_result.torch_time = torch_time
+            test_result.infini_time = infini_time
 
         # Test passed successfully
-        return True, "passed"
+        test_result.success = True
+        test_result.return_code = 0
+        return test_result
 
     def _run_benchmarking(
         self,
@@ -715,8 +784,15 @@ def _run_benchmarking(
         comparison_target,
     ):
         """
-        Unified benchmarking logic
+        Unified benchmarking logic with timing accumulation
+
+        Returns:
+            tuple: (torch_time, infini_time) total elapsed seconds over all timed iterations; 0.0 for an operator that was not profiled
         """
+        # Initialize timing variables
+        torch_time = 0.0
+        infini_time = 0.0
+
         if torch_implemented:
             if output_count > 1:
                 # For multiple outputs, just call the operator
@@ -739,12 +815,13 @@ def torch_op():
                             else inputs[comparison_target]
                         )
 
-            profile_operation(
+            torch_time = profile_operation(
                 "PyTorch   ",
                 torch_op,
                 device_str,
                 config.num_prerun,
                 config.num_iterations,
+                total=True,
             )
 
         if infini_implemented:
@@ -763,10 +840,19 @@ def infini_op():
                         else infini_inputs[comparison_target]
                     )
 
-            profile_operation(
+            infini_time = profile_operation(
                 "InfiniCore",
                 infini_op,
                 device_str,
                 config.num_prerun,
                 config.num_iterations,
+                total=True,
             )
+
+        # Store timing information in the test runner
+        if hasattr(config, "_test_runner") and config._test_runner:
+            # Accumulate total times
+            config._test_runner.benchmark_times["torch_total"] += torch_time
+            config._test_runner.benchmark_times["infinicore_total"] += infini_time
+
+        return torch_time, infini_time
diff --git a/test/infinicore/framework/config.py b/test/infinicore/framework/config.py
index 8b09ea90c..ccbff88e6 100644
--- a/test/infinicore/framework/config.py
+++ b/test/infinicore/framework/config.py
@@ -1,7 +1,6 @@
 import argparse
 from .devices import InfiniDeviceEnum
 
-# hardware_info.py
 """
 Shared hardware platform information for the InfiniCore testing framework
 """
@@ -61,6 +60,9 @@ def get_args():
   # Run with debug mode on multiple devices
   python test_operator.py --cpu --nvidia --debug
 
+  # Run with verbose mode to stop on first error with full traceback
+  python test_operator.py --cpu --nvidia --verbose
+
   # Run performance profiling with custom iterations
   python test_operator.py --nvidia --bench --num_prerun 50 --num_iterations 5000
 
@@ -90,11 +92,17 @@ def get_args():
         action="store_true",
         help="Enable debug mode for detailed tensor comparison",
     )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="Enable verbose mode to stop on first error with full traceback",
+    )
 
     # Device options using shared hardware info
     hardware_group = get_hardware_args_group(parser)
+    args, unknown = parser.parse_known_args()
 
-    return parser.parse_args()
+    return args
 
 
 def get_test_devices(args):
diff --git a/test/infinicore/framework/runner.py b/test/infinicore/framework/runner.py
index 336686824..c0de4a7f9 100644
--- a/test/infinicore/framework/runner.py
+++ b/test/infinicore/framework/runner.py
@@ -21,16 +21,23 @@ def run(self):
         """Execute the complete test suite
 
         Returns:
-            bool: True if all tests passed or were skipped/partial, False if any tests failed
+            tuple: (success, test_runner) where:
+                success: bool, True only when no test failed, was skipped, or ran partially
+                test_runner: TestRunner instance with test results
         """
         config = TestConfig(
             debug=self.args.debug,
             bench=self.args.bench,
             num_prerun=self.args.num_prerun,
             num_iterations=self.args.num_iterations,
+            verbose=self.args.verbose,  # Pass verbose flag to TestConfig
         )
 
         runner = TestRunner(self.operator_test.test_cases, config)
+
+        # Pass the test runner instance to config for benchmark timing accumulation
+        config._test_runner = runner
+
         devices = get_test_devices(self.args)
 
         # Run unified tests - returns True if no tests failed
@@ -46,7 +53,7 @@ def run(self):
         # Both conditions must be True for overall success
         # - has_no_failures: no test failures during execution
         # - summary_passed: summary confirms no failures
-        return has_no_failures and summary_passed
+        return (has_no_failures and summary_passed), runner
 
     def run_and_exit(self):
         """Run tests and exit with appropriate status code
@@ -55,5 +62,5 @@ def run_and_exit(self):
             0: All tests passed or were skipped/partial (no failures)
             1: One or more tests failed
         """
-        success = self.run()
+        success, runner = self.run()
         sys.exit(0 if success else 1)
diff --git a/test/infinicore/framework/utils.py b/test/infinicore/framework/utils.py
index 2448e3857..051a30321 100644
--- a/test/infinicore/framework/utils.py
+++ b/test/infinicore/framework/utils.py
@@ -22,10 +22,12 @@ def timed_op(func, num_iterations, device):
     for _ in range(num_iterations):
         func()
     synchronize_device(device)
-    return (time.time() - start) / num_iterations
+    return time.time() - start
 
 
-def profile_operation(desc, func, torch_device, num_prerun, num_iterations):
+def profile_operation(
+    desc, func, torch_device, num_prerun, num_iterations, total=False
+):
     """
     Performance profiling workflow
     """
@@ -35,7 +37,11 @@ def profile_operation(desc, func, torch_device, num_prerun, num_iterations):
 
     # Timed execution
     elapsed = timed_op(lambda: func(), num_iterations, torch_device)
-    print(f"    {desc} time: {elapsed * 1000 :6f} ms")
+    print(f"    {desc} time: {elapsed / num_iterations * 1000 :6f} ms")
+    if total:
+        return elapsed
+    else:
+        return elapsed / num_iterations
 
 
 def debug(actual, desired, atol=0, rtol=1e-2, equal_nan=False, verbose=True):
diff --git a/test/infinicore/ops/elu.py b/test/infinicore/ops/elu.py
index 48cd846c0..92d2072d3 100644
--- a/test/infinicore/ops/elu.py
+++ b/test/infinicore/ops/elu.py
@@ -133,9 +133,9 @@ def torch_operator(self, *args, **kwargs):
         """PyTorch ELU implementation"""
         return torch.nn.functional.elu(*args, **kwargs)
 
-    def infinicore_operator(self, x, alpha=1.0, out=None, **kwargs):
-        """InfiniCore ELU implementation"""
-        return None
+    # def infinicore_operator(self, x, alpha=1.0, out=None, **kwargs):
+    #     """InfiniCore ELU implementation"""
+    #     return None
 
 
 def main():
diff --git a/test/infinicore/ops/multi_margin_loss.py b/test/infinicore/ops/multi_margin_loss.py
index d4620f109..cc8f0da5c 100644
--- a/test/infinicore/ops/multi_margin_loss.py
+++ b/test/infinicore/ops/multi_margin_loss.py
@@ -103,7 +103,7 @@ def parse_test_cases():
     return test_cases
 
 
-class MultiMarginLossOpTest(BaseOperatorTest):
+class OpTest(BaseOperatorTest):
     """MultiMarginLoss operator test with device handling"""
 
     def __init__(self):
@@ -116,9 +116,9 @@ def torch_operator(self, *args, **kwargs):
         """PyTorch multi_margin_loss implementation with device handling"""
         return F.multi_margin_loss(*args, **kwargs)
 
-    def infinicore_operator(self, *args, **kwargs):
-        """InfiniCore multi_margin_loss implementation"""
-        return None
+    # def infinicore_operator(self, *args, **kwargs):
+    #     """InfiniCore multi_margin_loss implementation"""
+    #     return None
 
 
 def main():
diff --git a/test/infinicore/run.py b/test/infinicore/run.py
index 32d52bfc6..ff642b8db 100644
--- a/test/infinicore/run.py
+++ b/test/infinicore/run.py
@@ -1,9 +1,10 @@
 import os
 import sys
-import subprocess
 import argparse
 from pathlib import Path
-from typing import Dict, Tuple, List
+import importlib.util
+
+from framework import get_hardware_args_group
 
 
 def find_ops_directory(location=None):
@@ -58,9 +59,59 @@ def get_available_operators(ops_dir):
     return sorted(operators)
 
 
-def run_all_op_tests(ops_dir=None, specific_ops=None, extra_args=None):
+def import_operator_test(test_file_path):
+    """
+    Import an operator test module and return the test class instance.
+
+    Args:
+        test_file_path: Path to the test file
+
+    Returns:
+        tuple: (success, test_instance_or_error)
+    """
+    try:
+        # Create a unique module name
+        module_name = f"op_test_{test_file_path.stem}"
+
+        # Load the module from file
+        spec = importlib.util.spec_from_file_location(module_name, test_file_path)
+        if spec is None or spec.loader is None:
+            return False, f"Could not load module from {test_file_path}"
+
+        module = importlib.util.module_from_spec(spec)
+
+        # Add the module to sys.modules
+        sys.modules[module_name] = module
+
+        # Execute the module
+        spec.loader.exec_module(module)
+
+        # Find the test class (usually named OpTest)
+        test_class = None
+        for attr_name in dir(module):
+            attr = getattr(module, attr_name)
+            if (
+                isinstance(attr, type)
+                and hasattr(attr, "__bases__")
+                and any("BaseOperatorTest" in str(base) for base in attr.__bases__)
+            ):
+                test_class = attr
+                break
+
+        if test_class is None:
+            return False, f"No test class found in {test_file_path}"
+
+        # Create an instance
+        test_instance = test_class()
+        return True, test_instance
+
+    except Exception as e:
+        return False, f"Error importing {test_file_path}: {str(e)}"
+
+
+def run_all_op_tests(ops_dir=None, specific_ops=None, bench=False, verbose=False):
     """
-    Run all operator test scripts in the ops directory.
+    Run all operator test scripts in the ops directory using direct import.
 
     Args:
         ops_dir (str, optional): Path to the ops directory. If None, uses auto-detection.
@@ -68,7 +119,7 @@ def run_all_op_tests(ops_dir=None, specific_ops=None, extra_args=None):
         extra_args (list, optional): Extra command line arguments to pass to test scripts.
 
     Returns:
-        dict: Results dictionary with test names as keys and (success, return_code, stdout, stderr) as values.
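+        tuple: (results, cumulative_timing) - results maps each test name to a dict with keys success, return_code, torch_time, infini_time, error_message, test_runner, stdout, stderr; cumulative_timing holds the summed benchmark times and the operators_tested count.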
     """
     if ops_dir is None:
         ops_dir = find_ops_directory()
@@ -122,92 +173,184 @@ def run_all_op_tests(ops_dir=None, specific_ops=None, extra_args=None):
 
     results = {}
 
+    cumulative_timing = {
+        "total_torch_time": 0.0,
+        "total_infinicore_time": 0.0,
+        "operators_tested": 0,
+    }
+
     for test_file in operator_test_files:
         test_name = test_file.stem
 
         try:
-            # Run the test script - use the absolute path and run from current directory
-            cmd = [sys.executable, str(test_file.absolute())]
-
-            # Add extra arguments if provided
-            if extra_args:
-                cmd.extend(extra_args)
-
-            result = subprocess.run(
-                cmd,
-                capture_output=True,  # Capture output to analyze
-                text=True,
-            )
-
-            # Analyze output to determine test status
-            stdout_lower = result.stdout.lower()
-            stderr_lower = result.stderr.lower()
-
-            # Check for operator not implemented patterns
-            if (
-                "all tests passed!" in stdout_lower
-                and "success rate: 100.0%" in stdout_lower
-            ):
-                success = True
-                returncode = 0
-            elif "both operators not implemented" in stdout_lower:
-                # Both operators not implemented - skipped test
-                success = False  # Not a failure, but skipped
-                returncode = -2  # Special code for skipped
-            elif "one operator not implemented" in stdout_lower:
-                # One operator not implemented - partial test
-                success = False  # Not fully successful
-                returncode = -3  # Special code for partial
-            else:
-                success = False
-                returncode = -1
-
-            results[test_name] = (
-                success,
-                returncode,
-                result.stdout,
-                result.stderr,
-            )
-
-            # Print the output from the test script
-            print(f"\n{'='*60}")
-            print(f"TEST: {test_name}")
-            print(f"{'='*60}")
-
-            if result.stdout:
-                print(result.stdout.rstrip())
-
-            if result.stderr:
-                print("\nSTDERR:")
-                print(result.stderr.rstrip())
-
-            # Enhanced status display
-            if returncode == -2:
-                status_icon = "⏭️"
-                status_text = "SKIPPED"
-            elif returncode == -3:
-                status_icon = "⚠️"
-                status_text = "PARTIAL"
-            elif success:
-                status_icon = "✅"
-                status_text = "PASSED"
-            else:
-                status_icon = "❌"
-                status_text = "FAILED"
-
-            print(
-                f"{status_icon}  {test_name}: {status_text} (return code: {returncode})"
-            )
+            # Import and run the test directly
+            success, test_instance_or_error = import_operator_test(test_file)
+
+            if not success:
+                print(f"💥 {test_name}: ERROR - {test_instance_or_error}")
+                results[test_name] = {
+                    "success": False,
+                    "return_code": -1,
+                    "torch_time": 0.0,
+                    "infini_time": 0.0,
+                    "error_message": test_instance_or_error,
+                    "test_runner": None,
+                    "stdout": "",
+                    "stderr": test_instance_or_error,
+                }
+                continue
+
+            # Get the test runner class from the module
+            test_module = sys.modules[f"op_test_{test_file.stem}"]
+            if not hasattr(test_module, "GenericTestRunner"):
+                print(f"💥 {test_name}: ERROR - No GenericTestRunner found")
+                results[test_name] = {
+                    "success": False,
+                    "return_code": -1,
+                    "torch_time": 0.0,
+                    "infini_time": 0.0,
+                    "error_message": "No GenericTestRunner found",
+                    "test_runner": None,
+                    "stdout": "",
+                    "stderr": "No GenericTestRunner found",
+                }
+                continue
+
+            # Create and run the test runner
+            test_runner_class = test_module.GenericTestRunner
+            runner_instance = test_runner_class(test_instance_or_error.__class__)
+
+            # Temporarily redirect stdout and stderr to capture output
+            from io import StringIO
+
+            stdout_capture = StringIO()
+            stderr_capture = StringIO()
+
+            old_stdout = sys.stdout
+            old_stderr = sys.stderr
+            sys.stdout = stdout_capture
+            sys.stderr = stderr_capture
+
+            try:
+                # Run the test
+                test_success, test_runner = runner_instance.run()
+
+                # Get captured output
+                stdout_output = stdout_capture.getvalue()
+                stderr_output = stderr_capture.getvalue()
+
+                # Restore stdout/stderr
+                sys.stdout = old_stdout
+                sys.stderr = old_stderr
+
+                # Print the captured output
+                if stdout_output:
+                    print(stdout_output.rstrip())
+                if stderr_output:
+                    print("\nSTDERR:")
+                    print(stderr_output.rstrip())
+
+                # Analyze test results
+                test_results = test_runner.get_test_results() if test_runner else []
+
+                # Determine overall test status
+                if test_success:
+                    return_code = 0
+                    status_icon = "✅"
+                    status_text = "PASSED"
+                else:
+                    # Check if there are any failed tests
+                    has_failures = any(
+                        result.return_code == -1 for result in test_results
+                    )
+                    has_partial = any(
+                        result.return_code == -3 for result in test_results
+                    )
+                    has_skipped = any(
+                        result.return_code == -2 for result in test_results
+                    )
+
+                    if has_failures:
+                        return_code = -1
+                        status_icon = "❌"
+                        status_text = "FAILED"
+                    elif has_partial:
+                        return_code = -3
+                        status_icon = "⚠️"
+                        status_text = "PARTIAL"
+                    elif has_skipped:
+                        return_code = -2
+                        status_icon = "⏭️"
+                        status_text = "SKIPPED"
+                    else:
+                        return_code = -1
+                        status_icon = "❌"
+                        status_text = "FAILED"
+
+                # Calculate timing
+                torch_time = sum(result.torch_time for result in test_results)
+                infini_time = sum(result.infini_time for result in test_results)
+
+                results[test_name] = {
+                    "success": test_success,
+                    "return_code": return_code,
+                    "torch_time": torch_time,
+                    "infini_time": infini_time,
+                    "error_message": "",
+                    "test_runner": test_runner,
+                    "stdout": stdout_output,
+                    "stderr": stderr_output,
+                }
+
+                print(
+                    f"{status_icon}  {test_name}: {status_text} (return code: {return_code})"
+                )
+
+                # Accumulate benchmark timing in bench mode, only for operators that fully passed
+                if bench and test_success and return_code == 0:
+                    cumulative_timing["total_torch_time"] += torch_time
+                    cumulative_timing["total_infinicore_time"] += infini_time
+                    cumulative_timing["operators_tested"] += 1
+
+            except Exception as e:
+                # Restore stdout/stderr in case of exception
+                sys.stdout = old_stdout
+                sys.stderr = old_stderr
+                raise e
+
+            # In verbose mode, stop at the first operator that did not fully pass (failed, skipped, or partial)
+            if verbose and not test_success and return_code != 0:
+                break
 
         except Exception as e:
             print(f"💥 {test_name}: ERROR - {str(e)}")
-            results[test_name] = (False, -1, "", str(e))
-
-    return results
-
-
-def print_summary(results):
-    """Print a comprehensive summary of test results."""
+            results[test_name] = {
+                "success": False,
+                "return_code": -1,
+                "torch_time": 0.0,
+                "infini_time": 0.0,
+                "error_message": str(e),
+                "test_runner": None,
+                "stdout": "",
+                "stderr": str(e),
+            }
+
+            # In verbose mode, stop execution on any exception
+            if verbose:
+                print(f"\n{'!'*60}")
+                print(
+                    f"VERBOSE MODE: Stopping execution due to exception in {test_name}"
+                )
+                print(f"{'!'*60}")
+                break
+
+    return results, cumulative_timing
+
+
+def print_summary(
+    results, verbose=False, total_expected_tests=0, cumulative_timing=None
+):
+    """Print a comprehensive summary of test results including benchmark data."""
     print(f"\n{'='*80}")
     print("CUMULATIVE TEST SUMMARY")
     print(f"{'='*80}")
@@ -226,14 +369,15 @@ def print_summary(results):
     skipped_operators = []  # Store skipped operator names
     partial_operators = []  # Store partial operator names
 
-    for test_name, (success, returncode, stdout, stderr) in results.items():
-        if success:
+    for test_name, result_data in results.items():
+        return_code = result_data["return_code"]
+        if return_code == 0:
             passed += 1
             passed_operators.append(test_name)
-        elif returncode == -2:  # Special code for skipped tests
+        elif return_code == -2:  # Special code for skipped tests
             skipped += 1
             skipped_operators.append(test_name)
-        elif returncode == -3:  # Special code for partial tests
+        elif return_code == -3:  # Special code for partial tests
             partial += 1
             partial_operators.append(test_name)
         else:
@@ -242,7 +386,11 @@ def print_summary(results):
 
     total = len(results)
 
-    print(f"Total tests: {total}")
+    print(f"Total tests run: {total}")
+    if total_expected_tests > 0 and total < total_expected_tests:
+        print(f"Total tests expected: {total_expected_tests}")
+        print(f"Tests not executed: {total_expected_tests - total}")
+
     print(f"Passed: {passed}")
     print(f"Failed: {failed}")
 
@@ -252,6 +400,19 @@ def print_summary(results):
     if partial > 0:
         print(f"Partial: {partial}")
 
+    # Print benchmark summary if cumulative_timing data is available
+    if cumulative_timing and cumulative_timing["operators_tested"] > 0:
+        print(f"{'-'*40}")
+        print("BENCHMARK SUMMARY:")
+        print(f"  Operators Tested: {cumulative_timing['operators_tested']}")
+        print(
+            f"  PyTorch    Total Time: {cumulative_timing['total_torch_time'] * 1000:12.3f} ms"
+        )
+        print(
+            f"  InfiniCore Total Time: {cumulative_timing['total_infinicore_time'] * 1000:12.3f} ms"
+        )
+        print(f"{'-'*40}")
+
     # Display passed operators
     if passed_operators:
         print(f"\n✅ PASSED OPERATORS ({len(passed_operators)}):")
@@ -284,12 +445,16 @@ def print_summary(results):
             print("  " + ", ".join(line_ops))
 
     if total > 0:
-        # Calculate success rate based on executed tests only
+        # Calculate success rate over executed tests only (skipped tests are excluded)
         executed_tests = passed + failed + partial
         if executed_tests > 0:
             success_rate = passed / executed_tests * 100
             print(f"\nSuccess rate: {success_rate:.1f}%")
 
+    if verbose and total < total_expected_tests:
+        print(f"\n💡 Verbose mode: Execution stopped after first failure")
+        print(f"   {total_expected_tests - total} tests were not executed")
+
     if failed == 0:
         if skipped > 0 or partial > 0:
             print(f"\n⚠️  Tests completed with some operators not implemented")
@@ -358,6 +523,14 @@ def generate_help_epilog(ops_dir):
     epilog_parts.append("  # Run with debug mode on multiple devices")
     epilog_parts.append("  python run.py --cpu --nvidia --debug")
     epilog_parts.append("")
+    epilog_parts.append(
+        "  # Run with verbose mode to stop on first error with full traceback"
+    )
+    epilog_parts.append("  python run.py --cpu --nvidia --verbose")
+    epilog_parts.append("")
+    epilog_parts.append("  # Run with benchmarking to get cumulative timing")
+    epilog_parts.append("  python run.py --cpu --bench")
+    epilog_parts.append("")
     epilog_parts.append("  # List available tests without running")
     epilog_parts.append("  python run.py --list")
     epilog_parts.append("")
@@ -384,7 +557,13 @@ def generate_help_epilog(ops_dir):
         "  - Operators are automatically discovered from the ops directory"
     )
     epilog_parts.append(
-        "  - --bench option is disabled in batch mode (run individual tests for benchmarking)"
+        "  - --bench mode now shows cumulative timing across all operators"
+    )
+    epilog_parts.append(
+        "  - --verbose mode stops execution on first error and shows full traceback"
+    )
+    epilog_parts.append(
+        "  - In verbose mode, subsequent tests are skipped after first failure"
     )
 
     return "\n".join(epilog_parts)
@@ -413,15 +592,21 @@ def main():
         action="store_true",
         help="List all available test files without running them",
     )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="Enable verbose mode to stop on first error with full traceback",
+    )
+    parser.add_argument(
+        "--bench",
+        action="store_true",
+        help="Enable bench mode to show performance data",
+    )
 
-    from framework import get_hardware_args_group
-
-    if "-h" in sys.argv or "--help" in sys.argv:
-        get_hardware_args_group(parser)
+    get_hardware_args_group(parser)
 
     # Parse known args first, leave the rest for the test scripts
     args, unknown_args = parser.parse_known_args()
-    get_hardware_args_group(parser)
 
     # Handle list command
     if args.list:
@@ -453,6 +638,9 @@ def main():
     print(f"Operating directory: {ops_dir}")
     print(f"Available operators: {len(available_operators)}")
 
+    if args.verbose:
+        print(f"Verbose mode: ENABLED (will stop on first error with full traceback)")
+
     if args.ops:
         # Validate requested operators
         valid_ops = []
@@ -469,32 +657,50 @@ def main():
 
         if valid_ops:
             print(f"Testing operators: {', '.join(valid_ops)}")
+            total_expected_tests = len(valid_ops)
         else:
             print("No valid operators specified. Running all available tests.")
+            total_expected_tests = len(available_operators)
     else:
         print("Testing all available operators")
+        total_expected_tests = len(available_operators)
 
     print()
 
     # Run all tests
-    results = run_all_op_tests(
+    results, cumulative_timing = run_all_op_tests(
         ops_dir=ops_dir,
         specific_ops=args.ops,
-        extra_args=unknown_args,
+        bench=args.bench,
+        verbose=args.verbose,
     )
 
     # Print summary and exit with appropriate code
-    all_passed = print_summary(results)
+    all_passed = print_summary(
+        results, args.verbose, total_expected_tests, cumulative_timing
+    )
 
     # Check if there were any tests with missing implementations
     has_missing_implementations = any(
-        returncode in [-2, -3] for _, (_, returncode, _, _) in results.items()
+        result_data["return_code"] in [-2, -3] for result_data in results.values()
     )
 
     if all_passed and has_missing_implementations:
         print(f"\n⚠️  Note: Some operators are not fully implemented")
         print(f"   Run individual tests for details on missing implementations")
 
+    if args.verbose and not all_passed:
+        print(
+            f"\n💡 Verbose mode tip: Use individual test commands for detailed debugging:"
+        )
+        failed_ops = [
+            name
+            for name, result_data in results.items()
+            if result_data["return_code"] == -1
+        ]
+        for op in failed_ops[:3]:  # Show first 3 failed operators
+            print(f"   python {ops_dir / (op + '.py')} --verbose")
+
     sys.exit(0 if all_passed else 1)