# 🧠 Memory & Performance Benchmarks
## {wtf²B•2^4*3} Manifold - CUDA Optimized

This notebook benchmarks the memory and compute performance characteristics of the 48-manifold system on a GPU.

In [None]:
# Environment setup and GPU detection\nimport sys\nimport os\nsys.path.append('/work')  # Deepnote working directory\n\n# Import our CUDA manager\nfrom deepnote.cuda_devices import cuda_manager, get_device, get_gpu_memory_usage\nimport torch\nimport numpy as np\nimport time\nfrom IPython.display import display, HTML\nimport ipywidgets as widgets\nimport plotly.graph_objects as go\nfrom plotly.subplots import make_subplots\n\n# Initialize and display GPU info\ndevice = get_device()\ndevice_info = cuda_manager.get_device_info()\n\nprint(f"🎮 Device: {device}")\nprint(f"📊 GPU: {device_info.get('name', 'CPU')}")\nprint(f"💾 Memory: {device_info.get('total_memory_gb', 0):.1f} GB")\nprint(f"🔧 CUDA: {device_info.get('cuda_version', 'N/A')}")\n\n# Watermark for reproducibility\n%load_ext watermark\n%watermark -v -m -p torch,numpy,plotly -g

## 1. Memory Allocation Patterns

Tests allocation time and memory footprint for tensors sized in multiples of 48.

In [None]:
class MemoryBenchmark:
    """Memory allocation and host/device transfer benchmarks.

    Args:
        device: A ``torch.device`` to benchmark on. When ``None``, the device
            returned by ``get_device()`` is used.
    """

    def __init__(self, device=None):
        self.device = device if device is not None else get_device()
        self.results = {}

    def _sync(self):
        """Wait for queued CUDA work to finish; a no-op on non-CUDA devices."""
        if self.device.type == 'cuda':
            torch.cuda.synchronize()

    def benchmark_allocation(self, sizes=(48, 480, 4800, 48000)):
        """Time the allocation of zero-filled ``size × size`` tensors.

        Args:
            sizes: Iterable of edge lengths. Each entry allocates a square
                tensor with ``size * size`` elements, so memory grows
                quadratically with ``size``.

        Returns:
            A list with one dict per size, holding the keys ``size``,
            ``shape``, ``alloc_time_ms``, ``memory_mb`` and ``elements``.
        """
        results = []

        for size in sizes:
            # Release cached blocks so each measurement starts from a
            # comparable allocator state.
            if self.device.type == 'cuda':
                torch.cuda.empty_cache()
            self._sync()

            # Synchronize before stopping the clock so the timing covers the
            # completed device work, not just the kernel launch.
            start = time.perf_counter()
            tensor = torch.zeros(size, size, device=self.device)
            self._sync()
            alloc_time = (time.perf_counter() - start) * 1000  # ms

            memory_mb = tensor.element_size() * tensor.numel() / (1024**2)

            results.append({
                'size': size,
                'shape': f'{size}×{size}',
                'alloc_time_ms': alloc_time,
                'memory_mb': memory_mb,
                'elements': tensor.numel(),
            })

            # Drop the reference before the next (larger) allocation.
            del tensor

        return results

    def benchmark_transfer(self, size=4800):
        """Time CPU -> GPU and GPU -> CPU copies of a ``size × size`` tensor.

        Args:
            size: Edge length of the square tensor that is copied.

        Returns:
            A dict with ``h2d_ms``, ``d2h_ms``, ``h2d_bandwidth_gbps``,
            ``d2h_bandwidth_gbps`` (size in GiB divided by seconds) and
            ``size_gb``. On any non-CUDA device the dict is
            ``{'error': 'GPU required for transfer benchmark'}`` instead.
        """
        # The timing below relies on torch.cuda.synchronize(), so every
        # non-CUDA device (not only 'cpu') must take the early exit.
        if self.device.type != 'cuda':
            return {'error': 'GPU required for transfer benchmark'}

        cpu_tensor = torch.randn(size, size)

        # CPU -> GPU
        torch.cuda.synchronize()
        start = time.perf_counter()
        gpu_tensor = cpu_tensor.to(self.device)
        torch.cuda.synchronize()
        h2d_time = (time.perf_counter() - start) * 1000

        # GPU -> CPU
        torch.cuda.synchronize()
        start = time.perf_counter()
        gpu_tensor.to('cpu')
        torch.cuda.synchronize()
        d2h_time = (time.perf_counter() - start) * 1000

        # Payload size in GiB; bandwidth is this size over elapsed seconds.
        size_gb = (cpu_tensor.element_size() * cpu_tensor.numel()) / (1024**3)

        return {
            'h2d_ms': h2d_time,
            'd2h_ms': d2h_time,
            'h2d_bandwidth_gbps': size_gb / (h2d_time / 1000),
            'd2h_bandwidth_gbps': size_gb / (d2h_time / 1000),
            'size_gb': size_gb,
        }


# Run benchmarks
bench = MemoryBenchmark(device)
alloc_results = bench.benchmark_allocation()

# Display results
import pandas as pd
df_alloc = pd.DataFrame(alloc_results)
display(df_alloc)

# Visualize: allocation time on the left, memory footprint on the right
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Allocation Time', 'Memory Usage')
)

fig.add_trace(
    go.Bar(x=df_alloc['shape'], y=df_alloc['alloc_time_ms'], name='Time (ms)'),
    row=1, col=1
)

fig.add_trace(
    go.Bar(x=df_alloc['shape'], y=df_alloc['memory_mb'], name='Memory (MB)'),
    row=1, col=2
)

fig.update_layout(height=400, showlegend=False)
fig.show()

## 2. 48-Manifold Operations Performance

In [None]:
class Manifold48Benchmark:
    """Benchmark 48-manifold specific operations.

    Args:
        device: A ``torch.device`` to benchmark on. When ``None``, the device
            returned by ``get_device()`` is used.
    """

    def __init__(self, device=None):
        self.device = device if device is not None else get_device()
        self.dims = 48  # Our magic number

    def _sync(self):
        """Wait for queued CUDA work to finish; a no-op on non-CUDA devices."""
        if self.device.type == 'cuda':
            torch.cuda.synchronize()

    def benchmark_factorization(self, batch_size=32, iterations=100):
        """Time the reshape/permute steps for the 48 = 2^4 × 3 factorization.

        Args:
            batch_size: Leading dimension of the random test tensor.
            iterations: Number of timed repetitions per step; must be >= 1.

        Returns:
            A dict with ``factor_3`` and ``factor_2``, each the mean time per
            iteration in milliseconds.

        Raises:
            ValueError: If ``iterations`` is less than 1.
        """
        if iterations < 1:
            raise ValueError("iterations must be >= 1")

        # Test tensor laid out as (batch, 48, height, width)
        h, w = 48, 48
        x = torch.randn(batch_size, self.dims, h, w, device=self.device)

        times = {}

        # Factor by 3: fold each 3x3 spatial tile into the channel axis.
        self._sync()
        start = time.perf_counter()
        for _ in range(iterations):
            x_reshaped = x.reshape(batch_size, self.dims, h // 3, 3, w // 3, 3)
            x_permuted = x_reshaped.permute(0, 1, 3, 5, 2, 4)
            x_factor3 = x_permuted.reshape(batch_size, self.dims * 9, h // 3, w // 3)
        self._sync()
        times['factor_3'] = (time.perf_counter() - start) / iterations * 1000

        # Factor by 2: fold each 2x2 spatial tile into the channel axis.
        # h2 == w2 == 16 here, so both are evenly divisible by 2.
        h2, w2 = h // 3, w // 3
        x2 = x_factor3[:, :self.dims * 4, :h2, :w2]  # Subset for 2x2

        self._sync()
        start = time.perf_counter()
        for _ in range(iterations):
            x_reshaped = x2.reshape(batch_size, -1, h2 // 2, 2, w2 // 2, 2)
            x_permuted = x_reshaped.permute(0, 1, 3, 5, 2, 4)
            x_permuted.reshape(batch_size, -1, h2 // 2, w2 // 2)
        self._sync()
        times['factor_2'] = (time.perf_counter() - start) / iterations * 1000

        # Inverse operations (depth-to-space)
        # TODO: Implement inverse benchmarks

        return times

    def benchmark_matrix_ops(self, size=48):
        """Time matmul and SVD on square matrices at several scales.

        Args:
            size: Base edge length; matrices of ``size * scale`` are tested
                for ``scale`` in 1, 2, 4, 8 and 16.

        Returns:
            A list with one dict per scale, holding ``scale``, ``size``,
            ``matmul_ms``, ``svd_ms`` (``None`` when the SVD is skipped) and
            ``gflops``.
        """
        scales = [1, 2, 4, 8, 16]  # Multiples of `size`
        results = []

        for scale in scales:
            n = size * scale
            A = torch.randn(n, n, device=self.device)
            B = torch.randn(n, n, device=self.device)

            # Matrix multiplication
            self._sync()
            start = time.perf_counter()
            torch.matmul(A, B)
            self._sync()
            matmul_time = (time.perf_counter() - start) * 1000

            # SVD of the top-left quarter; skipped for larger matrices
            # because it gets expensive.
            if n <= 480:
                self._sync()
                start = time.perf_counter()
                # torch.linalg.svd returns (U, S, Vh); results are unused.
                U, S, Vh = torch.linalg.svd(A[:n // 2, :n // 2])
                self._sync()
                svd_time = (time.perf_counter() - start) * 1000
            else:
                svd_time = None

            results.append({
                'scale': scale,
                'size': n,
                'matmul_ms': matmul_time,
                'svd_ms': svd_time,
                # Achieved rate: 2*n^3 FLOPs over the measured time
                # (milliseconds * 1e6 converts to FLOPs/s / 1e9).
                'gflops': (2 * n**3) / (matmul_time * 1e6),
            })

        return results


# Run manifold benchmarks
m48_bench = Manifold48Benchmark(device)
factor_times = m48_bench.benchmark_factorization()
matrix_results = m48_bench.benchmark_matrix_ops()

print("\n🔢 Factorization Performance:")
for op, time_ms in factor_times.items():
    print(f"  {op}: {time_ms:.3f} ms")

print("\n📊 Matrix Operations:")
df_matrix = pd.DataFrame(matrix_results)
display(df_matrix)

## 3. Interactive Performance Explorer

In [None]:
# Interactive widget for exploring performance characteristics
@widgets.interact(
    batch_size=widgets.IntSlider(min=1, max=128, step=1, value=32),
    dimensions=widgets.Dropdown(options=[12, 24, 48, 96, 192], value=48),
    precision=widgets.Dropdown(options=['float32', 'float16', 'bfloat16'], value='float32')
)
def performance_explorer(batch_size, dimensions, precision):
    """Interactive performance exploration tool.

    Builds a random ``(batch_size, dimensions, 48, 48)`` tensor on the global
    ``device``, times one GELU followed by a spatial mean, and prints the
    tensor's memory footprint, the elapsed time and the resulting throughput.

    Args:
        batch_size: Number of samples in the test tensor.
        dimensions: Size of the channel axis.
        precision: One of ``'float32'``, ``'float16'`` or ``'bfloat16'``.
            ``'bfloat16'`` is only honoured on a CUDA device; otherwise
            float32 is used and a note is printed.
    """
    on_cuda = device.type == 'cuda'

    # Resolve the requested precision to a dtype.
    if precision == 'float16':
        dtype = torch.float16
    elif precision == 'bfloat16' and on_cuda:
        dtype = torch.bfloat16
    else:
        dtype = torch.float32
    fell_back = precision == 'bfloat16' and not on_cuda

    # Create test tensor
    x = torch.randn(batch_size, dimensions, 48, 48, device=device, dtype=dtype)

    # Measure memory
    memory_mb = x.element_size() * x.numel() / (1024**2)

    # Simple operation benchmark. Synchronize only on CUDA: calling
    # torch.cuda.synchronize() without a CUDA device raises.
    if on_cuda:
        torch.cuda.synchronize()
    start = time.perf_counter()
    y = torch.nn.functional.gelu(x)
    y.mean(dim=[2, 3])
    if on_cuda:
        torch.cuda.synchronize()
    op_time = (time.perf_counter() - start) * 1000

    print(f"📊 Configuration:")
    print(f"   Batch: {batch_size}")
    print(f"   Dimensions: {dimensions}")
    print(f"   Precision: {precision}")
    if fell_back:
        # Make the silent dtype substitution visible in the report.
        print("   (bfloat16 requested without CUDA; measured with float32)")
    print(f"\n💾 Memory: {memory_mb:.1f} MB")
    print(f"⚡ Operation Time: {op_time:.2f} ms")
    print(f"🚀 Throughput: {batch_size / (op_time / 1000):.1f} samples/sec")

    if on_cuda:
        mem_info = get_gpu_memory_usage()
        print(f"\n🎮 GPU Memory:")
        print(f"   Allocated: {mem_info['allocated']:.2f} GB")
        print(f"   Cached: {mem_info['cached']:.2f} GB")
        print(f"   Available: {mem_info['total'] - mem_info['allocated']:.2f} GB")

## 4. Smoke Tests

In [None]:
def run_smoke_tests():
    """Smoke tests to ensure environment is properly configured.

    Runs four checks against the global ``device``: CUDA availability, a
    48×48 matrix product, a 1024×1024 allocation, and (on CUDA only) a matrix
    product under autocast. Each outcome is printed, followed by the overall
    success rate.

    Returns:
        ``True`` when no check failed, ``False`` otherwise.
    """
    tests_passed = []
    tests_failed = []

    # Test 1: CUDA availability.
    # Checked explicitly rather than with `assert`, which is stripped under
    # `python -O` and would make this test pass unconditionally.
    if torch.cuda.is_available():
        tests_passed.append("✅ CUDA available")
    else:
        tests_failed.append("❌ CUDA: CUDA not available")

    # Test 2: Tensor operations
    try:
        x = torch.randn(48, 48, device=device)
        y = x @ x.T
        if y.shape != (48, 48):
            raise RuntimeError(f"unexpected result shape {tuple(y.shape)}")
        tests_passed.append("✅ Tensor operations working")
    except Exception as e:
        tests_failed.append(f"❌ Tensor ops: {e}")

    # Test 3: Memory allocation
    try:
        large = torch.zeros(1024, 1024, device=device)
        del large
        tests_passed.append("✅ Memory allocation working")
    except Exception as e:
        tests_failed.append(f"❌ Memory: {e}")

    # Test 4: Mixed precision (only exercised on CUDA; on other devices the
    # check is skipped and counts as neither a pass nor a failure).
    try:
        if device.type == 'cuda':
            # torch.autocast replaces the deprecated torch.cuda.amp.autocast.
            with torch.autocast(device_type='cuda'):
                x = torch.randn(48, 48, device=device)
                y = x @ x.T
            tests_passed.append("✅ Mixed precision available")
    except Exception as e:
        tests_failed.append(f"❌ AMP: {e}")

    # Display results
    print("🧪 SMOKE TEST RESULTS\n" + "="*50)
    for test in tests_passed:
        print(test)
    for test in tests_failed:
        print(test)

    # Tests 1-3 always record an outcome, so the denominator is never zero.
    success_rate = len(tests_passed) / (len(tests_passed) + len(tests_failed)) * 100
    print(f"\n📊 Success Rate: {success_rate:.0f}%")

    return not tests_failed


# Run smoke tests
all_tests_passed = run_smoke_tests()

## Next Steps for AI Developers

### 🎯 TODO: Implement these benchmarks
1. **Memory Fragmentation Analysis** - Track fragmentation over time
2. **Parallel Scaling** - Multi-GPU performance (DDP, FSDP)
3. **Kernel Profiling** - Use NVIDIA Nsight for kernel analysis
4. **Optimization Comparison** - Compare TF32, FP16, and INT8 performance
5. **Real Workload Simulation** - Protein folding end-to-end timing

### 📝 Notes
- All benchmarks should log to the `results/memory/` directory
- Use `watermark` for reproducibility
- Consider memory pressure testing with OOM recovery
- Add comparative benchmarks vs CPU/MPS