# Hardware-Locked CUDA-Metal Validation Framework
## Security Classification: RESTRICTED
## Hardware Requirements:
- NVIDIA GPU with CUDA support
- Apple Silicon (M1/M2) hardware
- 16GB+ RAM

This notebook provides real-time validation of CUDA-Metal kernel conversion with simultaneous execution verification.

In [None]:
%%bash
# Verify system requirements
# Show the NVIDIA driver/GPU status table (fails if the NVIDIA tools are not installed)
nvidia-smi
# Print only the "Chip" line of the macOS hardware overview
system_profiler SPHardwareDataType | grep Chip

In [None]:
import hashlib
import os
import platform
import queue
import threading
import time
from dataclasses import dataclass

import cupy as cp
import numpy as np
import psutil
from IPython.display import display, HTML

# `plt` is not a module shipped by matplotlib; fall back to pyplot so the
# cell does not abort with ImportError when no such module is installed.
try:
    import plt
except ImportError:
    import matplotlib.pyplot as plt

# Attempt to import pyopencl, prompt installation if missing
try:
    import pyopencl as cl  # For Metal interop
except ImportError:
    raise ImportError(
        "pyopencl is required for this notebook. Please install it using 'pip install pyopencl' and ensure all system dependencies are met."
    )

# Per-session token: hex SHA-256 digest of 32 bytes read from the OS random source.
SECURITY_TOKEN = hashlib.new('sha256', os.urandom(32)).hexdigest()

@dataclass
class ValidationConfig:
    """Settings shared by the validation, analysis and report cells."""
    threads_per_block: int = 256  # block size read when launching CUDA kernels
    min_blocks: int = 1  # grid-size setting read when launching CUDA kernels
    max_blocks: int = 1024  # not referenced by the code visible in this notebook
    timing_iterations: int = 100  # not referenced by the code visible in this notebook
    tolerance: float = 1e-6  # largest absolute output difference still counted as a match
    security_level: str = 'RESTRICTED'  # label printed in the generated report

# Initialize validation configuration
# Module-level instance built from the defaults; read by the validator and the report code.
config = ValidationConfig()

# Hardware verification
def verify_hardware():
    """Probe the host for the hardware this notebook requires.

    Returns:
        dict: maps 'CUDA GPU', 'Apple Silicon' and 'RAM' to a bool telling
        whether that requirement is satisfied on this machine.
    """
    requirements = {
        'CUDA GPU': False,
        'Apple Silicon': False,
        'RAM': False
    }

    # Check CUDA. A working runtime that reports zero devices is not a usable
    # GPU, so the count itself is tested rather than just the call succeeding.
    try:
        requirements['CUDA GPU'] = cp.cuda.runtime.getDeviceCount() > 0
    except Exception:
        # Best-effort probe: a missing driver or runtime simply means "no GPU".
        requirements['CUDA GPU'] = False

    # Check Apple Silicon: an ARM CPU alone is not enough (other ARM hosts
    # exist), so the operating system must be macOS as well.
    is_macos = platform.system() == 'Darwin'
    is_arm = (
        platform.machine().lower() in ('arm64', 'aarch64')
        or platform.processor().lower().startswith('arm')
    )
    requirements['Apple Silicon'] = is_macos and is_arm

    # Check RAM
    requirements['RAM'] = psutil.virtual_memory().total >= 16 * (1024**3)  # 16GB

    return requirements

hw_check = verify_hardware()
# Raise explicitly instead of using `assert`, which is removed when Python
# runs with -O, and name the requirements that were not satisfied.
_unmet_requirements = [label for label, ok in hw_check.items() if not ok]
if _unmet_requirements:
    raise RuntimeError(
        "Hardware requirements not met: " + ", ".join(_unmet_requirements)
    )

In [None]:
class SimultaneousValidator:
    """Run a CUDA kernel and its counterpart concurrently and compare them.

    The CUDA side is compiled and launched through CuPy. The other side
    (called "Metal" throughout this notebook) is compiled and launched through
    PyOpenCL on the first OpenCL platform whose name contains 'Apple'.

    Accepted ``input_data`` layouts:
        * a single NumPy array: kernels take ``(input, output)``
        * a tuple ``(a, b, output, n)``: kernels take ``(a, b, output, n)``;
          the CUDA kernel receives ``n`` as an int, the OpenCL kernel receives
          it as a one-element int32 buffer.
    """

    def __init__(self):
        self.cuda_queue = queue.Queue()
        self.metal_result_queue = queue.Queue()  # Renamed to avoid conflict
        self.sync_event = threading.Event()
        self.result_lock = threading.Lock()
        self.initialize_hardware()

    def initialize_hardware(self):
        """Select CUDA device 0 and an Apple OpenCL device.

        Raises:
            RuntimeError: no OpenCL platform named 'Apple' exposes a device.
        """
        # Initialize CUDA
        self.cuda_device = cp.cuda.Device(0)

        # Initialize Metal
        self.metal_device = None
        # Loop variable deliberately not called `platform`, which is also the
        # name of a module imported by this notebook.
        for cl_platform in cl.get_platforms():
            if 'Apple' in cl_platform.name:
                devices = cl_platform.get_devices()
                if devices:
                    self.metal_device = devices[0]
                    break
        if self.metal_device is None:
            # Explicit raise: an `assert` would vanish under `python -O`.
            raise RuntimeError("No Metal device found")

        self.metal_context = cl.Context([self.metal_device])
        self.cl_command_queue = cl.CommandQueue(self.metal_context)

    def validate_kernel(self, cuda_kernel, metal_kernel, input_data):
        """Execute both kernels on worker threads and compare their results.

        Args:
            cuda_kernel: CUDA source defining ``cuda_kernel``.
            metal_kernel: kernel source defining ``metal_kernel``.
            input_data: a NumPy array or an ``(a, b, output, n)`` tuple.

        Returns:
            dict: the comparison produced by ``_validate_results``.

        Raises:
            RuntimeError: either worker failed; the first failure is chained
                as the cause.
        """
        # Discard results left behind by an earlier run that failed half-way,
        # otherwise they would be mistaken for this run's output.
        self._drain(self.cuda_queue)
        self._drain(self.metal_result_queue)

        failures = {}

        def run_guarded(label, target, kernel_code):
            # Both workers block here until released together.
            self.sync_event.wait()
            try:
                target(kernel_code, input_data)
            except Exception as exc:
                # Record instead of letting the thread die silently: a dead
                # worker never fills its queue and the caller would block
                # forever waiting for it.
                with self.result_lock:
                    failures[label] = exc

        self.sync_event.clear()

        # Create execution threads
        cuda_thread = threading.Thread(
            target=run_guarded,
            args=('CUDA', self._execute_cuda, cuda_kernel)
        )
        metal_thread = threading.Thread(
            target=run_guarded,
            args=('Metal', self._execute_metal, metal_kernel)
        )

        # Start simultaneous execution
        cuda_thread.start()
        metal_thread.start()

        # Trigger synchronization event
        self.sync_event.set()

        # Wait for both executions to complete
        cuda_thread.join()
        metal_thread.join()

        if failures:
            ordered = [(label, failures[label])
                       for label in ('CUDA', 'Metal') if label in failures]
            details = "; ".join(
                f"{label}: {type(exc).__name__}: {exc}" for label, exc in ordered
            )
            raise RuntimeError(f"Kernel execution failed ({details})") from ordered[0][1]

        # Get results (both workers succeeded, so both queues hold one item)
        cuda_result = self.cuda_queue.get_nowait()
        metal_result = self.metal_result_queue.get_nowait()  # Updated

        # Validate results
        return self._validate_results(
            cuda_result['output'],
            metal_result['output'],
            cuda_result['timing'],
            metal_result['timing']
        )

    @staticmethod
    def _drain(pending):
        """Remove every item currently stored in the queue ``pending``."""
        try:
            while True:
                pending.get_nowait()
        except queue.Empty:
            pass

    @staticmethod
    def _cuda_launch_dims(shape, threads_per_block, min_blocks):
        """Return ``(grid, block)`` covering every element of ``shape``.

        A 2-D shape gets a square 2-D block (x spans columns, y spans rows);
        anything else is launched as a flat 1-D range. Kernels without a
        bounds check need sizes that are a multiple of the block size.
        """
        if len(shape) == 2:
            side = max(1, int(threads_per_block ** 0.5))
            rows, cols = shape
            grid = (max(1, -(-cols // side)), max(1, -(-rows // side)))
            return grid, (side, side)

        total = 1
        for extent in shape:
            total *= extent
        blocks = -(-total // threads_per_block)  # ceiling division
        return (max(min_blocks, blocks),), (threads_per_block,)

    def _execute_cuda(self, kernel_code, input_data):
        """Compile and time ``cuda_kernel`` and queue its output.

        Puts ``{'output': ndarray, 'timing': milliseconds}`` on
        ``self.cuda_queue``.
        """
        with self.cuda_device:
            # Compile kernel
            module = cp.RawModule(code=kernel_code)
            kernel = module.get_function('cuda_kernel')

            # Prepare data
            if isinstance(input_data, tuple):
                if len(input_data) < 3:
                    raise ValueError(
                        "tuple input_data must be (*inputs, output, n)"
                    )
                # The output array and the trailing size are passed exactly
                # once each; only the leading entries are kernel inputs.
                *host_inputs, host_output, n = input_data
                output_gpu = cp.asarray(host_output)
                kernel_args = (
                    *(cp.asarray(arg) for arg in host_inputs),
                    output_gpu,
                    cp.int32(n),
                )
            else:
                input_gpu = cp.asarray(input_data)
                output_gpu = cp.zeros_like(input_gpu)
                kernel_args = (input_gpu, output_gpu)

            # Size the launch from the output so every element is computed.
            grid, block = self._cuda_launch_dims(
                output_gpu.shape, config.threads_per_block, config.min_blocks
            )

            # Execute with timing
            start_event = cp.cuda.Event()
            end_event = cp.cuda.Event()

            start_event.record()
            kernel(grid, block, kernel_args)
            end_event.record()
            end_event.synchronize()

            timing = cp.cuda.get_elapsed_time(start_event, end_event)

            # Get result
            self.cuda_queue.put({
                'output': cp.asnumpy(output_gpu),
                'timing': timing
            })

    def _execute_metal(self, kernel_code, input_data):
        """Build and time ``metal_kernel`` through PyOpenCL and queue its output.

        Puts ``{'output': ndarray, 'timing': milliseconds}`` on
        ``self.metal_result_queue``.
        """
        # Compile Metal kernel
        prg = cl.Program(self.metal_context, kernel_code).build()

        # Prepare buffers
        mf = cl.mem_flags

        def read_only_buffer(host_array):
            # COPY_HOST_PTR needs a contiguous host array.
            return cl.Buffer(
                self.metal_context,
                mf.READ_ONLY | mf.COPY_HOST_PTR,
                hostbuf=np.ascontiguousarray(host_array)
            )

        if isinstance(input_data, tuple):
            if len(input_data) != 4:
                raise ValueError("tuple input_data must be (a, b, output, n)")
            input_a, input_b, host_output, n = input_data
            output_template = np.ascontiguousarray(host_output)
            output_buf = cl.Buffer(
                self.metal_context, mf.WRITE_ONLY, output_template.nbytes
            )
            kernel_args = (
                read_only_buffer(input_a),
                read_only_buffer(input_b),
                output_buf,
                read_only_buffer(np.array([n], dtype=np.int32)),
            )
        else:
            output_template = np.ascontiguousarray(input_data)
            output_buf = cl.Buffer(
                self.metal_context, mf.WRITE_ONLY, output_template.nbytes
            )
            kernel_args = (read_only_buffer(input_data), output_buf)

        # Same rule as the CUDA launch: 2-D output -> 2-D range with
        # dimension 0 spanning columns; otherwise one work-item per element.
        if output_template.ndim == 2:
            global_size = output_template.shape[::-1]
        else:
            global_size = (output_template.size,)

        # Execute with timing
        start_time = time.perf_counter()
        event = prg.metal_kernel(
            self.cl_command_queue, global_size, None, *kernel_args
        )
        event.wait()
        end_time = time.perf_counter()

        # Get result
        output = np.empty_like(output_template)
        cl.enqueue_copy(self.cl_command_queue, output, output_buf)

        self.metal_result_queue.put({
            'output': output,
            'timing': (end_time - start_time) * 1000  # Convert to ms
        })

    def _validate_results(self, cuda_output, metal_output, cuda_timing, metal_timing):
        """Compare both outputs and timings.

        Returns:
            dict with keys 'outputs_match', 'max_difference',
            'cuda_timing_ms', 'metal_timing_ms', 'timing_ratio',
            'performance_acceptable' and 'validation_passed'.

        Raises:
            ValueError: the two outputs do not have the same shape.
        """
        if np.shape(cuda_output) != np.shape(metal_output):
            raise ValueError(
                f"Output shapes differ: {np.shape(cuda_output)} vs {np.shape(metal_output)}"
            )

        # Check numerical accuracy (np.max rejects empty arrays, so treat two
        # empty outputs as identical).
        if np.size(cuda_output) == 0:
            max_diff = 0.0
        else:
            max_diff = float(np.max(np.abs(cuda_output - metal_output)))
        outputs_match = bool(max_diff <= config.tolerance)

        # Check performance ratio; a zero timing must not raise ZeroDivisionError.
        timing_ratio = cuda_timing / metal_timing if metal_timing > 0 else float('inf')
        perf_acceptable = bool(0.5 <= timing_ratio <= 2.0)

        return {
            'outputs_match': outputs_match,
            'max_difference': max_diff,
            'cuda_timing_ms': cuda_timing,
            'metal_timing_ms': metal_timing,
            'timing_ratio': timing_ratio,
            'performance_acceptable': perf_acceptable,
            'validation_passed': outputs_match and perf_acceptable
        }

# Initialize validator
# Constructing it calls initialize_hardware(), which picks the CUDA device and the OpenCL device.
validator = SimultaneousValidator()

In [None]:
# Test kernels
# Each entry pairs a CUDA source (entry point `cuda_kernel`) with the source
# for the other device (entry point `metal_kernel`). The validator builds the
# second source with pyopencl.Program(...).build(), which compiles OpenCL C,
# so those kernels are written in OpenCL C; Metal Shading Language attributes
# such as [[buffer(0)]] are rejected by that compiler.
test_cases = {
    'vector_add': {
        'cuda': '''
        extern "C" __global__
        void cuda_kernel(const float* a, float* b) {
            int idx = blockIdx.x * blockDim.x + threadIdx.x;
            b[idx] = a[idx] + a[idx];
        }
        ''',
        'metal': '''
        __kernel void metal_kernel(
            __global const float* a,
            __global float* b
        ) {
            size_t idx = get_global_id(0);
            b[idx] = a[idx] + a[idx];
        }
        '''
    },
    'matrix_mul': {
        'cuda': '''
        extern "C" __global__
        void cuda_kernel(const float* a, const float* b, float* c, int n) {
            int row = blockIdx.y * blockDim.y + threadIdx.y;
            int col = blockIdx.x * blockDim.x + threadIdx.x;
            if (row < n && col < n) {
                float sum = 0;
                for (int k = 0; k < n; k++) {
                    sum += a[row * n + k] * b[k * n + col];
                }
                c[row * n + col] = sum;
            }
        }
        ''',
        # The validator hands `n` over as a one-element int32 buffer, hence
        # the pointer parameter.
        'metal': '''
        __kernel void metal_kernel(
            __global const float* a,
            __global const float* b,
            __global float* c,
            __global const int* n_ptr
        ) {
            int n = n_ptr[0];
            int row = get_global_id(1);
            int col = get_global_id(0);
            if (row < n && col < n) {
                float sum = 0;
                for (int k = 0; k < n; k++) {
                    sum += a[row * n + k] * b[k * n + col];
                }
                c[row * n + col] = sum;
            }
        }
        '''
    }
}

# Run validation suite
def run_validation_suite():
    """Validate every entry of ``test_cases`` and render a summary per test.

    Returns:
        dict: test name -> result dict from ``validator.validate_kernel``, or
        ``{'error': message}`` when the test could not be run.
    """
    results = {}

    for name, kernels in test_cases.items():
        print(f"\nValidating {name}...")

        # Prepare test data
        if name == 'vector_add':
            # Both vector_add kernels take one input and one output. The
            # validator allocates the output itself for a bare array, while a
            # tuple is reserved for the (a, b, output, n) kernel signature.
            input_data = np.random.randn(1024*1024).astype(np.float32)
        elif name == 'matrix_mul':
            N = 1024
            input_a = np.random.randn(N, N).astype(np.float32)
            input_b = np.random.randn(N, N).astype(np.float32)
            output = np.zeros((N, N), dtype=np.float32)
            input_data = (input_a, input_b, output, N)
        else:
            # Without this branch `input_data` would be unbound (first test)
            # or silently reused from the previous test.
            message = f"No test data defined for '{name}'"
            print(f"Error validating {name}: {message}")
            results[name] = {'error': message}
            continue

        # Run validation
        try:
            result = validator.validate_kernel(
                kernels['cuda'],
                kernels['metal'],
                input_data
            )

            results[name] = result

            # Display results
            display(HTML(f'''
            <div style="background-color: {'#dff0d8' if result['validation_passed'] else '#f2dede'}; padding: 10px; border-radius: 5px;">
                <h4>{name} Validation Results:</h4>
                <ul>
                    <li>Outputs Match: {'✅' if result['outputs_match'] else '❌'} (Max Diff: {result['max_difference']:.2e})</li>
                    <li>CUDA Timing: {result['cuda_timing_ms']:.3f} ms</li>
                    <li>Metal Timing: {result['metal_timing_ms']:.3f} ms</li>
                    <li>Performance Ratio: {result['timing_ratio']:.2f}</li>
                    <li>Overall Status: {'✅ PASSED' if result['validation_passed'] else '❌ FAILED'}</li>
                </ul>
            </div>
            '''))

        except Exception as e:
            # Notebook boundary: report the failure and continue with the next test.
            print(f"Error validating {name}: {str(e)}")
            results[name] = {'error': str(e)}

    return results

# Run the suite once; the per-test results feed the analysis and report cells below.
validation_results = run_validation_suite()

In [None]:
# Detailed Analysis
def analyze_validation_results(results):
    """Tabulate the timings of the successful tests and plot them as bars.

    Args:
        results: mapping of test name to a validation result dict; entries
            carrying an 'error' key or lacking 'cuda_timing_ms' are skipped.

    Returns:
        pandas.DataFrame: one row per successful test with the columns
        'Test Case', 'CUDA Time (ms)', 'Metal Time (ms)', 'Speedup' and
        'Max Error'. It is empty, but keeps those columns, when no test
        succeeded; in that case nothing is plotted.
    """
    import pandas as pd

    columns = ['Test Case', 'CUDA Time (ms)', 'Metal Time (ms)', 'Speedup', 'Max Error']

    # Create performance comparison DataFrame
    performance_data = [
        {
            'Test Case': name,
            'CUDA Time (ms)': result['cuda_timing_ms'],
            'Metal Time (ms)': result['metal_timing_ms'],
            'Speedup': result['timing_ratio'],
            'Max Error': result['max_difference']
        }
        for name, result in results.items()
        if 'error' not in result and 'cuda_timing_ms' in result
    ]

    # Naming the columns keeps them present on an empty frame, so column
    # lookups cannot raise KeyError when every test failed.
    df = pd.DataFrame(performance_data, columns=columns)

    if df.empty:
        print("No successful validation results to plot.")
        return df

    import matplotlib.pyplot as plt

    # Plot performance comparison
    plt.figure(figsize=(12, 6))
    width = 0.35
    x = np.arange(len(df))

    plt.bar(x - width/2, df['CUDA Time (ms)'], width, label='CUDA')
    plt.bar(x + width/2, df['Metal Time (ms)'], width, label='Metal')

    plt.xlabel('Test Case')
    plt.ylabel('Execution Time (ms)')
    plt.title('CUDA vs Metal Performance Comparison')
    plt.xticks(x, df['Test Case'])
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    return df

# Build the timing table (and draw the chart) from the suite results.
analysis_results = analyze_validation_results(validation_results)

In [None]:
import base64
from io import BytesIO

# Function to convert plot to base64
def plot_to_base64():
    """Save the current matplotlib figure as PNG and return it base64-encoded.

    The current figure is closed afterwards.

    Returns:
        str: the PNG bytes encoded as base64 text.
    """
    # Imported locally so this function does not depend on which module the
    # notebook-level name `plt` happens to be bound to.
    import matplotlib.pyplot as plt

    # NOTE(review): this captures whatever figure is current at call time; if
    # the chart was already shown and closed by an earlier cell, the image is
    # presumably blank - confirm against the backend in use.
    with BytesIO() as buf:
        plt.savefig(buf, format='png')
        plt.close()
        return base64.b64encode(buf.getvalue()).decode('utf-8')

# Function to generate test case details for the report
def generate_test_case_details(name, result):
    """Render one test's outcome as an HTML fragment for the report.

    Args:
        name: test case name, used as the heading.
        result: either ``{'error': message}`` or a validation result dict
            with 'outputs_match', 'max_difference', 'cuda_timing_ms',
            'metal_timing_ms', 'timing_ratio' and 'validation_passed'.

    Returns:
        str: an HTML ``<div>`` describing the error or the measurements.
    """
    from html import escape

    # Names and exception text are free-form (compiler logs contain '<', '>'
    # and '&'), so escape them before embedding them in markup.
    safe_name = escape(str(name))

    if 'error' in result:
        return f'''
        <div style="margin-bottom: 20px;">
            <h4>{safe_name}:</h4>
            <p style="color: red;"><strong>Error:</strong> {escape(str(result['error']))}</p>
        </div>
        '''

    return f'''
        <div style="margin-bottom: 20px;">
            <h4>{safe_name}:</h4>
            <ul>
                <li>Outputs Match: {'✅' if result['outputs_match'] else '❌'} (Max Diff: {result['max_difference']:.2e})</li>
                <li>CUDA Timing: {result['cuda_timing_ms']:.3f} ms</li>
                <li>Metal Timing: {result['metal_timing_ms']:.3f} ms</li>
                <li>Performance Ratio: {result['timing_ratio']:.2f}</li>
                <li>Overall Status: {'✅ PASSED' if result['validation_passed'] else '❌ FAILED'}</li>
            </ul>
        </div>
        '''

In [None]:
# Generate Validation Report
def generate_validation_report(validation_results, analysis_results):
    """Build the HTML validation report, save it to disk and return it.

    Args:
        validation_results: mapping of test name to result dict (either a
            validation result or ``{'error': message}``).
        analysis_results: accepted for interface compatibility; not read here.

    Returns:
        IPython.display.HTML: the report fragment, ready for ``display``.

    Side effects:
        Writes ``validation_report_<report_id>.html`` to the working directory.
    """
    from html import escape

    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    report_id = hashlib.sha256(timestamp.encode()).hexdigest()[:8]

    # Safely get CUDA device name
    try:
        cuda_device_name = cp.cuda.runtime.getDeviceProperties(0)['name'].decode()
    except Exception as e:
        cuda_device_name = "Unknown CUDA Device"
        print(f"Error fetching CUDA device properties: {e}")

    # Safely get Metal device name
    try:
        metal_device_name = validator.metal_device.name
    except AttributeError:
        metal_device_name = "Unknown Metal Device"

    outcomes = list(validation_results.values())
    passed_count = sum(1 for r in outcomes if 'validation_passed' in r and r['validation_passed'])
    failed_count = sum(1 for r in outcomes if 'validation_passed' in r and not r['validation_passed'])
    error_count = sum(1 for r in outcomes if 'error' in r)

    details_html = ''.join(
        generate_test_case_details(name, result)
        for name, result in validation_results.items()
    )
    report_hash = hashlib.sha256(str(validation_results).encode()).hexdigest()

    html_report = f'''
    <div style="padding: 20px; font-family: Arial, sans-serif;">
        <h2>CUDA-Metal Validation Report</h2>
        <p><strong>Report ID:</strong> {report_id}</p>
        <p><strong>Timestamp:</strong> {timestamp}</p>
        <p><strong>Security Level:</strong> {config.security_level}</p>
        
        <h3>Hardware Configuration</h3>
        <ul>
            <li><strong>CUDA Device:</strong> {escape(str(cuda_device_name))}</li>
            <li><strong>Metal Device:</strong> {escape(str(metal_device_name))}</li>
        </ul>
        
        <h3>Validation Summary</h3>
        <ul>
            <li><strong>Total Tests:</strong> {len(validation_results)}</li>
            <li><strong>Passed:</strong> {passed_count}</li>
            <li><strong>Failed:</strong> {failed_count}</li>
            <li><strong>Errors:</strong> {error_count}</li>
        </ul>
        
        <h3>Performance Analysis</h3>
        <img src="data:image/png;base64,{plot_to_base64()}"/>
        
        <h3>Validation Details</h3>
        {details_html}
        
        <h3>Security Verification</h3>
        <p>Report Hash: {report_hash}</p>
    </div>
    '''

    # Save report. The report contains non-ASCII status symbols, so the file
    # is written as UTF-8 explicitly (the platform default may not encode
    # them) and wrapped in a document that declares that charset.
    document = (
        '<!DOCTYPE html>\n<html><head><meta charset="utf-8">'
        f'<title>CUDA-Metal Validation Report {report_id}</title></head>'
        f'<body>{html_report}</body></html>\n'
    )
    with open(f'validation_report_{report_id}.html', 'w', encoding='utf-8') as f:
        f.write(document)

    return HTML(html_report)

# Build the report (this also writes the HTML file) and render it in the notebook.
report = generate_validation_report(validation_results, analysis_results)
display(report)