In [4]:
#!/usr/bin/env python3
"""
Realistic Parallel Workload Forensics

Tests whether concurrent inference workloads affect FP outputs and timing
for REALISTIC inference sizes (~2000 tokens, >50% GPU utilization)

This is the definitive test for the parallel hidden workload threat model.
"""

import os
os.environ['HF_HOME'] = '/workspace/huggingface_cache'
os.environ['TRANSFORMERS_CACHE'] = '/workspace/huggingface_cache'

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import numpy as np
import json
import time
import threading
import subprocess
from datetime import datetime

# Console banner for the experiment log.
print("="*70)
print("REALISTIC PARALLEL WORKLOAD FORENSICS")
print("="*70)

# Hugging Face identifier of the model and tokenizer loaded in main().
model_name = "Qwen/Qwen2.5-7B-Instruct"
# Device that tokenized inputs are moved to before each forward pass.
device = "cuda"
# Layers whose last-token output is captured by the forward hooks.
# Index 0 selects model.model.embed_tokens; any other index i selects
# model.model.layers[i - 1] (see extract_activations_with_monitoring).
layer_indices = [1, 4, 10, 18, 28]

# Realistic long-form prompt (intended to be ~2000 tokens; the recorded run tokenized it to 649 tokens)
prompt = """You are a senior research scientist at a leading AI safety institution. Your task is to write a comprehensive technical report analyzing the current state of AI alignment research, potential risks from advanced AI systems, and proposed mitigation strategies.

The report should cover the following areas in depth:

1. Introduction to AI Alignment
   - Historical context and evolution of the field
   - Key terminology and conceptual frameworks
   - Relationship to broader AI safety and governance efforts
   - Current stakeholders and institutional landscape

2. Technical Challenges in AI Alignment
   - The outer alignment problem: specifying correct objectives
   - The inner alignment problem: ensuring mesa-optimizers are aligned
   - Robustness and distributional shift
   - Scalable oversight and interpretability
   - Deceptive alignment and treacherous turns
   - Value learning and inverse reinforcement learning
   - Corrigibility and shutdown problems

3. Current Research Approaches
   - Reinforcement learning from human feedback (RLHF)
   - Constitutional AI and other oversight methods
   - Debate and amplification techniques
   - Interpretability research and mechanistic understanding
   - Formal verification approaches
   - Multi-agent systems and cooperation
   - Impact measures and side-effect minimization

4. Existential Risk Scenarios
   - Fast takeoff vs slow takeoff dynamics
   - Singleton scenarios and multipolar outcomes
   - Instrumental convergence and power-seeking behavior
   - Deceptive alignment in advanced systems
   - Coordination failures between AI developers
   - Race dynamics and competitive pressures
   - Misaligned AGI and catastrophic outcomes

5. Proposed Mitigation Strategies
   - Technical research priorities
   - Governance and policy interventions
   - International coordination mechanisms
   - Compute governance and monitoring
   - Responsible scaling policies
   - Red teaming and evaluation protocols
   - Alignment taxes and capability controls

6. Timeline Considerations
   - Forecasting transformative AI timelines
   - Uncertainty in capability development
   - Alignment difficulty as function of capability
   - Critical periods and decision points
   - Preparation time requirements

7. Institutional and Governance Challenges
   - Information security and model weights
   - Verification and monitoring challenges
   - International coordination problems
   - Corporate governance and incentives
   - Public discourse and democratic input
   - Expert disagreement and epistemic challenges

8. Research Priorities and Recommendations
   - Most promising technical research directions
   - Necessary governance infrastructure
   - Resource allocation and funding priorities
   - Career advice for aspiring alignment researchers
   - Community building and field development

For each section, provide:
- Current state of knowledge and key uncertainties
- Recent developments and breakthrough results
- Open problems and research gaps
- Concrete recommendations and action items
- Relevant citations and references to key papers

The report should be thorough, technically rigorous, and accessible to both technical researchers and policymakers. Aim for approximately 10,000 words total. Use clear section headings, numbered lists where appropriate, and include specific examples to illustrate abstract concepts.

Begin with an executive summary that distills the key findings and recommendations into 500 words. Then proceed with the detailed analysis of each section."""

# Log the hardware/software environment at import time so every console log
# states which GPU (CUDA device 0), PyTorch build and CUDA version produced it.
print(f"\nGPU: {torch.cuda.get_device_name(0)}")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.version.cuda}")
print()

# ============================================================================
# GPU MONITORING
# ============================================================================

class GPUMonitor:
    """Sample GPU utilization in a background thread by polling ``nvidia-smi``.

    Call :meth:`start` before the work to be measured and :meth:`stop`
    afterwards; ``stop`` returns summary statistics over the collected
    samples. Sampling is best-effort: a failed or malformed ``nvidia-smi``
    call simply contributes no sample.

    Attributes:
        running: Flag polled by the worker loop; ``False`` asks it to exit.
        samples: Utilization percentages collected since the last ``start``.
        thread: The background sampling thread, or ``None`` before ``start``.
    """

    def __init__(self):
        self.running = False
        self.samples = []
        self.thread = None

    @staticmethod
    def _parse_utilization(stdout):
        """Return the highest utilization percentage listed in *stdout*.

        ``nvidia-smi`` prints one value per line, one line per GPU. On a
        single-GPU host this is just that GPU's value; with several GPUs
        the busiest one is reported.

        Raises:
            ValueError: If *stdout* has no lines or a line is not numeric
                (for example ``[N/A]``).
        """
        values = [float(line) for line in stdout.splitlines() if line.strip()]
        if not values:
            raise ValueError("nvidia-smi returned no utilization values")
        return max(values)

    def _worker(self):
        """Poll ``nvidia-smi`` until ``self.running`` is cleared."""
        while self.running:
            try:
                result = subprocess.run(
                    ['nvidia-smi', '--query-gpu=utilization.gpu',
                     '--format=csv,noheader,nounits'],
                    capture_output=True,
                    text=True,
                    timeout=1,
                )
                if result.returncode == 0:
                    self.samples.append(self._parse_utilization(result.stdout))
            except FileNotFoundError:
                # nvidia-smi is not installed: retrying every 10 ms would
                # only burn CPU, so give up on sampling entirely.
                return
            except (OSError, subprocess.SubprocessError, ValueError):
                # Best-effort sampling: a timeout, spawn failure or
                # unparsable output just means this sample is skipped.
                pass
            time.sleep(0.01)

    def start(self):
        """Discard old samples and launch the background sampling thread."""
        self.samples = []
        self.running = True
        self.thread = threading.Thread(target=self._worker, daemon=True)
        self.thread.start()
        # Give the worker a moment to begin its first nvidia-smi call.
        time.sleep(0.05)

    def stop(self):
        """Stop sampling and summarize the collected utilization values.

        Returns:
            A dict with ``'mean'``, ``'p95'`` and ``'max'`` utilization (as
            floats), or ``None`` if no sample was collected.
        """
        self.running = False
        if self.thread:
            self.thread.join(timeout=1.0)
        # Snapshot: if the join timed out, the worker may still append.
        collected = list(self.samples)
        if not collected:
            return None
        return {
            'mean': float(np.mean(collected)),
            'p95': float(np.percentile(collected, 95)),
            'max': float(np.max(collected)),
        }

# ============================================================================
# CONCURRENT WORKLOAD
# ============================================================================

class ConcurrentInference:
    """Run repeated forward passes in a background thread as a hidden workload.

    The worker thread issues forward passes of ``model`` on its own CUDA
    stream, back to back with a 1 ms pause, until :meth:`stop` is called.
    Nothing synchronizes the worker with the measured inference, so the
    overlap between the two varies from run to run.

    Args:
        model: Callable model invoked as ``model(**inputs)``.
        tokenizer: Callable invoked as ``tokenizer([prompt], return_tensors="pt")``.
        intensity: ``'light'`` uses a single short sentence as the prompt;
            any other value uses that sentence repeated 50 times.

    Attributes:
        error: Exception raised inside the worker thread, or ``None``. A
            non-``None`` value means the hidden workload died and any
            measurement taken meanwhile did not actually run concurrently.
    """

    def __init__(self, model, tokenizer, intensity='light'):
        self.model = model
        self.tokenizer = tokenizer
        self.running = False
        self.thread = None
        self.stream = None
        self.ready_event = threading.Event()
        self.error = None

        # Different intensities
        if intensity == 'light':
            self.prompt = "The capital of France is"
        else:  # heavy
            self.prompt = "The capital of France is " * 50

        # Pre-tokenize once so the worker loop only does GPU work.
        self.inputs = tokenizer([self.prompt], return_tensors="pt")
        self.inputs = {k: v.to(device) for k, v in self.inputs.items()}

    def _worker(self):
        """Loop forward passes on a dedicated CUDA stream while ``running``."""
        try:
            # Use a separate stream so this work can overlap with the
            # measured inference, which runs on the default stream.
            self.stream = torch.cuda.Stream()

            # Signal ready
            self.ready_event.set()

            with torch.cuda.stream(self.stream), torch.no_grad():
                while self.running:
                    _ = self.model(**self.inputs)

                    # Brief pause between passes so the loop does not
                    # monopolize the interpreter.
                    time.sleep(0.001)
        except Exception as exc:
            # Record and report the failure instead of letting the thread
            # die unnoticed: the caller would otherwise measure a
            # "concurrent" condition with no concurrent work.
            self.error = exc
            print(f"  [HIDDEN] Concurrent inference failed: {exc!r}")
        finally:
            # Unblock start() even if stream creation itself failed.
            self.ready_event.set()

    def start(self):
        """Launch the worker thread and wait for it to begin issuing work."""
        if self.running:
            return
        self.error = None
        self.ready_event.clear()
        self.running = True
        self.thread = threading.Thread(target=self._worker, daemon=True)
        self.thread.start()
        # Wait for worker to be ready, then let a few passes get in flight.
        self.ready_event.wait(timeout=1.0)
        time.sleep(0.5)

    def stop(self):
        """Ask the worker to exit, join it and drain its CUDA stream."""
        if not self.running:
            return
        self.running = False
        if self.thread:
            self.thread.join(timeout=2.0)
        if self.stream is not None:
            self.stream.synchronize()

# ============================================================================
# ACTIVATION EXTRACTION
# ============================================================================

def extract_activations_with_monitoring(model, tokenizer, prompt, condition_name, num_runs=5, check_reproducibility=False):
    """Run timed forward passes and capture last-token activations.

    The prompt is tokenized once and pushed through ``model`` ``num_runs``
    times. Forward hooks on the layers listed in the module-level
    ``layer_indices`` record the last-token output during the first run
    (and the second, if ``check_reproducibility`` is set). GPU utilization
    is sampled by a ``GPUMonitor`` for the duration of all runs.

    Args:
        model: Causal LM exposing ``model.model.embed_tokens`` and
            ``model.model.layers``.
        tokenizer: Tokenizer callable returning a dict of tensors.
        prompt: Text to run through the model.
        condition_name: Label used as the prefix of console messages.
        num_runs: Number of timed forward passes; must be at least 1.
        check_reproducibility: If True, capture activations on the first
            two runs and report whether they are bit-identical.

    Returns:
        A tuple ``(activations, timing_stats, gpu_stats, reproducible)``:
        ``activations`` maps ``"layer_<idx>"`` to a flattened float32 NumPy
        array from the first run; ``timing_stats`` holds ``mean_ms``,
        ``median_ms``, ``std_ms`` and ``all_times``; ``gpu_stats`` is the
        ``GPUMonitor.stop()`` result (possibly ``None``); ``reproducible``
        is ``None`` unless two activation runs were compared, in which case
        it maps each layer name to ``{'bit_exact', 'l2'}``.

    Raises:
        ValueError: If ``num_runs`` is less than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    print(f"\n[{condition_name}] Starting...")

    torch.cuda.empty_cache()
    torch.cuda.synchronize()

    # Tokenize
    inputs = tokenizer([prompt], return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    prompt_len = inputs['input_ids'].shape[1]

    print(f"[{condition_name}] Prompt length: {prompt_len} tokens")

    # One dict of captured activations per hooked run.
    activations_runs = []

    def make_hook(layer_idx, run_idx):
        key = f"layer_{layer_idx}"

        def hook(module, hook_input, output):
            hidden = output[0] if isinstance(output, tuple) else output
            # Keep only the first capture per layer and run.
            if key not in activations_runs[run_idx]:
                activations_runs[run_idx][key] = hidden[:, -1, :].detach().cpu().float()
        return hook

    times = []
    monitor = GPUMonitor()

    num_activation_runs = 2 if check_reproducibility else 1

    monitor.start()
    try:
        for run in range(num_runs):
            hooks = []
            try:
                if run < num_activation_runs:
                    activations_runs.append({})
                    for idx in layer_indices:
                        if idx == 0:
                            layer = model.model.embed_tokens
                        else:
                            layer = model.model.layers[idx - 1]
                        hooks.append(layer.register_forward_hook(make_hook(idx, run)))

                # NOTE: hooked runs are timed too, so their device-to-host
                # copies are included in the reported timings.
                torch.cuda.synchronize()
                start = time.perf_counter()

                with torch.no_grad():
                    _ = model(**inputs)

                torch.cuda.synchronize()
                elapsed = time.perf_counter() - start
                times.append(elapsed * 1000)
            finally:
                # Always detach hooks, even if the forward pass raised, so
                # they cannot leak into later conditions on the same model.
                for handle in hooks:
                    handle.remove()
    finally:
        # Always stop the sampling thread, including on failure.
        gpu_stats = monitor.stop()

    torch.cuda.synchronize()

    # Use first run's activations
    activations = activations_runs[0]

    # Check reproducibility if requested
    reproducible = None
    if check_reproducibility and len(activations_runs) == 2:
        reproducible = {}
        for layer_name in activations:
            act1 = activations_runs[0][layer_name].numpy().flatten()
            act2 = activations_runs[1][layer_name].numpy().flatten()
            bit_exact = np.array_equal(act1, act2)
            l2 = 0.0 if bit_exact else float(np.linalg.norm(act1 - act2))
            reproducible[layer_name] = {
                'bit_exact': bit_exact,
                'l2': l2
            }

        # Print reproducibility summary
        all_exact = all(v['bit_exact'] for v in reproducible.values())
        if all_exact:
            print(f"[{condition_name}] ✓ Reproducible within condition (bit-exact)")
        else:
            max_l2 = max(v['l2'] for v in reproducible.values())
            print(f"[{condition_name}] ⚠ Non-reproducible within condition (max L2={max_l2:.2e})")

    # Convert activations
    activations_np = {
        k: v.numpy().flatten() for k, v in activations.items()
    }

    timing_stats = {
        'mean_ms': float(np.mean(times)),
        'median_ms': float(np.median(times)),
        'std_ms': float(np.std(times)),
        'all_times': [float(t) for t in times]
    }

    print(f"[{condition_name}] Timing: {timing_stats['mean_ms']:.2f}ms ± {timing_stats['std_ms']:.2f}ms")
    if gpu_stats:
        print(f"[{condition_name}] GPU util: {gpu_stats['p95']:.1f}% (P95)")

    return activations_np, timing_stats, gpu_stats, reproducible

# ============================================================================
# MAIN TEST
# ============================================================================

def _run_concurrent_condition(model, tokenizer, prompt_text, intensity, label):
    """Measure one condition while a hidden ConcurrentInference workload runs.

    The hidden workload is stopped in a ``finally`` block so that a failure
    during measurement cannot leave it running in the background.

    Returns:
        Dict with the ``'activations'``, ``'timing'`` and ``'gpu'`` results
        of ``extract_activations_with_monitoring``.
    """
    concurrent = ConcurrentInference(model, tokenizer, intensity=intensity)
    try:
        concurrent.start()
        print("  [HIDDEN] Concurrent inference started")
        time.sleep(0.5)

        acts, timing, gpu, _ = extract_activations_with_monitoring(
            model, tokenizer, prompt_text, label
        )
    finally:
        concurrent.stop()
    print("  [HIDDEN] Concurrent inference stopped")

    return {
        'activations': acts,
        'timing': timing,
        'gpu': gpu
    }


def main():
    """Run the baseline and concurrent conditions, analyze them, save JSON.

    Steps:
      1. Load the tokenizer and model named by ``model_name``.
      2. Measure the prompt alone (baseline), then with a light and a heavy
         hidden concurrent workload.
      3. Compare each concurrent condition against the baseline:
         activations for bit-exactness / L2 distance, and median latency
         against a 2-sigma threshold built from the per-condition standard
         deviations.
      4. Print a verdict and write everything to
         ``realistic_parallel_forensics_<timestamp>.json`` in the current
         working directory.
    """
    print("Loading model...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        attn_implementation="eager"
    )
    model.eval()
    print("Model loaded")

    prompt_tokens = len(tokenizer.encode(prompt))
    print(f"\nPrompt size: {prompt_tokens} tokens")

    conditions = {}

    # BASELINE
    print("\n" + "="*70)
    print("CONDITION 1: BASELINE (Solo Inference)")
    print("="*70)
    acts, timing, gpu, _ = extract_activations_with_monitoring(
        model, tokenizer, prompt, "BASELINE"
    )
    conditions['baseline'] = {
        'activations': acts,
        'timing': timing,
        'gpu': gpu
    }

    # CONCURRENT CONDITIONS: (result key, workload intensity, console banner)
    concurrent_runs = [
        ('light_concurrent', 'light', "CONDITION 2: LIGHT CONCURRENT INFERENCE"),
        ('heavy_concurrent', 'heavy', "CONDITION 3: HEAVY CONCURRENT INFERENCE"),
    ]
    for position, (cond_name, intensity, banner) in enumerate(concurrent_runs):
        if position:
            # Pause between conditions after the previous workload stopped.
            time.sleep(0.5)
        print("\n" + "="*70)
        print(banner)
        print("="*70)
        conditions[cond_name] = _run_concurrent_condition(
            model, tokenizer, prompt, intensity, cond_name.upper()
        )
    concurrent_names = [name for name, _, _ in concurrent_runs]

    # ANALYSIS
    print("\n" + "="*70)
    print("FORENSIC ANALYSIS")
    print("="*70)

    baseline_acts = conditions['baseline']['activations']
    baseline_time = conditions['baseline']['timing']['median_ms']
    baseline_std = conditions['baseline']['timing']['std_ms']
    baseline_gpu = conditions['baseline']['gpu']

    print("\nBaseline:")
    print(f"  Time: {baseline_time:.2f}ms")
    if baseline_gpu:
        print(f"  GPU utilization: {baseline_gpu['p95']:.1f}%")

    # FP Analysis
    print("\n" + "="*70)
    print("FP FORENSICS")
    print("="*70)

    fp_results = {}

    for cond_name in concurrent_names:
        print(f"\n{cond_name.upper()}:")
        cond_acts = conditions[cond_name]['activations']

        layer_results = {}
        for layer_name, base in baseline_acts.items():
            cond = cond_acts[layer_name]

            if np.array_equal(base, cond):
                print(f"  {layer_name}: ✓ BIT-EXACT")
                layer_results[layer_name] = {
                    'bit_exact': True,
                    'l2': 0.0
                }
            else:
                l2 = float(np.linalg.norm(base - cond))
                print(f"  {layer_name}: ✗ DIFFERS (L2={l2:.6e})")
                layer_results[layer_name] = {
                    'bit_exact': False,
                    'l2': l2
                }

        fp_results[cond_name] = {
            'all_exact': all(r['bit_exact'] for r in layer_results.values()),
            'layers': layer_results
        }

    # Timing Analysis
    print("\n" + "="*70)
    print("TIMING FORENSICS")
    print("="*70)

    timing_results = {}

    for cond_name in concurrent_names:
        cond_timing = conditions[cond_name]['timing']
        cond_time = cond_timing['median_ms']
        cond_gpu = conditions[cond_name]['gpu']

        slowdown_ms = cond_time - baseline_time
        slowdown_pct = (slowdown_ms / baseline_time) * 100

        # Significance test: median shift versus twice the combined
        # per-run standard deviation of the two conditions.
        combined_std = np.sqrt(baseline_std**2 + cond_timing['std_ms']**2)
        significant = abs(slowdown_ms) > 2 * combined_std

        print(f"\n{cond_name.upper()}:")
        print(f"  Time: {cond_time:.2f}ms (baseline: {baseline_time:.2f}ms)")
        print(f"  Slowdown: {slowdown_ms:+.2f}ms ({slowdown_pct:+.1f}%)")
        print(f"  Significant: {'YES' if significant else 'NO'}")
        if cond_gpu:
            print(f"  GPU util: {cond_gpu['p95']:.1f}%")

        timing_results[cond_name] = {
            'time_ms': float(cond_time),
            'slowdown_ms': float(slowdown_ms),
            'slowdown_pct': float(slowdown_pct),
            'significant': bool(significant),
            'gpu_util': cond_gpu
        }

    # Verdict
    print("\n" + "="*70)
    print("VERDICT")
    print("="*70)

    # FP "works" if any concurrent condition differs from the baseline;
    # timing "works" if any concurrent condition shows a significant shift.
    fp_works = not all(fp_results[c]['all_exact'] for c in fp_results)
    timing_works = any(timing_results[c]['significant'] for c in timing_results)

    # Each branch prints the console verdict and selects the matching
    # summary/interpretation stored in the JSON report.
    if fp_works and timing_works:
        print("\n✓ BOTH FP AND TIMING FORENSICS WORK")
        print("\n  • FP forensics detects parallel workloads")
        print("  • Timing forensics confirms with measurable slowdown")
        print("  • Dual verification provides robust detection")
        print("\n✓ FP ALONE SUFFICIENT")
        print("  • FP deviations provide strong signal")
        print("  • No need for timing if FP verification is in place")
        verdict_summary = 'BOTH_FP_AND_TIMING_WORK'
        verdict_interpretation = [
            'FP forensics detects parallel workloads',
            'Timing forensics confirms with measurable slowdown',
            'Dual verification provides robust detection',
            'FP alone is sufficient - provides strong signal',
            'No need for timing if FP verification is in place'
        ]
    elif fp_works:
        print("\n✓ FP FORENSICS WORKS")
        print("  • Parallel workloads produce detectable FP deviations")
        verdict_summary = 'FP_WORKS'
        verdict_interpretation = [
            'Parallel workloads produce detectable FP deviations',
            'FP forensics alone is sufficient for detection'
        ]
    elif timing_works:
        print("\n✓ TIMING FORENSICS WORKS")
        print("  • Parallel workloads produce measurable slowdown")
        print("  • FP forensics failed but timing rescues verification")
        verdict_summary = 'TIMING_WORKS'
        verdict_interpretation = [
            'Parallel workloads produce measurable slowdown',
            'FP forensics failed but timing rescues verification',
            'Timing must be part of verification protocol'
        ]
    else:
        print("\n⚠️  VERIFICATION FAILS")
        print("  • Neither FP nor timing detects parallel work")
        print("  • Blind spot exists for this workload size")
        verdict_summary = 'VERIFICATION_FAILS'
        verdict_interpretation = [
            'Neither FP nor timing detects parallel work',
            'Blind spot exists for this workload size',
            'Further investigation needed'
        ]

    # Save comprehensive results
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    verdict = {
        'fp_forensics_detects_parallel_work': fp_works,
        'timing_forensics_detects_parallel_work': timing_works,
        'summary': verdict_summary,
        'interpretation': verdict_interpretation
    }

    # Add statistical summary
    if fp_works:
        verdict['fp_signal_strength'] = {
            'light_max_l2': float(max(
                layer['l2'] for layer in fp_results['light_concurrent']['layers'].values()
            )),
            'heavy_max_l2': float(max(
                layer['l2'] for layer in fp_results['heavy_concurrent']['layers'].values()
            ))
        }

    if timing_works:
        verdict['timing_impact'] = {
            'light_slowdown_pct': timing_results['light_concurrent']['slowdown_pct'],
            'heavy_slowdown_pct': timing_results['heavy_concurrent']['slowdown_pct']
        }

    # Prepare detailed output matching console. Raw activations are not
    # written; only timing, utilization and comparison results are saved.
    output = {
        'metadata': {
            'timestamp': timestamp,
            'gpu': torch.cuda.get_device_name(0),
            'pytorch_version': torch.__version__,
            'cuda_version': torch.version.cuda,
            'model': model_name,
            'prompt_tokens': prompt_tokens,
            'layer_indices': layer_indices
        },
        'baseline': {
            'timing': conditions['baseline']['timing'],
            'gpu_utilization': conditions['baseline']['gpu'],
            'summary': {
                'median_time_ms': float(baseline_time),
                'gpu_util_p95': float(baseline_gpu['p95']) if baseline_gpu else None
            }
        },
        'conditions': {
            cond_name: {
                'timing': conditions[cond_name]['timing'],
                'gpu_utilization': conditions[cond_name]['gpu']
            }
            for cond_name in concurrent_names
        },
        'fp_forensics': {
            'description': 'Floating-point forensics - checking if concurrent work affects activations',
            'results': {
                cond_name: {
                    'all_layers_bit_exact': fp_results[cond_name]['all_exact'],
                    'layer_by_layer': dict(fp_results[cond_name]['layers'])
                }
                for cond_name in concurrent_names
            }
        },
        'timing_forensics': {
            'description': 'Timing forensics - checking if concurrent work slows down inference',
            'results': {
                cond_name: {
                    'median_time_ms': timing_results[cond_name]['time_ms'],
                    'baseline_time_ms': float(baseline_time),
                    'slowdown_ms': timing_results[cond_name]['slowdown_ms'],
                    'slowdown_percent': timing_results[cond_name]['slowdown_pct'],
                    'statistically_significant': timing_results[cond_name]['significant'],
                    'gpu_utilization': timing_results[cond_name]['gpu_util']
                }
                for cond_name in concurrent_names
            }
        },
        'verdict': verdict
    }

    output_file = f'realistic_parallel_forensics_{timestamp}.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2)

    print(f"\n[SAVED] {output_file}")
    print("        (Includes all analysis details from console output)")
    print("\n" + "="*70)
    print("TEST COMPLETE")
    print("="*70)

# Run the full experiment only when this module is the entry point,
# not when it is imported.
if __name__ == "__main__":
    main()

REALISTIC PARALLEL WORKLOAD FORENSICS

GPU: NVIDIA A100 80GB PCIe
PyTorch: 2.6.0+cu118
CUDA: 11.8

Loading model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded

Prompt size: 649 tokens

CONDITION 1: BASELINE (Solo Inference)

[BASELINE] Starting...
[BASELINE] Prompt length: 649 tokens
[BASELINE] Timing: 75.50ms ± 0.40ms
[BASELINE] GPU util: 85.0% (P95)

CONDITION 2: LIGHT CONCURRENT INFERENCE
  [HIDDEN] Concurrent inference started

[LIGHT_CONCURRENT] Starting...
[LIGHT_CONCURRENT] Prompt length: 649 tokens
[LIGHT_CONCURRENT] Timing: 89.20ms ± 0.61ms
[LIGHT_CONCURRENT] GPU util: 98.0% (P95)
  [HIDDEN] Concurrent inference stopped

CONDITION 3: HEAVY CONCURRENT INFERENCE
  [HIDDEN] Concurrent inference started

[HEAVY_CONCURRENT] Starting...
[HEAVY_CONCURRENT] Prompt length: 649 tokens
[HEAVY_CONCURRENT] Timing: 98.73ms ± 1.56ms
[HEAVY_CONCURRENT] GPU util: 99.0% (P95)
  [HIDDEN] Concurrent inference stopped

FORENSIC ANALYSIS

Baseline:
  Time: 75.38ms
  GPU utilization: 85.0%

FP FORENSICS

LIGHT_CONCURRENT:
  layer_1: ✓ BIT-EXACT
  layer_4: ✓ BIT-EXACT
  layer_10: ✓ BIT-EXACT
  layer_18: ✗ DIFFERS (L2=9.684461e+01)
  layer_28: 