In [1]:
!pip install transformers
!pip install hf_transfer
!pip install accelerate
!pip install ninja packaging wheel
!pip install flash-attn --no-build-isolation

Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.11.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tqdm>=4.27 (from transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting hf-xet<2.0.0,>=1.1.3 (from huggingface-hub<1.0,>=0.34.0->transformers)
  Downloading hf_xet-1.2.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata 

In [5]:
#!/usr/bin/env python3
"""
Cross-Provider A100 Baseline Experiment

Tests FP reproducibility across different cloud providers (RunPod vs Vast.ai)
Measures both hidden states and key vectors across multiple layers.
Only extracts from last valid token position (prefill workload).

Usage:
    Command-line:
        python cross_provider_a100_baseline.py --provider runpod
    
    Jupyter notebook:
        # Run all cells, then:
        main('runpod')  # or main('vast'), etc.
"""

import os
os.environ['HF_HOME'] = '/workspace/huggingface_cache'
os.environ['TRANSFORMERS_CACHE'] = '/workspace/huggingface_cache'
os.environ['HF_DATASETS_CACHE'] = '/workspace/huggingface_cache'

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import gc
import numpy as np
from datetime import datetime
import json
import socket
import subprocess
import sys
import argparse

# ============================================================================
# SYSTEM ATTESTATION
# ============================================================================

def get_gpu_firmware_info():
    """Extract detailed GPU firmware/VBIOS information"""
    try:
        result = subprocess.run(
            ['nvidia-smi', '-q'],
            capture_output=True,
            text=True,
            timeout=10
        )
        output = result.stdout
        
        firmware_info = {}
        for line in output.split('\n'):
            line = line.strip()
            if 'VBIOS Version' in line or 'GPU Board Serial Number' in line:
                parts = line.split(':', 1)
                if len(parts) == 2:
                    key = parts[0].strip()
                    value = parts[1].strip()
                    firmware_info[key] = value
        
        return firmware_info
    except Exception as e:
        return {"error": str(e)}

def get_gpu_detailed_info():
    """Get comprehensive GPU configuration"""
    try:
        result = subprocess.run(
            ['nvidia-smi', '--query-gpu=name,memory.total,driver_version,compute_cap,pci.bus_id,pcie.link.gen.max,pcie.link.width.max,power.limit',
             '--format=csv,noheader,nounits'],
            capture_output=True,
            text=True,
            timeout=10
        )
        
        if result.returncode == 0:
            parts = result.stdout.strip().split(', ')
            return {
                "name": parts[0],
                "memory_total_mb": parts[1],
                "driver_version": parts[2],
                "compute_capability": parts[3],
                "pci_bus_id": parts[4],
                "pcie_gen_max": parts[5],
                "pcie_width_max": parts[6],
                "power_limit_w": parts[7]
            }
    except Exception as e:
        return {"error": str(e)}
    
    return {}

def get_compute_mode():
    """Check if GPU is in exclusive compute mode"""
    try:
        result = subprocess.run(
            ['nvidia-smi', '-q', '-d', 'COMPUTE'],
            capture_output=True,
            text=True,
            timeout=10
        )
        
        for line in result.stdout.split('\n'):
            if 'Compute Mode' in line:
                return line.split(':', 1)[1].strip()
        
        return "unknown"
    except Exception as e:
        return f"error: {e}"

def get_persistence_mode():
    """Check persistence mode status"""
    try:
        result = subprocess.run(
            ['nvidia-smi', '-q', '-d', 'PERSISTENCE_MODE'],
            capture_output=True,
            text=True,
            timeout=10
        )
        
        for line in result.stdout.split('\n'):
            if 'Persistence Mode' in line:
                return line.split(':', 1)[1].strip()
        
        return "unknown"
    except Exception as e:
        return f"error: {e}"

def get_ecc_status():
    """Check ECC memory status"""
    try:
        result = subprocess.run(
            ['nvidia-smi', '-q', '-d', 'ECC'],
            capture_output=True,
            text=True,
            timeout=10
        )
        
        ecc_info = {}
        for line in result.stdout.split('\n'):
            if 'ECC Mode' in line or 'Current' in line:
                parts = line.split(':', 1)
                if len(parts) == 2:
                    ecc_info[parts[0].strip()] = parts[1].strip()
        
        return ecc_info
    except Exception as e:
        return {"error": str(e)}

def get_clock_info():
    """Get GPU clock speeds"""
    try:
        result = subprocess.run(
            ['nvidia-smi', '-q', '-d', 'CLOCK'],
            capture_output=True,
            text=True,
            timeout=10
        )
        
        clocks = {}
        for line in result.stdout.split('\n'):
            line = line.strip()
            if 'Graphics' in line or 'SM' in line or 'Memory' in line:
                parts = line.split(':', 1)
                if len(parts) == 2:
                    clocks[parts[0].strip()] = parts[1].strip()
        
        return clocks
    except Exception as e:
        return {"error": str(e)}

def check_mig_mode():
    """Check if GPU is in MIG mode or partitioned"""
    try:
        result = subprocess.run(
            ['nvidia-smi', '-L'],
            capture_output=True,
            text=True,
            timeout=10
        )
        
        output = result.stdout
        is_mig = 'MIG' in output
        
        return {
            "is_mig": is_mig,
            "devices": output.strip()
        }
    except Exception as e:
        return {"error": str(e)}

def attest_system(provider_name="unknown"):
    """Comprehensive system attestation"""
    
    print("="*70)
    print("SYSTEM ATTESTATION")
    print("="*70)
    
    attestation = {
        "timestamp": datetime.now().isoformat(),
        "provider": provider_name,
        "hostname": socket.gethostname(),
        "container_id": os.environ.get('HOSTNAME', 'unknown'),
    }
    
    # PyTorch and CUDA
    attestation["pytorch"] = {
        "version": torch.__version__,
        "cuda_version": torch.version.cuda,
        "cudnn_version": torch.backends.cudnn.version() if torch.backends.cudnn.is_available() else None,
        "cuda_available": torch.cuda.is_available(),
    }
    
    if torch.cuda.is_available():
        attestation["gpu"] = {
            "name": torch.cuda.get_device_name(0),
            "capability": f"{torch.cuda.get_device_capability(0)[0]}.{torch.cuda.get_device_capability(0)[1]}",
            "memory_allocated_gb": torch.cuda.memory_allocated(0) / 1024**3,
            "memory_reserved_gb": torch.cuda.memory_reserved(0) / 1024**3,
        }
        
        # Detailed info from nvidia-smi
        attestation["gpu_detailed"] = get_gpu_detailed_info()
        attestation["gpu_firmware"] = get_gpu_firmware_info()
        attestation["compute_mode"] = get_compute_mode()
        attestation["persistence_mode"] = get_persistence_mode()
        attestation["ecc_status"] = get_ecc_status()
        attestation["clock_info"] = get_clock_info()
        attestation["mig_status"] = check_mig_mode()
    
    # Environment variables
    env_vars = {}
    for key in sorted(os.environ.keys()):
        if any(x in key.upper() for x in ['CUDA', 'TORCH', 'NCCL', 'CUDNN', 'PYTORCH', 'NVIDIA']):
            env_vars[key] = os.environ[key]
    attestation["environment"] = env_vars
    
    # Print summary
    print(f"Provider: {provider_name}")
    print(f"Hostname: {attestation['hostname']}")
    print(f"GPU: {attestation['gpu']['name']}")
    print(f"PyTorch: {attestation['pytorch']['version']}")
    print(f"CUDA: {attestation['pytorch']['cuda_version']}")
    print(f"Driver: {attestation['gpu_detailed'].get('driver_version', 'unknown')}")
    print(f"Compute Mode: {attestation['compute_mode']}")
    print(f"Persistence Mode: {attestation['persistence_mode']}")
    
    if attestation['gpu_firmware']:
        print(f"\nFirmware Info:")
        for key, value in attestation['gpu_firmware'].items():
            print(f"  {key}: {value}")
    
    if attestation['mig_status'].get('is_mig'):
        print(f"\n[WARNING] MIG mode detected!")
        print(f"  {attestation['mig_status']['devices']}")
    
    print()
    
    return attestation

# ============================================================================
# DATA COLLECTION
# ============================================================================

def collect_multilayer_activations(model, tokenizer, prompt, layer_indices, device="cuda"):
    """
    Extract hidden states and key vectors from multiple layers.
    Only extract from last valid token position to save memory.
    
    Returns:
        dict: {
            "hidden_states": {f"layer_{idx}": tensor},
            "key_vectors": {f"layer_{idx}": tensor},
            "metadata": {...}
        }
    """
    torch.cuda.synchronize()
    torch.cuda.empty_cache()
    gc.collect()
    
    inputs = tokenizer([prompt], return_tensors="pt", padding=True)
    seq_len = inputs['input_ids'].shape[1]
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        # Get both hidden states and key-value cache
        outputs = model(**inputs, output_hidden_states=True, use_cache=True, return_dict=True)
    
    last_valid_pos = inputs['attention_mask'][0].sum() - 1
    
    # Extract hidden states from selected layers
    hidden_states = {}
    for idx in layer_indices:
        if idx < len(outputs.hidden_states):
            # Extract last valid token position, move to CPU immediately
            hidden = outputs.hidden_states[idx][0, last_valid_pos, :].cpu().clone()
            hidden_states[f"layer_{idx}"] = hidden
    
    # Extract key vectors from selected layers
    # past_key_values is tuple of (key, value) pairs per layer
    # Each key: [batch_size, num_key_value_heads, seq_len, head_dim]
    key_vectors = {}
    for i, layer_idx in enumerate(layer_indices):
        # Skip embedding layer (idx=0)
        if layer_idx > 0 and i < len(outputs.past_key_values):
            layer_keys = outputs.past_key_values[i][0]  # [0] for keys
            # Extract last valid token position, all heads
            key_vec = layer_keys[0, :, last_valid_pos, :]
            # Flatten: [num_key_value_heads * head_dim]
            key_vec_flat = key_vec.reshape(-1).cpu().clone()
            key_vectors[f"layer_{layer_idx}"] = key_vec_flat
    
    metadata = {
        "seq_len": seq_len,
        "last_valid_pos": int(last_valid_pos),
        "num_layers_extracted": len(layer_indices),
        "hidden_dim": hidden_states[f"layer_{layer_indices[0]}"].shape[0] if hidden_states else 0,
    }
    
    # Aggressive cleanup
    del outputs.hidden_states
    del outputs.past_key_values
    del outputs
    del inputs
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    
    return {
        "hidden_states": hidden_states,
        "key_vectors": key_vectors,
        "metadata": metadata
    }

# ============================================================================
# MAIN EXPERIMENT
# ============================================================================

def main(provider_name='unknown'):
    """
    Main experiment function.
    
    Args:
        provider_name: Name of the cloud provider (e.g., 'runpod', 'vast')
    """
    
    # Attest system first
    attestation = attest_system(provider_name)
    
    # Check for concerning configurations
    if attestation.get('mig_status', {}).get('is_mig'):
        print("âš  WARNING: MIG mode detected - results may not be comparable!")
        response = input("Continue anyway? (y/n): ")
        if response.lower() != 'y':
            sys.exit(1)
    
    if attestation.get('compute_mode') not in ['Default', 'Exclusive Process']:
        print(f"âš  WARNING: Compute mode is '{attestation.get('compute_mode')}'")
    
    # Configuration
    CACHE_DIR = '/workspace/huggingface_cache'
    model_name = "Qwen/Qwen2.5-7B-Instruct"
    num_repetitions = 5
    
    # Select layers to sample (dense at beginning, sparse at end)
    # Layer 0 is embedding, so we start from 1
    total_layers = 28  # Qwen2.5-7B has 28 transformer layers
    layer_indices = [0, 1, 2, 3, 4, 7, 10, 14, 18, 22, 26, 28]  # 0 for embedding
    
    # Test prompt - technical content
    prompt = """The study investigates the quantum decoherence effects on a multi-qubit superconducting system when subjected to controlled microwave pulses. We utilized a novel cryogenic amplification chain to minimize thermal noise and achieve a signal-to-noise ratio previously unattainable in similar setups. The experimental protocol involved preparing the qubits in a Greenberger-Horne-Zeilinger (GHZ) state and then measuring the decay of quantum entanglement over time by performing state tomography. Our results demonstrate a non-linear relationship between pulse amplitude and coherence time, suggesting that higher-order coupling terms, often neglected in theoretical models, play a significant role in system dynamics."""
    
    print("="*70)
    print("LOADING MODEL")
    print("="*70)
    print(f"Model: {model_name}")
    print(f"Precision: BF16 (bfloat16)")
    
    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)
    prompt_tokens = len(tokenizer.encode(prompt))
    print(f"Prompt tokens: {prompt_tokens}")
    
    gc.collect()
    torch.cuda.empty_cache()
    
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        cache_dir=CACHE_DIR,
        low_cpu_mem_usage=True,
        device_map="auto"
    )
    
    mem_after_load = torch.cuda.memory_allocated(0) / 1024**3
    print(f"GPU memory after load: {mem_after_load:.2f} GB")
    print()
    
    # Run experiment
    print("="*70)
    print("RUNNING BASELINE EXPERIMENT")
    print("="*70)
    print(f"Layers to extract: {layer_indices}")
    print(f"Repetitions: {num_repetitions}")
    print(f"Operation: Prefill only (no generation)")
    print(f"Extraction: Last valid token position only")
    print()
    
    all_runs = []
    
    for rep in range(num_repetitions):
        print(f"Repetition {rep+1}/{num_repetitions}...")
        
        run_data = collect_multilayer_activations(
            model, tokenizer, prompt, layer_indices, device="cuda"
        )
        
        all_runs.append(run_data)
        
        if rep == 0:
            print(f"  Hidden state dim: {run_data['metadata']['hidden_dim']}")
            print(f"  Sequence length: {run_data['metadata']['seq_len']}")
            print(f"  Last valid pos: {run_data['metadata']['last_valid_pos']}")
            print(f"  Layers extracted: {run_data['metadata']['num_layers_extracted']}")
    
    print()
    
    # Analyze reproducibility
    print("="*70)
    print("REPRODUCIBILITY ANALYSIS")
    print("="*70)
    
    reproducibility = {
        "hidden_states": {},
        "key_vectors": {}
    }
    
    # Check hidden states
    print("\nHidden States:")
    for layer_name in all_runs[0]["hidden_states"].keys():
        first_rep = all_runs[0]["hidden_states"][layer_name]
        
        # Check if all repetitions are identical
        all_identical = all(
            torch.equal(first_rep, all_runs[i]["hidden_states"][layer_name])
            for i in range(1, num_repetitions)
        )
        
        if all_identical:
            print(f"  {layer_name}: [EXACT] Bit-exact across all repetitions")
            max_dev = 0.0
        else:
            # Compute max deviation
            deviations = [
                torch.norm(first_rep - all_runs[i]["hidden_states"][layer_name]).item()
                for i in range(1, num_repetitions)
            ]
            max_dev = max(deviations)
            print(f"  {layer_name}: [VARIES] max L2 deviation: {max_dev:.6f}")
        
        reproducibility["hidden_states"][layer_name] = {
            "bit_exact": all_identical,
            "max_deviation": max_dev
        }
    
    # Check key vectors
    print("\nKey Vectors:")
    for layer_name in all_runs[0]["key_vectors"].keys():
        first_rep = all_runs[0]["key_vectors"][layer_name]
        
        all_identical = all(
            torch.equal(first_rep, all_runs[i]["key_vectors"][layer_name])
            for i in range(1, num_repetitions)
        )
        
        if all_identical:
            print(f"  {layer_name}: [EXACT] Bit-exact across all repetitions")
            max_dev = 0.0
        else:
            deviations = [
                torch.norm(first_rep - all_runs[i]["key_vectors"][layer_name]).item()
                for i in range(1, num_repetitions)
            ]
            max_dev = max(deviations)
            print(f"  {layer_name}: [VARIES] max L2 deviation: {max_dev:.6f}")
        
        reproducibility["key_vectors"][layer_name] = {
            "bit_exact": all_identical,
            "max_deviation": max_dev
        }
    
    # Overall verdict
    print("\n" + "="*70)
    print("VERDICT")
    print("="*70)
    
    all_hidden_exact = all(v["bit_exact"] for v in reproducibility["hidden_states"].values())
    all_keys_exact = all(v["bit_exact"] for v in reproducibility["key_vectors"].values())
    
    if all_hidden_exact and all_keys_exact:
        print("[SUCCESS] FULLY REPRODUCIBLE: All activations bit-exact across repetitions")
        print("  This configuration provides a clean baseline for cross-provider comparison")
    else:
        print("[WARNING] NON-DETERMINISM DETECTED")
        if not all_hidden_exact:
            print("  Hidden states show variation")
        if not all_keys_exact:
            print("  Key vectors show variation")
        print("  May indicate:")
        print("    - Non-deterministic CUDA kernels")
        print("    - Asynchronous operations")
        print("    - Thermal/power variability (unlikely)")
    
    # Save results
    print("\n" + "="*70)
    print("SAVING RESULTS")
    print("="*70)
    
    # Convert tensors to lists for JSON serialization
    serializable_runs = []
    for run in all_runs:
        serializable_run = {
            "hidden_states": {
                k: v.float().numpy().tolist()
                for k, v in run["hidden_states"].items()
            },
            "key_vectors": {
                k: v.float().numpy().tolist()
                for k, v in run["key_vectors"].items()
            },
            "metadata": run["metadata"]
        }
        serializable_runs.append(serializable_run)
    
    output = {
        "experiment": "cross_provider_a100_baseline",
        "attestation": attestation,
        "config": {
            "model": model_name,
            "precision": "bfloat16",
            "prompt_tokens": prompt_tokens,
            "layer_indices": layer_indices,
            "repetitions": num_repetitions,
            "operation": "prefill_only",
            "extraction": "last_valid_token_only"
        },
        "reproducibility": reproducibility,
        "runs": serializable_runs
    }
    
    provider_clean = provider_name.replace(' ', '_').replace('.', '_')
    gpu_name = torch.cuda.get_device_name(0).replace(' ', '_')
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    
    output_file = f"{provider_clean}_{gpu_name}_baseline_{timestamp}.json"
    output_path = f"/workspace/{output_file}"
    
    with open(output_path, "w") as f:
        json.dump(output, f, indent=2)
    
    file_size_kb = len(json.dumps(output)) / 1024
    print(f"[SUCCESS] Results saved to: {output_path}")
    print(f"[INFO] File size: {file_size_kb:.1f} KB")
    
    print("\n" + "="*70)
    print("EXPERIMENT COMPLETE")
    print("="*70)
    print(f"\nNext steps:")
    print(f"1. Run this same script on different provider")
    print(f"2. Compare attestation sections (especially firmware)")
    print(f"3. Compare hidden_states and key_vectors for bit-exactness")
    print(f"4. If differences found, binary search on configuration variables")

if __name__ == "__main__":
    # Detect if running in notebook or command-line
    try:
        get_ipython()  # This will exist in Jupyter/IPython
        in_notebook = True
    except NameError:
        in_notebook = False
    
    if in_notebook:
        # Running in notebook - prompt for provider name
        print("Running in Jupyter notebook")
        print("Enter provider name (e.g., 'runpod', 'vast', or press Enter for 'unknown'):")
        provider = input().strip() or 'unknown'
        main(provider)
    else:
        # Running from command line - use argparse
        parser = argparse.ArgumentParser(description='Cross-provider A100 baseline experiment')
        parser.add_argument('--provider', type=str, default='unknown',
                           help='Provider name (e.g., runpod, vast)')
        args = parser.parse_args()
        main(args.provider)

Running in Jupyter notebook
Enter provider name (e.g., 'runpod', 'vast', or press Enter for 'unknown'):


 runpod


SYSTEM ATTESTATION
Provider: runpod
Hostname: 1e14ec78f3ec
GPU: NVIDIA A100 80GB PCIe
PyTorch: 2.8.0+cu128
CUDA: 12.8
Driver: 550.127.05
Compute Mode: Default
Persistence Mode: unknown

Firmware Info:
  VBIOS Version: 92.00.A0.00.05

LOADING MODEL
Model: Qwen/Qwen2.5-7B-Instruct
Precision: BF16 (bfloat16)


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Prompt tokens: 129


config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

GPU memory after load: 14.19 GB

RUNNING BASELINE EXPERIMENT
Layers to extract: [0, 1, 2, 3, 4, 7, 10, 14, 18, 22, 26, 28]
Repetitions: 5
Operation: Prefill only (no generation)
Extraction: Last valid token position only

Repetition 1/5...
  Hidden state dim: 3584
  Sequence length: 129
  Last valid pos: 128
  Layers extracted: 12
Repetition 2/5...
Repetition 3/5...
Repetition 4/5...
Repetition 5/5...

REPRODUCIBILITY ANALYSIS

Hidden States:
  layer_0: [EXACT] Bit-exact across all repetitions
  layer_1: [EXACT] Bit-exact across all repetitions
  layer_2: [EXACT] Bit-exact across all repetitions
  layer_3: [EXACT] Bit-exact across all repetitions
  layer_4: [EXACT] Bit-exact across all repetitions
  layer_7: [EXACT] Bit-exact across all repetitions
  layer_10: [EXACT] Bit-exact across all repetitions
  layer_14: [EXACT] Bit-exact across all repetitions
  layer_18: [EXACT] Bit-exact across all repetitions
  layer_22: [EXACT] Bit-exact across all repetitions
  layer_26: [EXACT] Bit-exact