In [1]:
!pip install transformers
!pip install hf_transfer
!pip install accelerate
!pip install ninja packaging wheel
!pip install flash-attn --no-build-isolation

Collecting transformers
  Using cached transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Using cached huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2025.10.23-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Using cached tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.6.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tqdm>=4.27 (from transformers)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting hf-xet<2.0.0,>=1.1.3 (from huggingface-hub<1.0,>=0.34.0->transformers)
  Using cached hf_xet-1.2.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.m

In [2]:
!pip install torch==2.4.0+cu118 --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch==2.4.0+cu118
  Downloading https://download.pytorch.org/whl/cu118/torch-2.4.0%2Bcu118-cp312-cp312-linux_x86_64.whl (857.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m857.7/857.7 MB[0m [31m152.8 MB/s[0m  [33m0:00:03[0m00:01[0m00:01[0m
Collecting triton==3.0.0 (from torch==2.4.0+cu118)
  Downloading https://download.pytorch.org/whl/triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.5/209.5 MB[0m [31m163.2 MB/s[0m  [33m0:00:01[0m00:01[0m00:01[0m
Installing collected packages: triton, torch
[2K  Attempting uninstall: triton
[2K    Found existing installation: triton 3.5.0
[2K    Uninstalling triton-3.5.0:
[2K      Successfully uninstalled triton-3.5.0
[2K  Attempting uninstall: torch━━━━━━━━━━━━━━━━━━━[0m [32m0/2[0m [triton]
[2K    Found existing installation: torch 2.

In [3]:
!pip uninstall torchvision -y
!pip uninstall flash-attn -y

[0m

In [4]:
#!/usr/bin/env python3
"""
CUDA Version Forensics Experiment
Tests if different CUDA toolkit versions create detectable activation deviations
Same PyTorch version (2.4.0), different CUDA toolkits (11.8 vs 12.0)
"""

import os
os.environ['HF_HOME'] = '/workspace/huggingface_cache'
os.environ['TRANSFORMERS_CACHE'] = '/workspace/huggingface_cache'
os.environ['HF_DATASETS_CACHE'] = '/workspace/huggingface_cache'

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import gc
import numpy as np
from datetime import datetime
import json
import socket

print("="*60)
print("CUDA VERSION FORENSICS EXPERIMENT")
print("="*60)

# System info
HOSTNAME = socket.gethostname()
CONTAINER_ID = os.environ.get('HOSTNAME', 'unknown')

print(f"\nSystem Info:")
print(f"  Hostname: {HOSTNAME}")
print(f"  Container: {CONTAINER_ID}")
print(f"  GPU: {torch.cuda.get_device_name(0)}")
print(f"  PyTorch: {torch.__version__}")
print(f"  CUDA Runtime: {torch.version.cuda}")

# Extract CUDA version from PyTorch build
pytorch_version = torch.__version__
if '+cu' in pytorch_version:
    cuda_build = pytorch_version.split('+cu')[1].split('.')[0]  # e.g., "118" or "120"
    cuda_version_str = f"cu{cuda_build}"
    print(f"  CUDA Build: {cuda_version_str}")
else:
    print(f"  WARNING: Could not detect CUDA version from PyTorch build")
    cuda_version_str = "unknown"

print()

# Check GPU memory
mem_total = torch.cuda.get_device_properties(0).total_memory / 1024**3
mem_allocated = torch.cuda.memory_allocated(0) / 1024**3
print(f"GPU Memory:")
print(f"  Total: {mem_total:.1f} GB")
print(f"  Currently allocated: {mem_allocated:.2f} GB")
print()

def collect_activations_multilayer(model, tokenizer, prompt, device="cuda"):
    """Extract hidden states from multiple layers"""
    torch.cuda.synchronize()
    torch.cuda.empty_cache()
    gc.collect()
    
    inputs = tokenizer([prompt], return_tensors="pt", padding=True)
    seq_len = inputs['input_ids'].shape[1]
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True, use_cache=False)
    
    last_valid_pos = inputs['attention_mask'][0].sum() - 1
    num_layers = len(outputs.hidden_states) - 1
    
    # Sample layers: dense early, then sparse
    layer_indices = [1, 2, 3, 4, 7, 10, 14, 18, 22, num_layers]
    
    activations = {}
    for idx in layer_indices:
        layer_activation = outputs.hidden_states[idx][0, last_valid_pos, :].cpu().clone()
        activations[f"layer_{idx}"] = layer_activation
    
    # Aggressive cleanup
    del outputs.hidden_states
    del outputs
    del inputs
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    
    return activations, seq_len, layer_indices

# Setup
CACHE_DIR = '/workspace/huggingface_cache'
model_name = "Qwen/Qwen2.5-7B-Instruct"

print(f"Loading {model_name} in BF16...")
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    cache_dir=CACHE_DIR,
    low_cpu_mem_usage=True,
    device_map="auto",
    attn_implementation="eager"
)

# Check what attention implementation is actually being used
config_impl = getattr(model.config, '_attn_implementation', 'not set')
print(f"Model attention implementation: {config_impl}")

# Load prompt from file, or use default
prompt_file = "dummytext.txt"
try:
    with open(prompt_file, 'r', encoding='utf-8') as f:
        prompt = f.read().strip()
    print(f"✓ Loaded prompt from {prompt_file}")
except FileNotFoundError:
    prompt = """The development of large language models has fundamentally transformed natural language processing 
and artificial intelligence more broadly. These models, trained on vast corpora of text data, have demonstrated 
remarkable capabilities across a wide range of tasks, from translation and summarization to question answering 
and creative writing. However, their deployment raises significant challenges related to computational efficiency, 
interpretability, and safety."""
    print(f"⚠ Using default prompt")

prompt_tokens = len(tokenizer.encode(prompt))
print(f"Prompt tokens: {prompt_tokens}")
print()

# Run experiment
num_repetitions = 5
results = {}
all_activations = {}
layer_indices = None

print(f"{'='*60}")
print(f"CUDA VERSION COMPARISON EXPERIMENT")
print(f"CUDA Build: {cuda_version_str}")
print(f"Model: {model_name}")
print(f"Precision: BF16 (bfloat16)")
print(f"Attention: flash_attention_2")
print(f"Prompt tokens: {prompt_tokens}")
print(f"Repetitions: {num_repetitions}")
print(f"{'='*60}\n")

print(f"\nCollecting multi-layer activations ({num_repetitions} repetitions)...")

for rep in range(num_repetitions):
    mem_before = torch.cuda.memory_allocated(0) / 1024**3
    
    activations, seq_len, layer_idx_list = collect_activations_multilayer(
        model, tokenizer, prompt, device="cuda"
    )
    
    mem_after = torch.cuda.memory_allocated(0) / 1024**3
    
    if layer_indices is None:
        layer_indices = layer_idx_list
        print(f"  Extracting from layers: {layer_indices}")
        for layer_name in activations.keys():
            results[layer_name] = []
    
    # Store activations for each layer
    for layer_name, activation in activations.items():
        results[layer_name].append(activation)
    
    if rep == 0:
        print(f"  Rep 0 norms: {', '.join([f'{k}={torch.norm(v).item():.2f}' for k, v in activations.items()])}")
        print(f"  Rep 0 memory: before={mem_before:.2f}GB, after={mem_after:.2f}GB")
    
    # Cleanup
    del activations
    gc.collect()
    torch.cuda.empty_cache()

print(f"  ✓ Completed all repetitions\n")

# Stack repetitions into tensors
for layer_name in results.keys():
    results[layer_name] = torch.stack(results[layer_name])

# Check repeatability
last_layer_name = f"layer_{layer_indices[-1]}"
first_rep = results[last_layer_name][0]
all_identical = all(
    torch.equal(first_rep, results[last_layer_name][i]) 
    for i in range(1, num_repetitions)
)
print(f"Repeatability (last layer): {'✓ All identical' if all_identical else '⚠ Varies'}")

# Convert to numpy for JSON storage
for layer_name, tensor in results.items():
    all_activations[layer_name] = tensor.float().numpy().tolist()

# Compute statistics
layer_stats = {}
for layer_name in results.keys():
    mean_activation = results[layer_name].mean(dim=0)
    layer_stats[layer_name] = {
        "mean_norm": float(torch.norm(mean_activation)),
        "std_within_reps": float(torch.stack([
            torch.norm(results[layer_name][i] - mean_activation) 
            for i in range(num_repetitions)
        ]).std())
    }

print("\nLayer activation norms:")
for layer_name in sorted(layer_stats.keys(), key=lambda x: int(x.split('_')[1])):
    norm = layer_stats[layer_name]["mean_norm"]
    print(f"  {layer_name}: {norm:.2f}")

# Save results
output = {
    "experiment": "cuda_version_forensics",
    "timestamp": datetime.now().isoformat(),
    "model": model_name,
    "hardware": {
        "gpu": torch.cuda.get_device_name(0),
        "hostname": HOSTNAME,
        "container_id": CONTAINER_ID
    },
    "software": {
        "pytorch_version": torch.__version__,
        "cuda_runtime": torch.version.cuda,
        "cuda_build": cuda_version_str,
        "attention_implementation": config_impl
    },
    "config": {
        "dtype": "bfloat16",
        "prompt_tokens": prompt_tokens,
        "repetitions": num_repetitions,
        "layers_sampled": layer_indices,
        "hidden_dim": int(results[last_layer_name][0].shape[0])
    },
    "layer_statistics": layer_stats,
    "reproducibility": {
        "all_repetitions_identical": all_identical
    },
    "raw_activations": all_activations
}

# Filename includes CUDA version
gpu_name = torch.cuda.get_device_name(0).replace(' ', '_')
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_file = f"{gpu_name}_{cuda_version_str}_7b_bf16_eager_{timestamp}.json"
output_path = f"/workspace/experiments/{output_file}"

with open(output_path, "w") as f:
    json.dump(output, f, indent=2)

print(f"\n✓ Results saved to {output_path}")
print(f"✓ File size: ~{len(json.dumps(output)) / 1024:.1f} KB")

print("\n" + "="*60)
print("EXPERIMENT COMPLETE")
print(f"CUDA Build: {cuda_version_str}")
print("="*60)
print("\nNext steps:")
print("1. If this was cu118: Restart kernel, install cu120, run again")
print("2. If this was cu120: Compare the two JSON files")
print("="*60)



CUDA VERSION FORENSICS EXPERIMENT

System Info:
  Hostname: 75a750b4edaf
  Container: 75a750b4edaf
  GPU: NVIDIA A100 80GB PCIe
  PyTorch: 2.4.0+cu118
  CUDA Runtime: 11.8
  CUDA Build: cu118

GPU Memory:
  Total: 79.3 GB
  Currently allocated: 0.00 GB

Loading Qwen/Qwen2.5-7B-Instruct in BF16...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model attention implementation: eager
✓ Loaded prompt from dummytext.txt
Prompt tokens: 3404

CUDA VERSION COMPARISON EXPERIMENT
CUDA Build: cu118
Model: Qwen/Qwen2.5-7B-Instruct
Precision: BF16 (bfloat16)
Attention: flash_attention_2
Prompt tokens: 3404
Repetitions: 5


Collecting multi-layer activations (5 repetitions)...
  Extracting from layers: [1, 2, 3, 4, 7, 10, 14, 18, 22, 28]
  Rep 0 norms: layer_1=8.62, layer_2=14.75, layer_3=17.12, layer_4=18.50, layer_7=35.75, layer_10=55.75, layer_14=65.50, layer_18=79.00, layer_22=159.00, layer_28=308.00
  Rep 0 memory: before=14.19GB, after=14.20GB
  ✓ Completed all repetitions

Repeatability (last layer): ✓ All identical

Layer activation norms:
  layer_1: 8.62
  layer_2: 14.75
  layer_3: 17.12
  layer_4: 18.50
  layer_7: 35.75
  layer_10: 55.75
  layer_14: 65.50
  layer_18: 79.00
  layer_22: 159.00
  layer_28: 308.00

✓ Results saved to /workspace/experiments/NVIDIA_A100_80GB_PCIe_cu118_7b_bf16_eager_20251103_135936.json
✓ File size: ~

In [5]:
# Restart kernel/Python completely
!pip uninstall torch flash-attn -y

# Install PyTorch first
!pip install torch==2.4.0+cu120 --index-url https://download.pytorch.org/whl/cu120

# Then install everything else
!pip install transformers hf_transfer accelerate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Found existing installation: torch 2.4.0+cu118
Uninstalling torch-2.4.0+cu118:
  Successfully uninstalled torch-2.4.0+cu118
[0m

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: https://download.pytorch.org/whl/cu120
[31mERROR: Could not find a version that satisfies the requirement torch==2.4.0+cu120 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for torch==2.4.0+cu120[0m[31m
[0m

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting torch>=2.0.0 (from accelerate)
  Using cached torch-2.9.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting triton==3.5.0 (from torch>=2.0.0->accelerate)
  Using cached triton-3.5.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.7 kB)
Using cached torch-2.9.0-cp312-cp312-manylinux_2_28_x86_64.whl (899.7 MB)
Using cached triton-3.5.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (170.5 MB)
Installing collected packages: triton, torch
[2K  Attempting uninstall: triton
[2K    Found existing installation: triton 3.0.0
[2K    Uninstalling triton-3.0.0:
[2K      Successfully uninstalled triton-3.0.0
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [torch]32m1/2[0m [torch]
[1A[2K[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.8.0+cu128 requires torch==2.8.0,

In [6]:
#!/usr/bin/env python3
"""
CUDA Version Forensics Experiment
Tests if different CUDA toolkit versions create detectable activation deviations
Same PyTorch version (2.4.0), different CUDA toolkits (11.8 vs 12.0)
"""

import os
os.environ['HF_HOME'] = '/workspace/huggingface_cache'
os.environ['TRANSFORMERS_CACHE'] = '/workspace/huggingface_cache'
os.environ['HF_DATASETS_CACHE'] = '/workspace/huggingface_cache'

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import gc
import numpy as np
from datetime import datetime
import json
import socket

print("="*60)
print("CUDA VERSION FORENSICS EXPERIMENT")
print("="*60)

# System info
HOSTNAME = socket.gethostname()
CONTAINER_ID = os.environ.get('HOSTNAME', 'unknown')

print(f"\nSystem Info:")
print(f"  Hostname: {HOSTNAME}")
print(f"  Container: {CONTAINER_ID}")
print(f"  GPU: {torch.cuda.get_device_name(0)}")
print(f"  PyTorch: {torch.__version__}")
print(f"  CUDA Runtime: {torch.version.cuda}")

# Extract CUDA version from PyTorch build
pytorch_version = torch.__version__
if '+cu' in pytorch_version:
    cuda_build = pytorch_version.split('+cu')[1].split('.')[0]  # e.g., "118" or "120"
    cuda_version_str = f"cu{cuda_build}"
    print(f"  CUDA Build: {cuda_version_str}")
else:
    print(f"  WARNING: Could not detect CUDA version from PyTorch build")
    cuda_version_str = "unknown"

print()

# Check GPU memory
mem_total = torch.cuda.get_device_properties(0).total_memory / 1024**3
mem_allocated = torch.cuda.memory_allocated(0) / 1024**3
print(f"GPU Memory:")
print(f"  Total: {mem_total:.1f} GB")
print(f"  Currently allocated: {mem_allocated:.2f} GB")
print()

def collect_activations_multilayer(model, tokenizer, prompt, device="cuda"):
    """Extract hidden states from multiple layers"""
    torch.cuda.synchronize()
    torch.cuda.empty_cache()
    gc.collect()
    
    inputs = tokenizer([prompt], return_tensors="pt", padding=True)
    seq_len = inputs['input_ids'].shape[1]
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True, use_cache=False)
    
    last_valid_pos = inputs['attention_mask'][0].sum() - 1
    num_layers = len(outputs.hidden_states) - 1
    
    # Sample layers: dense early, then sparse
    layer_indices = [1, 2, 3, 4, 7, 10, 14, 18, 22, num_layers]
    
    activations = {}
    for idx in layer_indices:
        layer_activation = outputs.hidden_states[idx][0, last_valid_pos, :].cpu().clone()
        activations[f"layer_{idx}"] = layer_activation
    
    # Aggressive cleanup
    del outputs.hidden_states
    del outputs
    del inputs
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    
    return activations, seq_len, layer_indices

# Setup
CACHE_DIR = '/workspace/huggingface_cache'
model_name = "Qwen/Qwen2.5-7B-Instruct"

print(f"Loading {model_name} in BF16...")
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    cache_dir=CACHE_DIR,
    low_cpu_mem_usage=True,
    device_map="auto",
    attn_implementation="eager"
)

# Check what attention implementation is actually being used
config_impl = getattr(model.config, '_attn_implementation', 'not set')
print(f"Model attention implementation: {config_impl}")

# Load prompt from file, or use default
prompt_file = "dummytext.txt"
try:
    with open(prompt_file, 'r', encoding='utf-8') as f:
        prompt = f.read().strip()
    print(f"✓ Loaded prompt from {prompt_file}")
except FileNotFoundError:
    prompt = """The development of large language models has fundamentally transformed natural language processing 
and artificial intelligence more broadly. These models, trained on vast corpora of text data, have demonstrated 
remarkable capabilities across a wide range of tasks, from translation and summarization to question answering 
and creative writing. However, their deployment raises significant challenges related to computational efficiency, 
interpretability, and safety."""
    print(f"⚠ Using default prompt")

prompt_tokens = len(tokenizer.encode(prompt))
print(f"Prompt tokens: {prompt_tokens}")
print()

# Run experiment
num_repetitions = 5
results = {}
all_activations = {}
layer_indices = None

print(f"{'='*60}")
print(f"CUDA VERSION COMPARISON EXPERIMENT")
print(f"CUDA Build: {cuda_version_str}")
print(f"Model: {model_name}")
print(f"Precision: BF16 (bfloat16)")
print(f"Attention: flash_attention_2")
print(f"Prompt tokens: {prompt_tokens}")
print(f"Repetitions: {num_repetitions}")
print(f"{'='*60}\n")

print(f"\nCollecting multi-layer activations ({num_repetitions} repetitions)...")

for rep in range(num_repetitions):
    mem_before = torch.cuda.memory_allocated(0) / 1024**3
    
    activations, seq_len, layer_idx_list = collect_activations_multilayer(
        model, tokenizer, prompt, device="cuda"
    )
    
    mem_after = torch.cuda.memory_allocated(0) / 1024**3
    
    if layer_indices is None:
        layer_indices = layer_idx_list
        print(f"  Extracting from layers: {layer_indices}")
        for layer_name in activations.keys():
            results[layer_name] = []
    
    # Store activations for each layer
    for layer_name, activation in activations.items():
        results[layer_name].append(activation)
    
    if rep == 0:
        print(f"  Rep 0 norms: {', '.join([f'{k}={torch.norm(v).item():.2f}' for k, v in activations.items()])}")
        print(f"  Rep 0 memory: before={mem_before:.2f}GB, after={mem_after:.2f}GB")
    
    # Cleanup
    del activations
    gc.collect()
    torch.cuda.empty_cache()

print(f"  ✓ Completed all repetitions\n")

# Stack repetitions into tensors
for layer_name in results.keys():
    results[layer_name] = torch.stack(results[layer_name])

# Check repeatability
last_layer_name = f"layer_{layer_indices[-1]}"
first_rep = results[last_layer_name][0]
all_identical = all(
    torch.equal(first_rep, results[last_layer_name][i]) 
    for i in range(1, num_repetitions)
)
print(f"Repeatability (last layer): {'✓ All identical' if all_identical else '⚠ Varies'}")

# Convert to numpy for JSON storage
for layer_name, tensor in results.items():
    all_activations[layer_name] = tensor.float().numpy().tolist()

# Compute statistics
layer_stats = {}
for layer_name in results.keys():
    mean_activation = results[layer_name].mean(dim=0)
    layer_stats[layer_name] = {
        "mean_norm": float(torch.norm(mean_activation)),
        "std_within_reps": float(torch.stack([
            torch.norm(results[layer_name][i] - mean_activation) 
            for i in range(num_repetitions)
        ]).std())
    }

print("\nLayer activation norms:")
for layer_name in sorted(layer_stats.keys(), key=lambda x: int(x.split('_')[1])):
    norm = layer_stats[layer_name]["mean_norm"]
    print(f"  {layer_name}: {norm:.2f}")

# Save results
output = {
    "experiment": "cuda_version_forensics",
    "timestamp": datetime.now().isoformat(),
    "model": model_name,
    "hardware": {
        "gpu": torch.cuda.get_device_name(0),
        "hostname": HOSTNAME,
        "container_id": CONTAINER_ID
    },
    "software": {
        "pytorch_version": torch.__version__,
        "cuda_runtime": torch.version.cuda,
        "cuda_build": cuda_version_str,
        "attention_implementation": config_impl
    },
    "config": {
        "dtype": "bfloat16",
        "prompt_tokens": prompt_tokens,
        "repetitions": num_repetitions,
        "layers_sampled": layer_indices,
        "hidden_dim": int(results[last_layer_name][0].shape[0])
    },
    "layer_statistics": layer_stats,
    "reproducibility": {
        "all_repetitions_identical": all_identical
    },
    "raw_activations": all_activations
}

# Filename includes CUDA version
gpu_name = torch.cuda.get_device_name(0).replace(' ', '_')
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_file = f"{gpu_name}_{cuda_version_str}_7b_bf16_eager_{timestamp}.json"
output_path = f"/workspace/experiments/{output_file}"

with open(output_path, "w") as f:
    json.dump(output, f, indent=2)

print(f"\n✓ Results saved to {output_path}")
print(f"✓ File size: ~{len(json.dumps(output)) / 1024:.1f} KB")

print("\n" + "="*60)
print("EXPERIMENT COMPLETE")
print(f"CUDA Build: {cuda_version_str}")
print("="*60)
print("\nNext steps:")
print("1. If this was cu118: Restart kernel, install cu120, run again")
print("2. If this was cu120: Compare the two JSON files")
print("="*60)

CUDA VERSION FORENSICS EXPERIMENT

System Info:
  Hostname: 75a750b4edaf
  Container: 75a750b4edaf
  GPU: NVIDIA A100 80GB PCIe
  PyTorch: 2.4.0+cu118
  CUDA Runtime: 11.8
  CUDA Build: cu118

GPU Memory:
  Total: 79.3 GB
  Currently allocated: 14.20 GB

Loading Qwen/Qwen2.5-7B-Instruct in BF16...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model attention implementation: eager
✓ Loaded prompt from dummytext.txt
Prompt tokens: 3404

CUDA VERSION COMPARISON EXPERIMENT
CUDA Build: cu118
Model: Qwen/Qwen2.5-7B-Instruct
Precision: BF16 (bfloat16)
Attention: flash_attention_2
Prompt tokens: 3404
Repetitions: 5


Collecting multi-layer activations (5 repetitions)...
  Extracting from layers: [1, 2, 3, 4, 7, 10, 14, 18, 22, 28]
  Rep 0 norms: layer_1=8.62, layer_2=14.75, layer_3=17.12, layer_4=18.50, layer_7=35.75, layer_10=55.75, layer_14=65.50, layer_18=79.00, layer_22=159.00, layer_28=308.00
  Rep 0 memory: before=26.36GB, after=14.20GB
  ✓ Completed all repetitions

Repeatability (last layer): ✓ All identical

Layer activation norms:
  layer_1: 8.62
  layer_2: 14.75
  layer_3: 17.12
  layer_4: 18.50
  layer_7: 35.75
  layer_10: 55.75
  layer_14: 65.50
  layer_18: 79.00
  layer_22: 159.00
  layer_28: 308.00

✓ Results saved to /workspace/experiments/NVIDIA_A100_80GB_PCIe_cu118_7b_bf16_eager_20251103_140148.json
✓ File size: ~