In [1]:
#!/usr/bin/env python3
"""
Batch Size Control Test - DECODE Phase
Control experiment: Does batch SIZE affect sequence 0's key vectors during decode?
- Same sequence 0 across all runs
- Different batch sizes: 1 vs 4
- Generate 30 tokens autoregressively
- Compare sequence 0's key vectors at final decode step

This SHOULD show a difference (otherwise verification is impossible)
"""

import os
os.environ['HF_HOME'] = '/workspace/huggingface_cache'
os.environ['TRANSFORMERS_CACHE'] = '/workspace/huggingface_cache'
os.environ['HF_DATASETS_CACHE'] = '/workspace/huggingface_cache'

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import numpy as np
from datetime import datetime
import json
import socket

HOSTNAME = socket.gethostname()
CONTAINER_ID = os.environ.get('HOSTNAME', 'unknown')

print("="*60)
print("BATCH SIZE CONTROL TEST - DECODE")
print("="*60)
print(f"System Info:")
print(f"  Hostname: {HOSTNAME}")
print(f"  Container: {CONTAINER_ID}")
print(f"  GPU: {torch.cuda.get_device_name(0)}")
print(f"  PyTorch: {torch.__version__}")
print(f"  CUDA: {torch.version.cuda}")
print()

# Capture relevant environment variables
print("Environment Variables:")
env_vars = {}
for key in sorted(os.environ.keys()):
    if any(x in key.upper() for x in ['CUDA', 'TORCH', 'NCCL', 'CUDNN', 'PYTORCH']):
        env_vars[key] = os.environ[key]
        print(f"  {key}={os.environ[key]}")
if not env_vars:
    print("  (No CUDA/TORCH env vars set)")
print()

CACHE_DIR = '/workspace/huggingface_cache'
model_name = "Qwen/Qwen2.5-7B-Instruct"

print(f"Loading {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    cache_dir=CACHE_DIR,
    low_cpu_mem_usage=True,
    device_map="auto"
)

num_layers = len(model.model.layers)
num_heads = model.config.num_attention_heads
head_dim = model.config.hidden_size // num_heads
key_vector_dim = num_heads * head_dim

print(f"Model: {model_name}")
print(f"Layers: {num_layers}, Heads: {num_heads}, Key dim: {key_vector_dim}\n")

# Use same sequences
raw_sequences = [
    """The automated data-processing pipeline ingests raw telemetry from distributed sensors 
    across multiple geographic locations. A proprietary algorithm then normalizes the dataset, 
    filtering for anomalies based on predefined statistical parameters derived from historical 
    patterns. The resulting output is a clean, structured matrix ready for machine learning model 
    ingestion and downstream analytical workflows. System efficiency is monitored in real-time 
    through a comprehensive dashboard, with automated alerts triggered if latency exceeds the 
    established threshold or if data quality metrics fall below acceptable ranges. Advanced 
    compression techniques optimize storage utilization across the distributed infrastructure.
    Performance metrics are tracked continuously to ensure optimal throughput and minimal latency.""",
    
    """Climate modeling techniques have advanced significantly through the integration of 
    high-resolution satellite imagery and ground-based observation networks. Researchers combine 
    atmospheric physics equations with empirical data to simulate complex weather patterns and 
    predict long-term climate trends. These models incorporate ocean currents, ice sheet dynamics, 
    and greenhouse gas concentrations to provide increasingly accurate projections for policymakers 
    and environmental scientists worldwide. Modern computational infrastructure enables simulations 
    at unprecedented scales and temporal resolution with remarkable accuracy and scientific validity.
    International collaboration facilitates data sharing and model validation across research institutions.""",
    
    """Quantum entanglement represents one of the most counterintuitive phenomena in modern physics, 
    where particles become correlated in ways that defy classical explanations. When two particles 
    are entangled, measuring the state of one instantaneously affects the other regardless of the 
    distance separating them. This property has profound implications for quantum computing and 
    cryptography, enabling novel approaches to information processing and secure communication that 
    are fundamentally impossible with classical systems and traditional computational paradigms. Research 
    continues to explore practical applications of these quantum mechanical principles. Experimental 
    verification requires sophisticated detection equipment and precisely controlled laboratory conditions.""",
    
    """The human immune system comprises an intricate network of cells, tissues, and organs that 
    work collaboratively to defend against pathogens and foreign substances. White blood cells 
    patrol the bloodstream and tissues, identifying and neutralizing threats through both innate 
    and adaptive immune responses. B cells produce antibodies that target specific antigens, while 
    T cells orchestrate cellular immunity and eliminate infected cells. This sophisticated biological 
    defense mechanism evolved over millions of years to protect organisms. Memory cells enable rapid 
    responses to previously encountered pathogens through accelerated antibody production. The lymphatic 
    system transports immune cells throughout the body to maintain comprehensive surveillance.""",
]

TARGET_LENGTH = 100
token_ids_list = []

print("Tokenizing:")
for i, text in enumerate(raw_sequences):
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    if len(token_ids) < TARGET_LENGTH:
        raise ValueError(f"Sequence {i} has only {len(token_ids)} tokens")
    token_ids = token_ids[:TARGET_LENGTH]
    token_ids_list.append(token_ids)
    print(f"  Seq {i}: {len(token_ids)} tokens")

print()

def collect_key_vector_decode(model, tokenizer, token_ids_batch, max_new_tokens=30, device="cuda"):
    input_ids = torch.tensor(token_ids_batch, dtype=torch.long).to(device)
    attention_mask = torch.ones_like(input_ids)
    
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            return_dict_in_generate=True,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id
        )
    
    if not hasattr(outputs, 'past_key_values') or outputs.past_key_values is None:
        raise RuntimeError("No KV cache returned")
    
    past_kv = outputs.past_key_values
    key_cache = past_kv[-1][0]  # Last layer keys
    seq_0_keys = key_cache[0, :, -1, :]
    key_vector = seq_0_keys.reshape(-1).cpu().clone()
    
    seq_0_generated = outputs.sequences[0, input_ids.shape[1]:].cpu().tolist()
    
    del outputs
    torch.cuda.empty_cache()
    
    return key_vector, seq_0_generated

batch_configs = {
    "bs1": [token_ids_list[0]],
    "bs4": [token_ids_list[0], token_ids_list[1], token_ids_list[2], token_ids_list[3]],
}

print("="*60)
print("EXPERIMENT: Batch Size Control - DECODE")
print("="*60)
print(f"Repetitions: 5 per batch size")
print(f"EXPECTED: bs1 ≠ bs4 (otherwise no signal for verification)")
print()

num_reps = 5
results = {}
generated_texts = {}

for bs_name, batch in batch_configs.items():
    bs = len(batch)
    print(f"Batch size {bs}:")
    
    runs = []
    gen_tokens_list = []
    
    for rep in range(num_reps):
        key_vec, gen_tokens = collect_key_vector_decode(
            model, tokenizer, batch, max_new_tokens=30, device="cuda"
        )
        runs.append(key_vec)
        gen_tokens_list.append(gen_tokens)
        
        if rep == 0:
            gen_text = tokenizer.decode(gen_tokens, skip_special_tokens=True)
            print(f"  Generated: '{gen_text[:60]}...'")
            print(f"  Key: norm={torch.norm(key_vec).item():.6f}, first={key_vec[0].item():.6f}")
    
    results[bs_name] = torch.stack(runs)
    generated_texts[bs_name] = gen_tokens_list
    
    # Check reproducibility
    first = runs[0]
    all_identical = all(torch.equal(first, runs[i]) for i in range(1, num_reps))
    print(f"  {'✓' if all_identical else '⚠'} Reproducibility within batch size")
    print()

print("="*60)
print("ANALYSIS: Does Batch Size Matter?")
print("="*60)

bs1_mean = results["bs1"].mean(dim=0)
bs4_mean = results["bs4"].mean(dim=0)

l2_distance = torch.norm(bs1_mean - bs4_mean).item()
rel_diff = l2_distance / torch.norm(bs1_mean).item() if torch.norm(bs1_mean) > 0 else 0

print(f"L2 distance (bs1 vs bs4): {l2_distance:.6f}")
print(f"Relative difference: {rel_diff:.8f}")
print()

diff = (bs1_mean - bs4_mean).abs()
print(f"Max element diff: {diff.max().item():.8f}")
print(f"Elements with |diff| > 1e-6: {(diff > 1e-6).sum().item()}/{diff.shape[0]}")
print()

# Check generated tokens
bs1_tokens = generated_texts["bs1"][0]
bs4_tokens = generated_texts["bs4"][0]
tokens_match = bs1_tokens == bs4_tokens

print("="*60)
print("VERDICT")
print("="*60)
print(f"Generated tokens: {'SAME' if tokens_match else 'DIFFERENT'}")
print()

if l2_distance > 0.1:
    print("✓ STRONG SIGNAL: Batch size creates clear differences")
    print(f"  L2 distance: {l2_distance:.4f}")
    print("  → Verification CAN detect hidden batch capacity")
    print("  → ✓✓✓ BATCH SIZE IS A VIABLE FORENSIC SIGNAL")
elif l2_distance > 0.001:
    print("⚠ WEAK SIGNAL: Small but detectable batch size effect")
    print(f"  L2 distance: {l2_distance:.6f}")
    print("  → May be sufficient for verification")
elif l2_distance > 0:
    print("⚠ VERY WEAK: Minimal batch size effect")
    print(f"  L2 distance: {l2_distance:.8f}")
    print("  → Signal might be too weak for practical verification")
else:
    print("✗ NO SIGNAL: Batch size has no effect")
    print("  → Cannot detect hidden capacity via batch size")
    print("  → ✗✗ VERIFICATION SCHEME FAILS")

output = {
    "experiment": "batch_size_control_decode_test",
    "timestamp": datetime.now().isoformat(),
    "model": model_name,
    "hardware": {
        "gpu": torch.cuda.get_device_name(0),
        "pytorch": torch.__version__,
        "cuda": torch.version.cuda,
        "hostname": HOSTNAME,
        "container_id": CONTAINER_ID
    },
    "environment": env_vars,
    "config": {
        "batch_sizes": [1, 4],
        "prefill_length": TARGET_LENGTH,
        "decode_length": 30,
        "repetitions": num_reps,
        "dtype": "bfloat16",
    },
    "results": {
        "l2_distance_bs1_vs_bs4": l2_distance,
        "relative_difference": rel_diff,
        "tokens_match": tokens_match,
        "max_element_diff": diff.max().item(),
    },
    "conclusion": (
        "strong_signal" if l2_distance > 0.1 else
        "weak_signal" if l2_distance > 0.001 else
        "very_weak" if l2_distance > 0 else
        "no_signal"
    )
}

output_file = f"batch_size_control_decode_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
output_path = f"/mnt/user-data/outputs/{output_file}"

os.makedirs("/mnt/user-data/outputs", exist_ok=True)
with open(output_path, "w") as f:
    json.dump(output, f, indent=2)

print(f"\n✓ Results saved to {output_path}")
print("="*60)



BATCH SIZE CONTROL TEST - DECODE
System Info:
  Hostname: d07c0e0bfd13
  Container: d07c0e0bfd13
  GPU: NVIDIA A40
  PyTorch: 2.8.0+cu128
  CUDA: 12.8

Environment Variables:
  CUDA_MODULE_LOADING=LAZY
  CUDA_VERSION=12.8.1
  NCCL_VERSION=2.25.1-1
  NVIDIA_REQUIRE_CUDA=cuda>=12.8 brand=unknown,driver>=470,driver<471 brand=grid,driver>=470,driver<471 brand=tesla,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 brand=quadrortx,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 brand=vapps,driver>=470,driver<471 brand=vpc,driver>=470,driver<471 brand=vcs,driver>=470,driver<471 brand=vws,driver>=470,driver<471 brand=cloudgaming,driver>=470,driver<471 brand=unknown,driver>=535,driver<536 brand=grid,driver>=535,driver<536 brand=tesla,driver>=535,driver<536 brand=nvidia,driver>=535,driver<536 brand=quadro,driver>=535,driver<536 brand=quadrortx,driver>=535,driver<536 brand=nvidiartx,driver>=535,driver<536 brand=vapps,driver>=535,driver<5

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Model: Qwen/Qwen2.5-7B-Instruct
Layers: 28, Heads: 28, Key dim: 3584

Tokenizing:
  Seq 0: 100 tokens
  Seq 1: 100 tokens
  Seq 2: 100 tokens
  Seq 3: 100 tokens

EXPERIMENT: Batch Size Control - DECODE
Repetitions: 5 per batch size
EXPECTED: bs1 ≠ bs4 (otherwise no signal for verification)

Batch size 1:
  Generated: ' acceptable levels.

How would you modify this system to inc...'
  Key: norm=924.000000, first=0.028809
  ✓ Reproducibility within batch size

Batch size 4:
  Generated: ' acceptable levels.

How would you modify this system to inc...'
  Key: norm=924.000000, first=0.026367
  ✓ Reproducibility within batch size

ANALYSIS: Does Batch Size Matter?
L2 distance (bs1 vs bs4): 0.330078
Relative difference: 0.00035723

Max element diff: 0.25000000
Elements with |diff| > 1e-6: 392/512

VERDICT
Generated tokens: SAME

✓ STRONG SIGNAL: Batch size creates clear differences
  L2 distance: 0.3301
  → Verification CAN detect hidden batch capacity
  → ✓✓✓ BATCH SIZE IS A VIABLE FORENSI