In [2]:
import torch

# Start the session from a clean GPU-memory baseline. Guarded so that this
# cell is a no-op on a machine without CUDA instead of raising.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()  # zero the peak-memory counters
    torch.cuda.empty_cache()  # hand cached, unused allocator blocks back to the driver

In [4]:
#!/usr/bin/env python3
"""
vLLM Logprobs Extraction Test - Standalone Script
Test if we can extract logprobs from vLLM for reproducibility forensics

Paste this entire script into a Jupyter cell and run.
"""

# ============================================================================
# CONFIG SECTION - CHANGE THESE PARAMETERS
# ============================================================================

# Model configuration
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"  # kept small so the test is quick to run
TENSOR_PARALLEL_SIZE = 1                 # single GPU, no sharding
MAX_MODEL_LEN = 2048                     # short context window to save memory
GPU_MEMORY_UTILIZATION = 0.4             # fraction of GPU memory given to vLLM

# Generation parameters
NUM_REPETITIONS = 5   # number of measured runs of the same prompt
MAX_TOKENS = 20       # tokens generated per run
TEMPERATURE = 0.0     # 0.0 selects greedy decoding
SEED = 42

# Input text
# Spelled out piece by piece so the trailing space before every line break is
# visible and cannot be stripped silently by an editor.
INPUT_TEXT = (
    "The field of artificial intelligence has witnessed remarkable \n"
    "transformation over the past decade, driven primarily by advances in deep learning \n"
    "and the emergence of increasingly sophisticated language models. These models, trained \n"
    "on vast corpora of text data, have demonstrated remarkable capabilities across a wide \n"
    "range of tasks."
)

# ============================================================================
# IMPORTS
# ============================================================================

import os
# Point the Hugging Face caches at the workspace volume. These assignments are
# kept above the vllm import — presumably the Hugging Face libraries read them
# at import time (TODO confirm), so do not move them below it.
os.environ['HF_HOME'] = '/workspace/huggingface_cache'
os.environ['TRANSFORMERS_CACHE'] = '/workspace/huggingface_cache'  # NOTE(review): believed to be a legacy variable superseded by HF_HOME — confirm before removing

from vllm import LLM, SamplingParams
import numpy as np
from datetime import datetime
import json
import torch

# ============================================================================
# SETUP
# ============================================================================

# Banner plus the active configuration, emitted as one multi-line print.
print(
    "=" * 80,
    "vLLM LOGPROBS EXTRACTION TEST",
    "=" * 80,
    "\nConfiguration:",
    f"  Model: {MODEL_NAME}",
    f"  Tensor parallel: {TENSOR_PARALLEL_SIZE}",
    f"  Max model len: {MAX_MODEL_LEN}",
    f"  GPU memory utilization: {GPU_MEMORY_UTILIZATION} ({GPU_MEMORY_UTILIZATION*100:.0f}%)",
    f"  Max tokens: {MAX_TOKENS}",
    f"  Repetitions: {NUM_REPETITIONS}",
    f"  Temperature: {TEMPERATURE} (greedy)",
    "",
    sep="\n",
)

# GPU details stay line-by-line so the header is already on screen if the
# device query raises.
print("GPU Info:")
print(f"  Device: {torch.cuda.get_device_name(0)}")
print(f"  Available: {torch.cuda.device_count()} GPUs", end="\n\n")

# ============================================================================
# LOAD MODEL
# ============================================================================

# Executing code shipped inside a model repository is opt-in. The checkpoint
# configured here loads through an architecture built into vLLM, so nothing
# from the repository has to be executed. Set this to True only for a
# checkpoint whose custom modelling/tokenizer code you have reviewed.
TRUST_REMOTE_CODE = False

print("Loading model with vLLM...")
llm = LLM(
    model=MODEL_NAME,
    tensor_parallel_size=TENSOR_PARALLEL_SIZE,
    download_dir="/workspace/huggingface_cache",  # weights are downloaded into / read from this directory
    dtype="bfloat16",
    max_model_len=MAX_MODEL_LEN,
    gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
    trust_remote_code=TRUST_REMOTE_CODE,
)
print("✓ Model loaded\n")

# ============================================================================
# SAMPLING PARAMETERS
# ============================================================================

# One SamplingParams object is built here from the config constants.
# logprobs=1 asks vLLM to attach per-token log-probabilities to each output.
sampling_params = SamplingParams(
    seed=SEED,
    logprobs=1,
    max_tokens=MAX_TOKENS,
    temperature=TEMPERATURE,
)

print(
    f"Sampling params: temperature={TEMPERATURE}, max_tokens={MAX_TOKENS}, seed={SEED}",
    end="\n\n",
)

# ============================================================================
# LOGPROBS EXTRACTION FUNCTION
# ============================================================================

def extract_logprobs_vector(output):
    """
    Extract the log-probability of every generated token from a vLLM output.

    Only the first completion (``output.outputs[0]``) is inspected.

    Args:
        output: Object exposing ``outputs[0].token_ids`` (the generated token
            ids) and ``outputs[0].logprobs`` (one entry per generated token,
            each a mapping from token id to an object with a ``.logprob``
            attribute; the whole attribute or a single entry may be ``None``).

    Returns:
        A 1-D ``float64`` numpy array with one element per entry of
        ``outputs[0].logprobs``. A position whose log-probability is missing
        holds ``NaN``, so element ``i`` always refers to generated token ``i``
        and the array stays numeric (``np.mean`` and subtraction keep working).
        An empty array is returned when no log-probabilities were attached.
    """
    completion = output.outputs[0]
    token_ids = completion.token_ids
    per_token_logprobs = completion.logprobs

    # No log-probabilities were attached to this completion at all.
    if per_token_logprobs is None:
        return np.array([], dtype=np.float64)

    logprobs_list = []
    for i, token_logprobs_dict in enumerate(per_token_logprobs):
        generated_token_id = token_ids[i]

        if token_logprobs_dict is None:
            # Keep the slot so later positions are not shifted.
            print(f"Warning: No logprobs returned at position {i}")
            logprobs_list.append(np.nan)
            continue

        logprob_obj = token_logprobs_dict.get(generated_token_id)
        if logprob_obj is None:
            # This shouldn't happen; record NaN (not None) to keep a float dtype.
            print(f"Warning: Token {generated_token_id} not in logprobs dict at position {i}")
            logprobs_list.append(np.nan)
        else:
            logprobs_list.append(logprob_obj.logprob)

    # Explicit dtype: never fall back to an object array.
    return np.array(logprobs_list, dtype=np.float64)

# ============================================================================
# WARM-UP PASS
# ============================================================================

# One throwaway generation ahead of the measured runs; its result is discarded.
print("=" * 80, "WARM-UP PASS", "=" * 80, "", sep="\n")
print("Running warm-up pass to initialize CUDA kernels...")
_ = llm.generate([INPUT_TEXT], sampling_params)
print("✓ Warm-up complete - CUDA kernels compiled and cached", end="\n\n")

# ============================================================================
# RUN REPEATED FORWARD PASSES
# ============================================================================

print("=" * 80, f"RUNNING {NUM_REPETITIONS} FORWARD PASSES", "=" * 80, "", sep="\n")

# One entry per repetition; the three lists are index-aligned.
results_logprobs, results_tokens, results_texts = [], [], []

for rep in range(NUM_REPETITIONS):
    print(f"Rep {rep + 1}/{NUM_REPETITIONS}:", end=" ")

    # A one-prompt batch is submitted, so the single result is taken at index 0.
    output = llm.generate([INPUT_TEXT], sampling_params)[0]
    completion = output.outputs[0]

    # Pull out the three artefacts compared later on.
    logprobs_vec = extract_logprobs_vector(output)
    token_ids = completion.token_ids
    text = completion.text

    results_logprobs.append(logprobs_vec)
    results_tokens.append(token_ids)
    results_texts.append(text)

    print(f"{len(token_ids)} tokens, mean logprob={np.mean(logprobs_vec):.4f}")

print()

# Short preview of repetition 0 only.
print(
    "First generation:",
    f"  Text: '{results_texts[0][:100]}...'",
    f"  Tokens: {results_tokens[0][:10]}...",
    f"  Logprobs (first 5): {results_logprobs[0][:5]}",
    "",
    sep="\n",
)

# ============================================================================
# ANALYSIS
# ============================================================================

print("="*80)
print("REPRODUCIBILITY ANALYSIS")
print("="*80)
print()

# --- Token sequences: every repetition is compared against repetition 0 ---
first_tokens = results_tokens[0]
tokens_identical = all(results_tokens[i] == first_tokens for i in range(1, NUM_REPETITIONS))

print(f"Token sequences identical: {tokens_identical}")
if not tokens_identical:
    for i in range(1, NUM_REPETITIONS):
        if results_tokens[i] != first_tokens:
            print(f"  Rep 0 vs Rep {i}: DIFFER")

print()

# --- Logprobs: exact equality first, distance statistics only if that fails ---
first_logprobs = results_logprobs[0]

# np.array_equal is False for arrays of different shape, so a run that
# generated a different number of tokens is reported as not exact.
logprobs_exact = all(np.array_equal(first_logprobs, results_logprobs[i])
                     for i in range(1, NUM_REPETITIONS))

print(f"Logprobs bit-exact: {logprobs_exact}")

# Always defined so later code can read it regardless of the outcome above.
l2_distances = []

if not logprobs_exact:
    print("\nL2 distances:")
    for i in range(1, NUM_REPETITIONS):
        other_logprobs = results_logprobs[i]
        # Runs may differ in length; compare the shared prefix instead of
        # letting the subtraction fail on mismatched shapes.
        common_len = min(len(first_logprobs), len(other_logprobs))
        l2 = np.linalg.norm(first_logprobs[:common_len] - other_logprobs[:common_len])
        l2_distances.append(l2)
        if len(other_logprobs) == len(first_logprobs):
            length_note = ""
        else:
            length_note = (f"  (lengths {len(first_logprobs)} vs {len(other_logprobs)}; "
                           f"compared first {common_len})")
        print(f"  Rep 0 vs Rep {i}: L2 = {l2:.6e}{length_note}")

    print(f"\nMax L2: {max(l2_distances):.6e}")
    print(f"Mean L2: {np.mean(l2_distances):.6e}")

    # Element-wise statistics over the positions every run has in common;
    # truncating avoids building a ragged array.
    shortest_len = min(len(lp) for lp in results_logprobs)
    all_logprobs = np.array([lp[:shortest_len] for lp in results_logprobs])
    std_per_token = all_logprobs.std(axis=0)
    print(f"\nPer-token std statistics:")
    if std_per_token.size:
        print(f"  Mean: {std_per_token.mean():.6e}")
        print(f"  Max: {std_per_token.max():.6e}")
        print(f"  Median: {np.median(std_per_token):.6e}")
    else:
        # .max() on an empty array raises, so report the situation instead.
        print("  (no token positions shared by all runs)")

print()

# ============================================================================
# VERDICT
# ============================================================================

print("=" * 80, "VERDICT", "=" * 80, "", sep="\n")

if not tokens_identical:
    # The generated tokens themselves changed between runs.
    print("✗ TOKEN SEQUENCES DIFFER")
    print("  - This should NOT happen with temperature=0")
    print("  → Something is wrong, investigate")
elif logprobs_exact:
    # Same tokens and same logprobs in every run.
    print("✓ PERFECT REPRODUCIBILITY")
    print("  - Token sequences: bit-exact")
    print("  - Logprobs: bit-exact")
    print("  - vLLM is deterministic for this config")
    print("  → Ready to scale up to K2 Thinking on Blackwell")
else:
    # Same tokens, but the logprobs moved; grade the size of the drift.
    print("⚠ TOKENS IDENTICAL, LOGPROBS VARY")
    print("  - Token sequences: bit-exact")
    print("  - Logprobs: small numerical variation")
    max_l2 = max(l2_distances)
    if max_l2 < 1e-6:
        print(f"  - Variation very small (L2={max_l2:.2e})")
        print("  → Likely acceptable for forensics")
    else:
        print(f"  - Variation notable (L2={max_l2:.2e})")
        print("  → Investigate noise source")

print()

# ============================================================================
# SAVE RESULTS
# ============================================================================

# Sample the clock once so the timestamp stored in the JSON and the one in
# the file name always refer to the same instant.
saved_at = datetime.now()

output_data = {
    "experiment": "vllm_logprobs_test",
    "timestamp": saved_at.isoformat(),
    # Every setting needed to re-run this exact experiment, prompt included.
    "config": {
        "model": MODEL_NAME,
        "tensor_parallel": TENSOR_PARALLEL_SIZE,
        "max_model_len": MAX_MODEL_LEN,
        "gpu_memory_utilization": GPU_MEMORY_UTILIZATION,
        "max_tokens": MAX_TOKENS,
        "repetitions": NUM_REPETITIONS,
        "temperature": TEMPERATURE,
        "seed": SEED,
        "warmup_enabled": True,
        "input_text": INPUT_TEXT
    },
    "results": {
        "tokens_identical": tokens_identical,
        "logprobs_exact": logprobs_exact,
        "perfect_reproducibility": tokens_identical and logprobs_exact
    },
    # Raw per-repetition data, index-aligned across the three lists.
    "token_sequences": results_tokens,
    "logprobs_vectors": [lp.tolist() for lp in results_logprobs],  # numpy arrays -> plain lists for JSON
    "generated_texts": results_texts
}

output_file = f"/workspace/vllm_logprobs_test_{saved_at.strftime('%Y%m%d_%H%M%S')}.json"
with open(output_file, "w") as f:
    json.dump(output_data, f, indent=2)

print(f"Results saved to: {output_file}")
print()
print("="*80)
print("TEST COMPLETE")
print("="*80)
print()
print("If successful, next steps:")
print("  1. Scale up to K2 Thinking")
print("  2. Test on 4× B200 with tensor parallelism")
print("  3. Run with 100K context")
print()

vLLM LOGPROBS EXTRACTION TEST

Configuration:
  Model: Qwen/Qwen2.5-7B-Instruct
  Tensor parallel: 1
  Max model len: 2048
  GPU memory utilization: 0.4 (40%)
  Max tokens: 20
  Repetitions: 5
  Temperature: 0.0 (greedy)

GPU Info:
  Device: NVIDIA A40
  Available: 1 GPUs

Loading model with vLLM...
INFO 11-09 19:27:04 [utils.py:233] non-default args: {'trust_remote_code': True, 'download_dir': '/workspace/huggingface_cache', 'dtype': 'bfloat16', 'max_model_len': 2048, 'gpu_memory_utilization': 0.4, 'disable_log_stats': True, 'model': 'Qwen/Qwen2.5-7B-Instruct'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 11-09 19:27:04 [model.py:547] Resolved architecture: Qwen2ForCausalLM
INFO 11-09 19:27:04 [model.py:1510] Using max model len 2048
INFO 11-09 19:27:04 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.




INFO 11-09 19:27:09 [__init__.py:216] Automatically detected platform cuda.
[1;36m(EngineCore_DP0 pid=1851)[0;0m INFO 11-09 19:27:11 [core.py:644] Waiting for init message from front-end.
[1;36m(EngineCore_DP0 pid=1851)[0;0m INFO 11-09 19:27:11 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=2048, download_dir='/workspace/huggingface_cache', load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser=''), observabi

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:15<00:47, 15.71s/it]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:34<00:34, 17.44s/it]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:50<00:17, 17.03s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [01:09<00:00, 17.48s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [01:09<00:00, 17.27s/it]
[1;36m(EngineCore_DP0 pid=1851)[0;0m 


[1;36m(EngineCore_DP0 pid=1851)[0;0m INFO 11-09 19:28:23 [default_loader.py:267] Loading weights took 69.25 seconds
[1;36m(EngineCore_DP0 pid=1851)[0;0m INFO 11-09 19:28:24 [gpu_model_runner.py:2653] Model loading took 14.2488 GiB and 69.703528 seconds
[1;36m(EngineCore_DP0 pid=1851)[0;0m INFO 11-09 19:28:29 [backends.py:548] Using cache directory: /root/.cache/vllm/torch_compile_cache/860723eb7f/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_DP0 pid=1851)[0;0m INFO 11-09 19:28:29 [backends.py:559] Dynamo bytecode transform time: 4.70 s
[1;36m(EngineCore_DP0 pid=1851)[0;0m INFO 11-09 19:28:30 [backends.py:164] Directly load the compiled graph(s) for dynamic shape from the cache, took 1.664 s
[1;36m(EngineCore_DP0 pid=1851)[0;0m INFO 11-09 19:28:31 [monitor.py:34] torch.compile takes 4.70 s in total
[1;36m(EngineCore_DP0 pid=1851)[0;0m INFO 11-09 19:28:33 [gpu_worker.py:298] Available KV cache memory: 2.04 GiB
[1;36m(EngineCore_DP0 pid=1851)[0;0m INFO 11-09

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 67/67 [00:04<00:00, 16.13it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:01<00:00, 20.63it/s]


[1;36m(EngineCore_DP0 pid=1851)[0;0m INFO 11-09 19:28:39 [gpu_model_runner.py:3480] Graph capturing finished in 6 secs, took 0.66 GiB
[1;36m(EngineCore_DP0 pid=1851)[0;0m INFO 11-09 19:28:39 [core.py:210] init engine (profile, create kv cache, warmup model) took 15.81 seconds
INFO 11-09 19:28:40 [llm.py:306] Supported_tasks: ['generate']




✓ Model loaded

Sampling params: temperature=0.0, max_tokens=20, seed=42

WARM-UP PASS

Running warm-up pass to initialize CUDA kernels...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

✓ Warm-up complete - CUDA kernels compiled and cached

RUNNING 5 FORWARD PASSES

Rep 1/5: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

20 tokens, mean logprob=-0.5974
Rep 2/5: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

20 tokens, mean logprob=-0.5974
Rep 3/5: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

20 tokens, mean logprob=-0.5974
Rep 4/5: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

20 tokens, mean logprob=-0.5974
Rep 5/5: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

20 tokens, mean logprob=-0.5974

First generation:
  Text: ' However, the development of these models has also raised ethical 
concerns, particularly around iss...'
  Tokens: [4354, 11, 279, 4401, 315, 1493, 4119, 702, 1083, 9226]...
  Logprobs (first 5): [-1.10141468e+00 -3.20667859e-05 -1.61608422e+00 -2.52417779e+00
 -4.36653823e-01]

REPRODUCIBILITY ANALYSIS

Token sequences identical: True

Logprobs bit-exact: True

VERDICT

✓ PERFECT REPRODUCIBILITY
  - Token sequences: bit-exact
  - Logprobs: bit-exact
  - vLLM is deterministic for this config
  → Ready to scale up to K2 Thinking on Blackwell

Results saved to: /workspace/vllm_logprobs_test_20251109_192845.json

TEST COMPLETE

If successful, next steps:
  1. Scale up to K2 Thinking
  2. Test on 4× B200 with tensor parallelism
  3. Run with 100K context

