In [4]:
#!/usr/bin/env python3
"""
vLLM Tensor Parallelism Test with Auto File Detection
Tests bit-exact reproducibility across multiple runs with TP
Automatically finds txt/pdf files in current directory
Validates prompt length before loading model weights
"""

import os
os.environ['HF_HOME'] = '/workspace/huggingface_cache'
os.environ['TRANSFORMERS_CACHE'] = '/workspace/huggingface_cache'

from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
import numpy as np
from datetime import datetime
import json
import torch
import glob

# ============================================================================
# CONFIGURATION
# ============================================================================

# Model configuration
# These values are forwarded to the vLLM `LLM(...)` constructor further down.
MODEL_NAME = "Qwen/Qwen3-30B-A3B-Thinking-2507"  # Change to "moonshotai/Kimi-K2-Thinking" for K2
TENSOR_PARALLEL_SIZE = 4  # passed as tensor_parallel_size
MAX_MODEL_LEN = 150000  # passed as max_model_len; also the limit used by the tokenizer precheck
GPU_MEMORY_UTILIZATION = 0.9  # passed as gpu_memory_utilization

# Generation configuration
# These values feed `SamplingParams(...)` and the repetition loop.
MAX_TOKENS = 20  # tokens generated per run (SamplingParams.max_tokens)
NUM_REPETITIONS = 5  # measured runs compared against each other; the warmup run is extra
TEMPERATURE = 0.0  # Greedy decoding
SEED = 42  # passed as SamplingParams.seed
TOP_LOGPROBS = 10  # logprobs requested per position and kept per position for comparison

# Prompt source - finds first txt or pdf in current directory
AUTO_FIND_FILE = True  # Set to False to use hardcoded content

# Chat template configuration
# Both strings are interpolated into the manually formatted PROMPT.
SYSTEM_PROMPT = "You are a helpful AI assistant."
USER_TASK = "Please provide a detailed summary of the following text."

# Hardcoded content (used if AUTO_FIND_FILE=False or no files found)
HARDCODED_CONTENT = """The development of large language models has fundamentally transformed natural language processing and artificial intelligence more broadly. These models, trained on vast corpora of text data, have demonstrated remarkable capabilities across a wide range of tasks, from translation and summarization to question answering and creative writing."""

# ============================================================================
# FILE LOADING UTILITIES
# ============================================================================

def find_prompt_file():
    """Find the prompt document in the current working directory.

    ``.txt`` files take priority over ``.pdf`` files. Within one extension
    the alphabetically first path is chosen so that the selected document
    does not depend on the (arbitrary) order in which ``glob`` lists
    directory entries.

    Returns:
        The absolute path of the selected file, or ``None`` when the
        directory contains no regular ``.txt`` or ``.pdf`` file.
    """
    # Escape the directory part so glob metacharacters in the cwd
    # (e.g. "[" or "*") are matched literally rather than as a pattern.
    cwd = glob.escape(os.getcwd())

    # Look for txt files first, then pdf
    for pattern in ("*.txt", "*.pdf"):
        candidates = sorted(
            path
            for path in glob.glob(os.path.join(cwd, pattern))
            if os.path.isfile(path)  # skip directories that merely look like documents
        )
        if candidates:
            return candidates[0]

    return None

def load_text_from_file(filepath):
    """Load the text content of a ``.txt`` or ``.pdf`` file.

    The extension is matched case-insensitively, so ``REPORT.PDF`` and
    ``notes.TXT`` are accepted as well. Progress information is printed
    to stdout.

    Args:
        filepath: Path of the file to read.

    Returns:
        The file's text. For PDFs, the extracted text of every page joined
        with newlines.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        ImportError: If a PDF is requested but PyPDF2 is not installed.
        ValueError: If the extension is neither ``.txt`` nor ``.pdf``.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: {filepath}")

    lowered = filepath.lower()

    if lowered.endswith('.txt'):
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
        print(f"Loaded {len(text)} characters from txt file")
        return text

    if lowered.endswith('.pdf'):
        # Imported lazily so plain-text prompts work without PyPDF2 installed.
        try:
            import PyPDF2
        except ImportError as err:
            raise ImportError(
                "PyPDF2 required for PDF loading. Install with: pip install PyPDF2"
            ) from err

        page_texts = []
        with open(filepath, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            num_pages = len(pdf_reader.pages)
            print(f"Loading {num_pages} pages from PDF...")

            for page_num, page in enumerate(pdf_reader.pages, 1):
                # NOTE(review): pages without extractable text (e.g. scanned
                # images) may yield None in some PyPDF2 versions; fall back
                # to "" so the join below cannot fail on a non-string.
                page_texts.append(page.extract_text() or "")
                if page_num % 10 == 0:
                    print(f"  Processed {page_num}/{num_pages} pages")

        full_text = '\n'.join(page_texts)
        print(f"Loaded {len(full_text)} characters from PDF ({num_pages} pages)")
        return full_text

    raise ValueError(f"Unsupported file type: {filepath}. Use .txt or .pdf")

# ============================================================================
# PROMPT LOADING
# ============================================================================

print("=" * 80)
print("vLLM LOGPROBS EXTRACTION TEST")
print("=" * 80)
print()

# Pick the document body: an auto-detected file when enabled and present,
# otherwise the hardcoded fallback text.
prompt_file = find_prompt_file() if AUTO_FIND_FILE else None
if prompt_file:
    print(f"Found file: {os.path.basename(prompt_file)}")
    DOCUMENT_CONTENT = load_text_from_file(prompt_file)
else:
    if AUTO_FIND_FILE:
        # Auto-detection was requested but nothing matched.
        print("No txt/pdf files found in current directory")
    print("Using hardcoded content")
    DOCUMENT_CONTENT = HARDCODED_CONTENT
print()

# Build the prompt by hand instead of going through llm.chat(), so the exact
# text does not depend on the tokenizer's chat template.
print("Applying manual chat template...")
PROMPT = f"System: {SYSTEM_PROMPT}\n\nUser: {USER_TASK}\n\n{DOCUMENT_CONTENT}"
print(f"Formatted prompt length: {len(PROMPT)} characters")
print()

# ============================================================================
# TOKENIZER PRECHECK
# ============================================================================

# Tokenize the prompt with the model's tokenizer BEFORE loading any weights,
# so an over-long prompt is rejected quickly instead of after the model load.
print("Loading tokenizer to validate prompt length...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    cache_dir='/workspace/huggingface_cache',
    trust_remote_code=True
)

# Tokenize prompt
prompt_tokens = tokenizer.encode(PROMPT)
prompt_length = len(prompt_tokens)

print(f"Prompt statistics:")
print(f"  Characters: {len(PROMPT):,}")
print(f"  Tokens: {prompt_length:,}")
print(f"  Max model length: {MAX_MODEL_LEN:,}")
print(f"  Generation tokens: {MAX_TOKENS}")
print(f"  Total required: {prompt_length + MAX_TOKENS:,}")
print()

# Validate length
if prompt_length > MAX_MODEL_LEN:
    print(f"❌ ERROR: Prompt is too long!")
    print(f"  Prompt has {prompt_length:,} tokens")
    print(f"  Model max is {MAX_MODEL_LEN:,} tokens")
    print(f"  Exceeds by {prompt_length - MAX_MODEL_LEN:,} tokens")
    print()
    print("Solutions:")
    print(f"  1. Increase MAX_MODEL_LEN to at least {prompt_length + MAX_TOKENS}")
    print(f"  2. Truncate/reduce the prompt")
    # Raise SystemExit directly rather than calling the `exit()` helper:
    # that helper is injected by the `site` module for interactive use and
    # is rebound by IPython/Jupyter, so it is not a reliable way to stop
    # execution here before the expensive model load.
    raise SystemExit(1)

if prompt_length + MAX_TOKENS > MAX_MODEL_LEN:
    # The prompt itself fits, but there is not enough room left for all
    # MAX_TOKENS generated tokens; warn and continue.
    print(f"⚠ WARNING: Prompt + generation may exceed context")
    print(f"  Prompt: {prompt_length:,} tokens")
    print(f"  Generation: {MAX_TOKENS} tokens")
    print(f"  Total: {prompt_length + MAX_TOKENS:,} tokens")
    print(f"  Model max: {MAX_MODEL_LEN:,} tokens")
    print(f"  Consider increasing MAX_MODEL_LEN to {prompt_length + MAX_TOKENS + 100}")
    print()
else:
    print(f"✓ Prompt length validation passed")
    print(f"  Remaining capacity: {MAX_MODEL_LEN - prompt_length - MAX_TOKENS:,} tokens")
    print()

# ============================================================================
# VLLM PROMPT TEMPLATE HANDLING
# ============================================================================

# Explain in the run log why the prompt is formatted by hand.
for note_line in (
    "Note on prompt formatting:",
    "  Using MANUAL chat template for reproducibility",
    "  Why not llm.chat():",
    "    - llm.chat() depends on tokenizer's chat template",
    "    - Templates can change between model/vLLM versions",
    "    - Creates confounding variable for forensics testing",
    "  Manual template gives:",
    "    - Full control over exact formatting",
    "    - Reproducible across environments",
    "    - No hidden dependencies",
):
    print(note_line)
print()

# ============================================================================
# MODEL LOADING
# ============================================================================

# Echo the effective configuration so the run log is self-describing.
print("Configuration:")
print(f"  Model: {MODEL_NAME}")
print(f"  Tensor parallel: {TENSOR_PARALLEL_SIZE}")
print(f"  Max model len: {MAX_MODEL_LEN:,}")
# Use percent formatting instead of int(x * 100): the latter truncates binary
# floating-point error (e.g. 0.29 * 100 == 28.999... would print as 28%).
print(f"  GPU memory utilization: {GPU_MEMORY_UTILIZATION} ({GPU_MEMORY_UTILIZATION:.0%})")
print(f"  Max tokens: {MAX_TOKENS}")
print(f"  Repetitions: {NUM_REPETITIONS}")
print(f"  Temperature: {TEMPERATURE} (greedy)")
print(f"  Top logprobs: {TOP_LOGPROBS}")
print()

print("GPU Info:")
print(f"  Device: {torch.cuda.get_device_name(0)}")
print(f"  Available: {torch.cuda.device_count()} GPUs")
print()

print("Loading model with vLLM...")
llm = LLM(
    model=MODEL_NAME,
    trust_remote_code=True,
    download_dir='/workspace/huggingface_cache',
    dtype='bfloat16',
    max_model_len=MAX_MODEL_LEN,
    tensor_parallel_size=TENSOR_PARALLEL_SIZE,
    # Prefix caching is switched off so every repetition processes the full
    # prompt rather than reusing cached state from an earlier run.
    enable_prefix_caching=False,
    gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
    disable_log_stats=True
)
print("Model loaded successfully!")
print()

# Sampling parameters
# The same object is used for the warmup and for every measured repetition.
sampling_params = SamplingParams(
    temperature=TEMPERATURE,
    max_tokens=MAX_TOKENS,
    seed=SEED,
    logprobs=TOP_LOGPROBS,
    prompt_logprobs=None
)

# Warmup run
# Its output is not part of the comparison; only the repetitions below are.
print("Running warmup...")
warmup_output = llm.generate([PROMPT], sampling_params=sampling_params)
print("Warmup complete")
print()

# ============================================================================
# TEST RUNS
# ============================================================================

print("=" * 80)
print(f"Running {NUM_REPETITIONS} test repetitions")
print("=" * 80)
print()

# One entry per repetition in each of these lists.
results_tokens = []
results_logprobs = []
results_texts = []
results_distributions = []

for run_number in range(1, NUM_REPETITIONS + 1):
    print(f"Repetition {run_number}/{NUM_REPETITIONS}...")

    # A single prompt is submitted, so only the first request output and its
    # first completion are inspected.
    completion = llm.generate([PROMPT], sampling_params=sampling_params)[0].outputs[0]

    # Generated token IDs and decoded text
    generated_ids = completion.token_ids
    results_tokens.append(generated_ids)
    results_texts.append(completion.text)

    # Logprob of the token that was actually emitted at each position
    per_position = completion.logprobs
    chosen = []
    for idx, candidates in enumerate(per_position):
        chosen.append(candidates[generated_ids[idx]].logprob)
    results_logprobs.append(np.array(chosen))

    # Per position: (token, logprob) pairs ordered by descending logprob and
    # capped at TOP_LOGPROBS entries
    run_distributions = []
    for candidates in per_position:
        ranked = sorted(
            candidates.items(),
            key=lambda item: item[1].logprob,
            reverse=True,
        )
        run_distributions.append(
            [(tok, entry.logprob) for tok, entry in ranked[:TOP_LOGPROBS]]
        )
    results_distributions.append(run_distributions)

    print(f"  Generated {len(generated_ids)} tokens")

print()
print("All repetitions complete!")
print()

# ============================================================================
# ANALYSIS
# ============================================================================

print("=" * 80)
print("ANALYSIS")
print("=" * 80)
print()

# Every repetition is compared against repetition 0. All comparisons use
# exact equality: the results are reported as "bit-exact", so no numerical
# tolerance is applied. Runs of different length count as a mismatch instead
# of raising a shape/index error.

# Check token sequence identity
print("Checking token sequences...")
tokens_identical = all(
    results_tokens[0] == other_tokens
    for other_tokens in results_tokens[1:]
)
print(f"Token sequences identical: {tokens_identical}")

if not tokens_identical:
    print("\n⚠ Token sequences differ!")
    for i, other_tokens in enumerate(results_tokens[1:], start=1):
        if results_tokens[0] != other_tokens:
            # Positions are compared over the common prefix only.
            diff_positions = [
                j for j, (tok_a, tok_b) in enumerate(zip(results_tokens[0], other_tokens))
                if tok_a != tok_b
            ]
            print(f"  Rep 0 vs Rep {i}: {len(diff_positions)} positions differ")
            if diff_positions:
                print(f"    First difference at position {diff_positions[0]}")
            if len(results_tokens[0]) != len(other_tokens):
                print(
                    f"    Lengths differ: {len(results_tokens[0])} vs "
                    f"{len(other_tokens)} tokens"
                )

# Check logprobs for selected tokens
print("\nChecking selected token logprobs...")
first_logprobs = results_logprobs[0]
# np.array_equal is False for arrays of different shape, so runs that
# generated a different number of tokens are reported as not exact.
logprobs_exact = all(
    np.array_equal(first_logprobs, other_logprobs)
    for other_logprobs in results_logprobs[1:]
)
print(f"Selected token logprobs bit-exact: {logprobs_exact}")

# Check top-k distributions
print("\nChecking full top-k distributions...")
distribution_mismatches = []

first_dist = results_distributions[0]
for rep_idx in range(1, len(results_distributions)):
    other_dist = results_distributions[rep_idx]
    for pos_idx in range(max(len(first_dist), len(other_dist))):
        # A position that exists in only one of the two runs is a mismatch.
        if pos_idx >= len(first_dist) or pos_idx >= len(other_dist):
            distribution_mismatches.append((rep_idx, pos_idx))
        # Each entry is a list of (token, logprob) pairs, so list equality
        # checks token IDs, their order, and exact logprob values at once.
        elif first_dist[pos_idx] != other_dist[pos_idx]:
            distribution_mismatches.append((rep_idx, pos_idx))

distributions_exact = not distribution_mismatches
print(f"Top-k distributions bit-exact: {distributions_exact}")

if not distributions_exact:
    print(f"\n⚠ Found {len(distribution_mismatches)} position mismatches in distributions")
    if len(distribution_mismatches) <= 5:
        for rep_idx, pos_idx in distribution_mismatches:
            print(f"  Rep 0 vs Rep {rep_idx}, position {pos_idx}")
    else:
        print(f"  First 5: {distribution_mismatches[:5]}")

if not logprobs_exact:
    # Distances and statistics are computed over the prefix common to all
    # runs so that runs of different length can still be compared.
    common_len = min(len(lp) for lp in results_logprobs)

    print("\nL2 distances:")
    l2_distances = []
    for i, other_logprobs in enumerate(results_logprobs[1:], start=1):
        l2 = np.linalg.norm(first_logprobs[:common_len] - other_logprobs[:common_len])
        l2_distances.append(l2)
        print(f"  Rep 0 vs Rep {i}: L2 = {l2:.6e}")

    print(f"\nMax L2: {max(l2_distances):.6e}")
    print(f"Mean L2: {np.mean(l2_distances):.6e}")

    # Element-wise statistics (skipped when there is no common position)
    if common_len > 0:
        all_logprobs = np.array([lp[:common_len] for lp in results_logprobs])
        std_per_token = all_logprobs.std(axis=0)
        print(f"\nPer-token std statistics:")
        print(f"  Mean: {std_per_token.mean():.6e}")
        print(f"  Max: {std_per_token.max():.6e}")
        print(f"  Median: {np.median(std_per_token):.6e}")

print()

# ============================================================================
# VERDICT
# ============================================================================

print("=" * 80)
print("VERDICT")
print("=" * 80)
print()

# Classify the run from the three flags computed by the analysis step,
# starting with the most severe outcome.
if not tokens_identical:
    print("✗ TOKEN SEQUENCES DIFFER")
    print("  - This should NOT happen with temperature=0")
    print("  → Something is wrong, investigate")
elif not logprobs_exact:
    print("⚠ TOKENS IDENTICAL, LOGPROBS VARY")
    print("  - Token sequences: bit-exact")
    print("  - Logprobs: small numerical variation")
    # l2_distances is populated by the analysis step whenever the selected
    # logprobs are not exact, which is the case in this branch.
    max_l2 = max(l2_distances)
    if max_l2 < 1e-6:
        print(f"  - Variation very small (L2={max_l2:.2e})")
        print("  → Likely acceptable for forensics")
    else:
        print(f"  - Variation notable (L2={max_l2:.2e})")
        print("  → Investigate noise source")
elif distributions_exact:
    print("✓ PERFECT REPRODUCIBILITY")
    print("  - Token sequences: bit-exact")
    print("  - Selected token logprobs: bit-exact")
    print("  - Full top-k distributions: bit-exact")
    print("  - vLLM is deterministic for this config")
else:
    print("⚠ SELECTED TOKENS EXACT, DISTRIBUTIONS VARY")
    print("  - Token sequences: bit-exact")
    print("  - Selected token logprobs: bit-exact")
    print("  - Top-k distributions: numerical variation detected")
    print("  → May indicate computational instability in non-selected paths")

print()

# ============================================================================
# SAVE RESULTS
# ============================================================================

# Assemble the JSON report piece by piece; key order matches the layout of
# the written file.
run_timestamp = datetime.now().isoformat()

config_section = {
    "model": MODEL_NAME,
    "tensor_parallel": TENSOR_PARALLEL_SIZE,
    "max_model_len": MAX_MODEL_LEN,
    "max_tokens": MAX_TOKENS,
    "repetitions": NUM_REPETITIONS,
    "temperature": TEMPERATURE,
    "seed": SEED,
    "warmup_enabled": True,
    "prefix_caching_disabled": True,
    "top_logprobs": TOP_LOGPROBS
}

results_section = {
    "tokens_identical": tokens_identical,
    "logprobs_exact": logprobs_exact,
    "distributions_exact": distributions_exact,
    "perfect_reproducibility": tokens_identical and logprobs_exact and distributions_exact
}

# Convert token ids and logprobs to plain int/float so json can encode them.
serialized_distributions = []
for rep_dists in results_distributions:
    serialized_rep = []
    for dist in rep_dists:
        serialized_rep.append([(int(tok), float(prob)) for tok, prob in dist])
    serialized_distributions.append(serialized_rep)

output_data = {
    "experiment": "vllm_logprobs_test",
    "timestamp": run_timestamp,
    "prompt_source": "file" if prompt_file else "hardcoded",
    "prompt_file": os.path.basename(prompt_file) if prompt_file else None,
    "prompt_length_chars": len(PROMPT),
    "prompt_length_tokens": prompt_length,
    "config": config_section,
    "results": results_section,
    "token_sequences": results_tokens,
    "logprobs_vectors": [lp.tolist() for lp in results_logprobs],
    "generated_texts": results_texts,
    "top_k_distributions": serialized_distributions
}

# The file name carries its own timestamp, taken at write time.
file_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_file = f"vllm_logprobs_test_{file_stamp}.json"
with open(output_file, "w") as f:
    json.dump(output_data, f, indent=2)

print(f"Results saved to: {output_file}")
print()
print("=" * 80)
print("TEST COMPLETE")
print("=" * 80)
print()

vLLM LOGPROBS EXTRACTION TEST

Found file: Verification-for-International-AI-Governance.pdf
Loading 172 pages from PDF...
  Processed 10/172 pages
  Processed 20/172 pages
  Processed 30/172 pages
  Processed 40/172 pages
  Processed 50/172 pages
  Processed 60/172 pages
  Processed 70/172 pages
  Processed 80/172 pages
  Processed 90/172 pages
  Processed 100/172 pages
  Processed 110/172 pages
  Processed 120/172 pages
  Processed 130/172 pages
  Processed 140/172 pages
  Processed 150/172 pages
  Processed 160/172 pages
  Processed 170/172 pages
Loaded 535619 characters from PDF (172 pages)

Applying manual chat template...
Formatted prompt length: 535724 characters

Loading tokenizer to validate prompt length...
Prompt statistics:
  Characters: 535,724
  Tokens: 120,413
  Max model length: 150,000
  Generation tokens: 20
  Total required: 120,433

✓ Prompt length validation passed
  Remaining capacity: 29,567 tokens

Note on prompt formatting:
  Using MANUAL chat template for repro

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 11-09 22:36:57 [model.py:547] Resolved architecture: Qwen3MoeForCausalLM


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 11-09 22:36:57 [model.py:1510] Using max model len 150000
INFO 11-09 22:37:01 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=8192.




INFO 11-09 22:37:07 [__init__.py:216] Automatically detected platform cuda.
[1;36m(EngineCore_DP0 pid=1095)[0;0m INFO 11-09 22:37:08 [core.py:644] Waiting for init message from front-end.
[1;36m(EngineCore_DP0 pid=1095)[0;0m INFO 11-09 22:37:08 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='Qwen/Qwen3-30B-A3B-Thinking-2507', speculative_config=None, tokenizer='Qwen/Qwen3-30B-A3B-Thinking-2507', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=150000, download_dir='/workspace/huggingface_cache', load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_par



INFO 11-09 22:37:12 [__init__.py:216] Automatically detected platform cuda.
INFO 11-09 22:37:12 [__init__.py:216] Automatically detected platform cuda.
INFO 11-09 22:37:12 [__init__.py:216] Automatically detected platform cuda.
INFO 11-09 22:37:12 [__init__.py:216] Automatically detected platform cuda.
INFO 11-09 22:37:17 [shm_broadcast.py:289] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_b6f86f1e'), local_subscribe_addr='ipc:///tmp/233fb4a6-6e93-4099-bb24-913f0610bad3', remote_subscribe_addr=None, remote_addr_ipv6=False)
INFO 11-09 22:37:17 [shm_broadcast.py:289] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_02aa2d44'), local_subscribe_addr='ipc:///tmp/50fc94bc-55e6-4d71-8ee6-a6b0df6e6925', remote_subscribe_addr=None, remote_addr_ipv6=False)
INFO 11-09 22:37:17 [shm_broadcast.py:289] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffe

Loading safetensors checkpoint shards:   0% Completed | 0/16 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:   6% Completed | 1/16 [00:05<01:23,  5.54s/it]
Loading safetensors checkpoint shards:  12% Completed | 2/16 [00:34<04:28, 19.14s/it]
Loading safetensors checkpoint shards:  19% Completed | 3/16 [01:02<05:01, 23.21s/it]
Loading safetensors checkpoint shards:  25% Completed | 4/16 [01:33<05:17, 26.50s/it]
Loading safetensors checkpoint shards:  31% Completed | 5/16 [02:06<05:17, 28.82s/it]
Loading safetensors checkpoint shards:  38% Completed | 6/16 [02:37<04:53, 29.39s/it]
Loading safetensors checkpoint shards:  44% Completed | 7/16 [03:05<04:20, 28.99s/it]
Loading safetensors checkpoint shards:  50% Completed | 8/16 [03:33<03:49, 28.70s/it]
Loading safetensors checkpoint shards:  56% Completed | 9/16 [03:59<03:15, 27.91s/it]
Loading safetensors checkpoint shards:  62% Completed | 10/16 [04:28<02:48, 28.09s/it]
Loading safetensors checkpoint shards:  69% Completed | 11/16

[1;36m(Worker_TP0 pid=1228)[0;0m INFO 11-09 22:44:40 [default_loader.py:267] Loading weights took 437.73 seconds
[1;36m(Worker_TP1 pid=1229)[0;0m INFO 11-09 22:44:40 [default_loader.py:267] Loading weights took 438.84 seconds
[1;36m(Worker_TP3 pid=1231)[0;0m INFO 11-09 22:44:40 [default_loader.py:267] Loading weights took 438.62 seconds
[1;36m(Worker_TP2 pid=1230)[0;0m INFO 11-09 22:44:40 [default_loader.py:267] Loading weights took 438.34 seconds
[1;36m(Worker_TP3 pid=1231)[0;0m INFO 11-09 22:44:40 [gpu_model_runner.py:2653] Model loading took 14.3001 GiB and 439.467982 seconds
[1;36m(Worker_TP2 pid=1230)[0;0m INFO 11-09 22:44:40 [gpu_model_runner.py:2653] Model loading took 14.3001 GiB and 439.653293 seconds
[1;36m(Worker_TP1 pid=1229)[0;0m INFO 11-09 22:44:40 [gpu_model_runner.py:2653] Model loading took 14.3001 GiB and 439.665687 seconds
[1;36m(Worker_TP0 pid=1228)[0;0m INFO 11-09 22:44:41 [gpu_model_runner.py:2653] Model loading took 14.3001 GiB and 439.532100 seco

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 67/67 [00:17<00:00,  3.92it/s]
Capturing CUDA graphs (decode, FULL):  71%|███████▏  | 25/35 [00:03<00:01,  7.47it/s]

[1;36m(Worker_TP2 pid=1230)[0;0m INFO 11-09 22:46:14 [custom_all_reduce.py:203] Registering 9894 cuda graph addresses
[1;36m(Worker_TP1 pid=1229)[0;0m INFO 11-09 22:46:15 [custom_all_reduce.py:203] Registering 9894 cuda graph addresses


Capturing CUDA graphs (decode, FULL):  77%|███████▋  | 27/35 [00:03<00:01,  7.59it/s]

[1;36m(Worker_TP3 pid=1231)[0;0m INFO 11-09 22:46:15 [custom_all_reduce.py:203] Registering 9894 cuda graph addresses


Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:04<00:00,  7.94it/s]


[1;36m(Worker_TP0 pid=1228)[0;0m INFO 11-09 22:46:16 [custom_all_reduce.py:203] Registering 9894 cuda graph addresses
[1;36m(Worker_TP3 pid=1231)[0;0m INFO 11-09 22:46:16 [gpu_model_runner.py:3480] Graph capturing finished in 23 secs, took 1.47 GiB
[1;36m(Worker_TP1 pid=1229)[0;0m INFO 11-09 22:46:16 [gpu_model_runner.py:3480] Graph capturing finished in 23 secs, took 1.47 GiB
[1;36m(Worker_TP2 pid=1230)[0;0m INFO 11-09 22:46:17 [gpu_model_runner.py:3480] Graph capturing finished in 23 secs, took 1.47 GiB
[1;36m(Worker_TP0 pid=1228)[0;0m INFO 11-09 22:46:17 [gpu_model_runner.py:3480] Graph capturing finished in 23 secs, took 1.47 GiB
[1;36m(EngineCore_DP0 pid=1095)[0;0m INFO 11-09 22:46:17 [core.py:210] init engine (profile, create kv cache, warmup model) took 96.09 seconds
INFO 11-09 22:46:19 [llm.py:306] Supported_tasks: ['generate']
Model loaded successfully!

Running warmup...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Warmup complete

Running 5 test repetitions

Repetition 1/5...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Generated 20 tokens
Repetition 2/5...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Generated 20 tokens
Repetition 3/5...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Generated 20 tokens
Repetition 4/5...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Generated 20 tokens
Repetition 5/5...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Generated 20 tokens

All repetitions complete!

ANALYSIS

Checking token sequences...
Token sequences identical: True

Checking selected token logprobs...
Selected token logprobs bit-exact: True

Checking full top-k distributions...
Top-k distributions bit-exact: True

VERDICT

✓ PERFECT REPRODUCIBILITY
  - Token sequences: bit-exact
  - Selected token logprobs: bit-exact
  - Full top-k distributions: bit-exact
  - vLLM is deterministic for this config

Results saved to: vllm_logprobs_test_20251109_224725.json

TEST COMPLETE

