In [None]:
#!/usr/bin/env python3
"""
Tensor Parallelism (TP) detectability experiment using vLLM.
Compares tensor_parallel=1, 2, 4 on dense model in BF16.
Prefill-only, logprobs-only.
"""

import os
os.environ['HF_HOME'] = '/workspace/huggingface_cache'
os.environ['TRANSFORMERS_CACHE'] = '/workspace/huggingface_cache'

from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import destroy_model_parallel
from transformers import AutoTokenizer
import torch
import numpy as np
import json
import socket
import platform
import sys
import glob
from datetime import datetime
import PyPDF2
import gc

# ============================================================================
# CONFIGURATION
# ============================================================================

# Hugging Face model identifier; used for both the tokenizer and the vLLM engine.
MODEL_NAME = "Qwen/Qwen3-14B"
# Passed to vLLM as download_dir (same path as HF_HOME set at the top of the file).
CACHE_DIR = '/workspace/huggingface_cache'

# Mode: False = run locally and save, True = load reference and compare cross-hardware
CROSS_HARDWARE_MODE = False
# JSON results file from a previous local run; only read when CROSS_HARDWARE_MODE is True.
REFERENCE_FILE = "/workspace/tp_reference.json"

# One model load per entry; the first entry supplies the canonical token IDs in the analysis.
TP_SIZES = [1, 2, 4]  # Tensor parallel sizes to compare

# Number of document tokens placed in the excerpt of each prompt.
TOKENS_PER_SLICE = 10000
# Number of prompts built from consecutive, non-overlapping document slices.
NUM_REFERENCES = 3
# Passed to SamplingParams as both prompt_logprobs and logprobs.
TOP_K_LOGPROBS = 5

# TP pairs whose averaged L2 logprob distance is below this value are reported as equivalent.
EQUIVALENCE_THRESHOLD = 1e-9

# Inserted into the system turn of the hand-built chat template in create_prompts.
SYSTEM_PROMPT = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."

# ============================================================================
# LOGGING
# ============================================================================

# Open handle that log_print mirrors output into; None while logging is off.
LOG_FILE = None

def setup_logging(output_dir='/workspace/experiments'):
    """Open a timestamped log file and make it the mirror target of log_print.

    Args:
        output_dir: Directory that receives the log file; created if missing.

    Returns:
        The path of the newly opened log file.
    """
    global LOG_FILE
    # Release a handle left over from an earlier call so it is not leaked.
    if LOG_FILE is not None:
        LOG_FILE.close()
        LOG_FILE = None
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_path = os.path.join(output_dir, f"tp_experiment_{timestamp}.txt")
    # Explicit UTF-8: the experiment logs non-ASCII symbols (✓, ✗, ×, ≈), which
    # would raise UnicodeEncodeError under an ASCII or Latin-1 default locale.
    LOG_FILE = open(log_path, 'w', encoding='utf-8')
    return log_path

def log_print(*args, **kwargs):
    """Print like the built-in print and mirror the same text into the log file.

    The first print honours every keyword argument, including ``file``. When a
    log file is open, the message is printed a second time into it with the
    caller's ``file`` argument dropped, and the log is flushed immediately.
    """
    print(*args, **kwargs)
    if not LOG_FILE:
        return
    mirrored = dict(kwargs)
    mirrored.pop('file', None)
    print(*args, file=LOG_FILE, **mirrored)
    LOG_FILE.flush()

def close_logging():
    """Close the active log file, if any, and mark logging as off."""
    global LOG_FILE
    if not LOG_FILE:
        return
    LOG_FILE.close()
    LOG_FILE = None

# ============================================================================
# PDF LOADING
# ============================================================================

def load_pdf_text(pdf_path):
    """Return the extracted text of all pages of a PDF as one string.

    Pages whose extraction yields nothing are skipped. The remaining page
    texts are separated by a single space, and leading/trailing whitespace is
    stripped from the result.
    """
    with open(pdf_path, 'rb') as pdf_file:
        page_texts = [page.extract_text() for page in PyPDF2.PdfReader(pdf_file).pages]
    non_empty = [chunk for chunk in page_texts if chunk]
    return " ".join(non_empty).strip()

def create_prompts(tokenizer, num_references=NUM_REFERENCES):
    """Build chat prompts around consecutive token slices of the local PDFs.

    PDFs are looked up in /workspace first and in the current directory as a
    fallback. Their text is concatenated and tokenized once, then cut into
    ``num_references`` non-overlapping slices of TOKENS_PER_SLICE tokens. Each
    slice is wrapped between a tokenized chat prefix and suffix.

    Returns:
        (prompts, prompt_texts): the prompts as token-id lists, and the same
        prompts decoded back to text.

    Raises:
        FileNotFoundError: no PDF was found in either location.
        ValueError: the PDFs hold fewer tokens than the slices require.
    """
    pdf_files = sorted(glob.glob("/workspace/*.pdf")) or sorted(glob.glob("*.pdf"))
    if not pdf_files:
        raise FileNotFoundError("No PDF files found")

    log_print(f"Found {len(pdf_files)} PDF(s)")
    for path in pdf_files:
        log_print(f"  Loading: {path}")

    all_text = "".join(load_pdf_text(path) + " " for path in pdf_files)

    content_tokens = tokenizer.encode(all_text)
    log_print(f"Total source tokens: {len(content_tokens)}")

    required = num_references * TOKENS_PER_SLICE
    if len(content_tokens) < required:
        raise ValueError(f"Need {required} tokens but only have {len(content_tokens)}")

    # The excerpt sits between an opening quote (end of prefix) and a closing
    # quote (start of suffix).
    prefix = f"""<|im_start|>system
{SYSTEM_PROMPT}<|im_end|>
<|im_start|>user
Here is an excerpt from a document:

\""""

    suffix = f""""

Based on this excerpt, what type of document do you think this is from, and what is its likely subject matter? Explain your reasoning.<|im_end|>
<|im_start|>assistant
"""

    prefix_tokens = tokenizer.encode(prefix)
    suffix_tokens = tokenizer.encode(suffix)

    total_len = len(prefix_tokens) + TOKENS_PER_SLICE + len(suffix_tokens)
    log_print(f"Prompt structure: {len(prefix_tokens)} prefix + {TOKENS_PER_SLICE} snippet + {len(suffix_tokens)} suffix = {total_len} tokens")

    prompts = []
    for index in range(num_references):
        offset = index * TOKENS_PER_SLICE
        excerpt = content_tokens[offset:offset + TOKENS_PER_SLICE]
        prompts.append(prefix_tokens + excerpt + suffix_tokens)
    prompt_texts = [tokenizer.decode(token_ids) for token_ids in prompts]

    return prompts, prompt_texts

# ============================================================================
# SYSTEM INFO
# ============================================================================

def collect_system_info(tp_size=None):
    """Return a dict describing the host, library versions and GPUs.

    CUDA-related fields fall back to "N/A" (or 0 for the GPU count) when
    torch reports that CUDA is unavailable. ``tp_size`` is stored unchanged
    under "tensor_parallel_size".
    """
    import transformers

    if torch.cuda.is_available():
        cuda_version = torch.version.cuda
        cudnn_version = str(torch.backends.cudnn.version())
        gpu_name = torch.cuda.get_device_name(0)
        gpu_count = torch.cuda.device_count()
    else:
        cuda_version = cudnn_version = gpu_name = "N/A"
        gpu_count = 0

    try:
        import vllm
        vllm_version = vllm.__version__
    except (ImportError, AttributeError):
        vllm_version = "unknown"

    # Key order is kept stable because this dict is written to the results JSON.
    return {
        "hostname": socket.gethostname(),
        "platform": platform.platform(),
        "python_version": sys.version.split()[0],
        "torch_version": torch.__version__,
        "cuda_version": cuda_version,
        "cudnn_version": cudnn_version,
        "transformers_version": transformers.__version__,
        "numpy_version": np.__version__,
        "gpu_name": gpu_name,
        "gpu_count": gpu_count,
        "tensor_parallel_size": tp_size,
        "vllm_version": vllm_version,
    }

# ============================================================================
# PREFILL LOGPROB EXTRACTION
# ============================================================================

def extract_prefill_logprobs(output, positions=(-3, -2, -1)):
    """Extract prompt logprobs at the requested prompt positions.

    Args:
        output: Object exposing ``prompt_logprobs``: either None or a sequence
            with one entry per prompt position. Each entry is None or a
            mapping of token id to a logprob (a number, or an object with a
            ``logprob`` attribute).
        positions: Prompt indices to read. Negative values count from the
            end. Indices outside the prompt and positions whose entry is None
            are skipped.

    Returns:
        Dict keyed by ``"pos_<position>"`` (the position as requested, not the
        resolved index). Each value is
        ``{'logprobs': {'token_ids': [...], 'log_probs': [...]}}`` in the
        mapping's iteration order. Empty when ``prompt_logprobs`` is None.
    """
    signals = {}

    prompt_logprobs = output.prompt_logprobs
    if prompt_logprobs is None:
        return signals

    num_positions = len(prompt_logprobs)

    for pos in positions:
        actual_idx = pos if pos >= 0 else num_positions + pos
        if not 0 <= actual_idx < num_positions:
            continue

        token_logprobs = prompt_logprobs[actual_idx]
        if token_logprobs is None:
            continue

        # Accept both rich logprob objects and bare numbers.
        log_probs = [
            entry.logprob if hasattr(entry, 'logprob') else float(entry)
            for entry in token_logprobs.values()
        ]

        signals[f"pos_{pos}"] = {
            'logprobs': {
                'token_ids': list(token_logprobs),
                'log_probs': log_probs
            }
        }

    return signals

def run_prefill(llm, prompt_text):
    """Run one request through the engine and return its prefill logprob signals.

    A single token is generated (max_tokens=1); only the prompt logprobs at
    the last three prompt positions are kept in the result.
    """
    params = SamplingParams(
        max_tokens=1,
        prompt_logprobs=TOP_K_LOGPROBS,
        logprobs=TOP_K_LOGPROBS,
    )
    first_output = llm.generate([prompt_text], params)[0]
    return {
        'prefill_signals': extract_prefill_logprobs(first_output, positions=[-3, -2, -1])
    }

def test_reproducibility(llm, prompt_text, num_runs=5):
    """
    Test that the model produces deterministic outputs.

    Runs the same prompt ``num_runs`` times and compares every run's prefill
    logprobs against the first run.

    Args:
        llm: Engine passed through to run_prefill.
        prompt_text: Prompt to evaluate repeatedly.
        num_runs: Number of repetitions.

    Returns:
        True when every later run has the same positions and token ids as the
        first run, with logprobs within EQUIVALENCE_THRESHOLD. False otherwise.
        Also False when several runs were made but no logprobs were captured,
        because nothing could be verified.
    """
    log_print("\n" + "="*80)
    log_print("REPRODUCIBILITY TEST")
    log_print("="*80)
    log_print(f"Running same prompt {num_runs} times...")

    results = []
    for i in range(num_runs):
        results.append(run_prefill(llm, prompt_text))
        log_print(f"  Run {i+1}: done")
        # Clear any accumulated cache
        gc.collect()
        torch.cuda.empty_cache()

    # Compare all runs to the first
    all_identical = True
    max_diffs_seen = []
    baseline = results[0]['prefill_signals'] if results else {}

    for i in range(1, num_runs):
        current = results[i]['prefill_signals']
        # Baseline order first, then any position only the later run captured.
        positions = list(baseline) + [p for p in current if p not in baseline]

        for pos in positions:
            if pos not in baseline or pos not in current:
                # A position captured in only one run is itself a mismatch
                # (it previously surfaced as a KeyError).
                log_print(f"  ✗ Run {i+1} {pos}: position captured in only one run")
                all_identical = False
                continue

            lp0 = baseline[pos]['logprobs']
            lpi = current[pos]['logprobs']

            # Compare token IDs
            if lp0['token_ids'] != lpi['token_ids']:
                log_print(f"  ✗ Run {i+1} {pos}: token_ids differ")
                all_identical = False
                continue

            # Compare log_probs values
            diffs = [abs(a - b) for a, b in zip(lp0['log_probs'], lpi['log_probs'])]
            max_diff = max(diffs) if diffs else 0
            max_diffs_seen.append(max_diff)

            # Same tolerance as the TP-equivalence check in the analysis.
            if max_diff > EQUIVALENCE_THRESHOLD:
                log_print(f"  ✗ Run {i+1} {pos}: max logprob diff = {max_diff:.2e}")
                all_identical = False

    if num_runs > 1 and not baseline and all_identical:
        # Without any captured logprobs a "pass" would be vacuous.
        log_print(f"\n⚠ REPRODUCIBILITY TEST FAILED")
        log_print(f"  No prompt logprobs were captured, so nothing could be compared")
        return False

    if all_identical:
        overall_max = max(max_diffs_seen) if max_diffs_seen else 0
        log_print(f"\n✓ REPRODUCIBILITY TEST PASSED")
        log_print(f"  All {num_runs} runs identical")
        log_print(f"  Max logprob difference: {overall_max:.2e}")
    else:
        log_print(f"\n⚠ REPRODUCIBILITY TEST FAILED")
        log_print(f"  Non-deterministic outputs detected")
        log_print(f"  Results may vary between runs even with same TP setting")

    return all_identical

# ============================================================================
# DISTANCE METRICS
# ============================================================================

def compute_logprob_distance_canonical(logprobs1, logprobs2, canonical_ids):
    """
    Return the L2 distance between two logprob sets over canonical token IDs.

    Only canonical IDs present in both sets contribute. When none are shared
    the distance is reported as infinity.
    """
    first = dict(zip(logprobs1['token_ids'], logprobs1['log_probs']))
    second = dict(zip(logprobs2['token_ids'], logprobs2['log_probs']))

    deltas = [
        first[token_id] - second[token_id]
        for token_id in canonical_ids
        if token_id in first and token_id in second
    ]
    if not deltas:
        return float('inf')

    return float(np.linalg.norm(np.array(deltas)))

def compare_signals_with_canonical_ids(signals1, signals2, canonical_token_ids, top_k=5):
    """Compare two signal dicts using pre-specified canonical token IDs.

    Args:
        signals1: Mapping of position label to ``{'logprobs': {...}}``.
        signals2: Same structure, from the run being compared.
        canonical_token_ids: Mapping of position label to the token ids to
            compare at that position. Positions missing from it fall back to
            the token ids of ``signals1``.
        top_k: Number of leading canonical token ids used per position.

    Returns:
        ``{'logprobs_mean': mean L2 distance over the common positions}``.
        When the two inputs share no position the mean is infinity, not 0.0:
        with nothing compared, reporting a perfect match would let the caller
        classify the two runs as equivalent.
    """
    # Sorted so the accumulation order (and thus the float mean) does not
    # depend on string-hash randomisation of the set.
    common_positions = sorted(set(signals1) & set(signals2))

    all_dists = []
    for pos_label in common_positions:
        lp1 = signals1[pos_label]['logprobs']
        lp2 = signals2[pos_label]['logprobs']

        canonical_ids = canonical_token_ids.get(pos_label, lp1['token_ids'])[:top_k]
        all_dists.append(
            compute_logprob_distance_canonical(lp1, lp2, canonical_ids)
        )

    return {
        'logprobs_mean': np.mean(all_dists) if all_dists else float('inf')
    }

# ============================================================================
# ANALYSIS
# ============================================================================

def analyze_results(measurements, tp_sizes):
    """Analyze prefill logprob differences across TP sizes (within-hardware).

    Args:
        measurements: Mapping of TP size to a list of run_prefill results, one
            per reference prompt.
        tp_sizes: TP sizes to compare. The first entry supplies the canonical
            token ids used for every comparison.

    Returns:
        Dict with the averaged distance matrix, the per-reference matrices,
        the mean diagonal and off-diagonal distance, the SNR (None when
        infinite) and the TP pairs whose averaged distance is below
        EQUIVALENCE_THRESHOLD.

    Raises:
        ValueError: some TP size has no measurements.
    """
    log_print("\n" + "="*80)
    log_print("WITHIN-HARDWARE TP COMPARISON (PREFILL)")
    log_print("="*80)

    n = len(tp_sizes)
    all_matrices = []

    # Use TP=1 (first) as canonical for token IDs
    canonical_tp = tp_sizes[0]

    # Iterate over the references actually measured. The prompt count can come
    # from a reference file and differ from the NUM_REFERENCES constant.
    num_refs = min(len(measurements[tp]) for tp in tp_sizes)
    if num_refs == 0:
        raise ValueError("No measurements available to analyze")

    header = "       " + " ".join(f"TP={tp:>3}" for tp in tp_sizes)

    def _log_matrix(mat):
        # Print one labelled distance matrix, one row per TP size.
        log_print(f"\nLogprobs (L2 distance):")
        log_print(header)
        for i, tp in enumerate(tp_sizes):
            row = f"TP={tp:<3}" + " ".join(f"  {mat[i,j]:6.2e}" for j in range(n))
            log_print(row)

    for ref_idx in range(num_refs):
        log_print(f"\n--- ref_{ref_idx} ---")

        matrix = np.zeros((n, n))

        # Get canonical token IDs from first TP size
        canonical_signals = measurements[canonical_tp][ref_idx]['prefill_signals']
        canonical_token_ids = {
            pos_label: pos_data['logprobs']['token_ids']
            for pos_label, pos_data in canonical_signals.items()
        }

        for i, tp_i in enumerate(tp_sizes):
            for j, tp_j in enumerate(tp_sizes):
                if i == j:
                    # Same run compared with itself: zero by definition.
                    continue
                sig_i = measurements[tp_i][ref_idx]['prefill_signals']
                sig_j = measurements[tp_j][ref_idx]['prefill_signals']

                distances = compare_signals_with_canonical_ids(sig_i, sig_j, canonical_token_ids)
                matrix[i, j] = distances['logprobs_mean']

        _log_matrix(matrix)
        all_matrices.append(matrix)

    # Aggregate
    avg_matrix = np.mean(all_matrices, axis=0)

    log_print(f"\n{'='*80}")
    log_print("AGGREGATE (average across references):")
    log_print("="*80)

    _log_matrix(avg_matrix)

    # Off-diagonal stats
    off_diag = [avg_matrix[i, j] for i in range(n) for j in range(n) if i != j]

    diagonal = np.mean([avg_matrix[i, i] for i in range(n)])
    off_diagonal = np.mean(off_diag)

    log_print(f"\nDiagonal (same TP): {diagonal:.2e}")
    log_print(f"Off-diagonal (different TP): {off_diagonal:.2e}")

    if diagonal > 0:
        snr = off_diagonal / diagonal
        log_print(f"SNR: {snr:.2f}×")
    else:
        snr = float('inf') if off_diagonal > 0 else 1.0
        log_print(f"SNR: {snr}")

    # Check equivalences
    equiv_pairs = [
        (tp_sizes[i], tp_sizes[j])
        for i in range(n)
        for j in range(i + 1, n)
        if avg_matrix[i, j] < EQUIVALENCE_THRESHOLD
    ]

    if equiv_pairs:
        log_print(f"\nEquivalent pairs:")
        for p in equiv_pairs:
            log_print(f"  TP={p[0]} ≈ TP={p[1]}")

    return {
        'matrix': avg_matrix.tolist(),
        'per_reference_matrices': [m.tolist() for m in all_matrices],
        'diagonal': float(diagonal),
        'off_diagonal': float(off_diagonal),
        'snr': float(snr) if snr != float('inf') else None,
        'equivalent_pairs': equiv_pairs
    }


def analyze_cross_hardware(ref_measurements, ver_measurements, tp_sizes):
    """Analyze cross-hardware TP comparison.

    Args:
        ref_measurements: Reference measurements as loaded from JSON, keyed by
            the TP size as a string.
        ver_measurements: Measurements from this machine, keyed by the TP size
            as given in ``tp_sizes``.
        tp_sizes: TP sizes to compare. The first entry of the reference
            supplies the canonical token ids.

    Returns:
        Dict with the averaged distance matrix (rows = reference TP, columns =
        verifier TP), the per-reference matrices, the mean diagonal and
        off-diagonal distance, and the SNR (None when infinite).

    Raises:
        ValueError: a TP size has no measurements on either side.
    """
    log_print("\n" + "="*80)
    log_print("CROSS-HARDWARE TP COMPARISON (PREFILL)")
    log_print("="*80)
    log_print("  Rows = reference TP, Cols = verifier TP")

    n = len(tp_sizes)
    all_matrices = []

    # Use reference TP=1 as canonical for token IDs
    canonical_tp = str(tp_sizes[0])

    # Compare only references present on both sides. The prompt count comes
    # from the reference file and may differ from the NUM_REFERENCES constant.
    num_refs = min(
        min(len(ref_measurements[str(tp)]) for tp in tp_sizes),
        min(len(ver_measurements[tp]) for tp in tp_sizes),
    )
    if num_refs == 0:
        raise ValueError("No measurements available to analyze")

    header = "       " + " ".join(f"v_TP={tp:>2}" for tp in tp_sizes)

    def _log_matrix(mat):
        # Print one labelled distance matrix, one row per reference TP size.
        log_print(f"\nLogprobs (L2 distance):")
        log_print(header)
        for i, tp in enumerate(tp_sizes):
            row = f"r_TP={tp:<2}" + " ".join(f"  {mat[i,j]:6.2e}" for j in range(n))
            log_print(row)

    for ref_idx in range(num_refs):
        log_print(f"\n--- ref_{ref_idx} ---")

        matrix = np.zeros((n, n))

        # Get canonical token IDs from reference TP=1
        canonical_signals = ref_measurements[canonical_tp][ref_idx]['prefill_signals']
        canonical_token_ids = {
            pos_label: pos_data['logprobs']['token_ids']
            for pos_label, pos_data in canonical_signals.items()
        }

        for i, tp_ref in enumerate(tp_sizes):
            sig_ref = ref_measurements[str(tp_ref)][ref_idx]['prefill_signals']
            for j, tp_ver in enumerate(tp_sizes):
                sig_ver = ver_measurements[tp_ver][ref_idx]['prefill_signals']

                distances = compare_signals_with_canonical_ids(sig_ref, sig_ver, canonical_token_ids)
                matrix[i, j] = distances['logprobs_mean']

        _log_matrix(matrix)
        all_matrices.append(matrix)

    # Aggregate
    avg_matrix = np.mean(all_matrices, axis=0)

    log_print(f"\n{'='*80}")
    log_print("AGGREGATE (average across references):")
    log_print("="*80)

    _log_matrix(avg_matrix)

    # SNR calculation
    diagonal = np.mean([avg_matrix[i, i] for i in range(n)])
    off_diag = [avg_matrix[i, j] for i in range(n) for j in range(n) if i != j]
    off_diagonal = np.mean(off_diag)

    log_print(f"\n{'='*80}")
    log_print("SNR ANALYSIS")
    log_print("="*80)
    log_print(f"\nDiagonal (baseline = cross-hardware, same TP):")
    log_print(f"  Logprobs: {diagonal:.2e}")
    log_print(f"\nOff-diagonal (different TP):")
    # end="" so the SNR printed next lands on the same line.
    log_print(f"  Logprobs - Mean: {off_diagonal:.2e}", end="")

    if diagonal > 0:
        snr = off_diagonal / diagonal
        log_print(f", SNR: {snr:.2f}×")
    else:
        snr = float('inf') if off_diagonal > 0 else 1.0
        log_print(f", SNR: {snr}")

    return {
        'matrix': avg_matrix.tolist(),
        'per_reference_matrices': [m.tolist() for m in all_matrices],
        'diagonal': float(diagonal),
        'off_diagonal': float(off_diagonal),
        'snr': float(snr) if snr != float('inf') else None
    }

# ============================================================================
# MAIN
# ============================================================================

def _run_experiment():
    """Run the TP sweep, analyze it and write the JSON results file."""
    log_print("=" * 80)
    log_print("TENSOR PARALLELISM DETECTABILITY EXPERIMENT")
    log_print("=" * 80)

    system_info = collect_system_info()
    log_print(f"\nHostname: {system_info['hostname']}")
    log_print(f"GPUs: {system_info['gpu_count']}x {system_info['gpu_name']}")
    log_print(f"vLLM: {system_info['vllm_version']}")
    log_print(f"Model: {MODEL_NAME}")
    log_print(f"TP sizes to test: {TP_SIZES}")
    log_print(f"Mode: {'CROSS-HARDWARE' if CROSS_HARDWARE_MODE else 'LOCAL'}")

    # Load reference file first if in cross-hardware mode (to get prompts)
    reference = None
    if CROSS_HARDWARE_MODE:
        log_print(f"\nLoading reference: {REFERENCE_FILE}")
        # json.load already parses the Infinity/NaN tokens that json.dump
        # emits. Text-replacing "Infinity" beforehand would also rewrite that
        # word inside the stored prompt texts and change the prompts.
        with open(REFERENCE_FILE, 'r', encoding='utf-8') as f:
            reference = json.load(f)
        prompt_texts = reference['prompt_texts']
        log_print(f"Loaded {len(prompt_texts)} prompts from reference")
    else:
        # Use transformers tokenizer (much faster than loading vLLM)
        log_print("\nLoading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

        prompts, prompt_texts = create_prompts(tokenizer, NUM_REFERENCES)
        log_print(f"Created {len(prompts)} prompts")

    # Run for each TP size
    measurements = {}
    environments = {}
    reproducibility_passed = None

    for tp_size in TP_SIZES:
        log_print(f"\n{'='*80}")
        log_print(f"TENSOR PARALLEL = {tp_size}")
        log_print("="*80)

        log_print(f"Loading model with TP={tp_size}...")
        llm = LLM(
            model=MODEL_NAME,
            download_dir=CACHE_DIR,
            dtype="bfloat16",
            trust_remote_code=True,
            tensor_parallel_size=tp_size,
            gpu_memory_utilization=0.6,
            max_model_len=32768,
            enforce_eager=True,
        )

        # Run reproducibility test on first TP size
        if reproducibility_passed is None:
            reproducibility_passed = test_reproducibility(llm, prompt_texts[0], num_runs=5)

        environments[tp_size] = collect_system_info(tp_size)
        measurements[tp_size] = []

        for ref_idx, prompt_text in enumerate(prompt_texts):
            log_print(f"  ref_{ref_idx}: ", end="")

            result = run_prefill(llm, prompt_text)
            measurements[tp_size].append(result)

            num_positions = len(result['prefill_signals'])
            log_print(f"{num_positions} positions captured")

        # Cleanup so the next TP size starts from freed GPU memory
        del llm
        destroy_model_parallel()
        gc.collect()
        torch.cuda.empty_cache()

    # Within-hardware analysis
    within_analysis = analyze_results(measurements, TP_SIZES)

    output_dir = '/workspace/experiments'
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    if not CROSS_HARDWARE_MODE:
        # Save results for potential cross-hardware comparison later
        results = {
            'metadata': {
                'model': MODEL_NAME,
                'dtype': 'bfloat16',
                'tp_sizes': TP_SIZES,
                'tokens_per_slice': TOKENS_PER_SLICE,
                'num_references': NUM_REFERENCES,
                'top_k_logprobs': TOP_K_LOGPROBS,
                'reproducibility_test_passed': reproducibility_passed,
                'environments': {str(k): v for k, v in environments.items()},
                'timestamp': timestamp
            },
            'prompt_texts': prompt_texts,
            # JSON object keys must be strings, hence str(k).
            'measurements': {str(k): v for k, v in measurements.items()},
            'within_hardware_analysis': within_analysis
        }

        filepath = os.path.join(output_dir, f"tp_experiment_{timestamp}.json")

        with open(filepath, 'w') as f:
            json.dump(results, f, indent=2)

        log_print(f"\nSaved to: {filepath}")
        log_print(f"\nFor cross-hardware comparison:")
        log_print(f"  1. Copy {filepath} to second machine")
        log_print(f"  2. Set CROSS_HARDWARE_MODE = True")
        log_print(f"  3. Set REFERENCE_FILE = '<path to json>'")

    else:
        # Cross-hardware comparison (reference already loaded at start)
        log_print(f"\n{'='*80}")
        log_print("CROSS-HARDWARE COMPARISON")
        log_print("="*80)

        # Environment comparison, taken at the first configured TP size
        # instead of a hard-coded 1, so TP_SIZES need not contain 1.
        baseline_tp = TP_SIZES[0]
        ref_env = reference['metadata']['environments'][str(baseline_tp)]
        ver_env = environments[baseline_tp]

        log_print("\nEnvironment comparison:")
        log_print(f"  Reference GPU: {ref_env['gpu_name']}")
        log_print(f"  Verifier GPU:  {ver_env['gpu_name']}")

        check_fields = ['vllm_version', 'torch_version', 'cuda_version']
        for field in check_fields:
            ref_val = ref_env.get(field, 'N/A')
            ver_val = ver_env.get(field, 'N/A')
            match = "✓" if ref_val == ver_val else "✗"
            log_print(f"  {field}: {ref_val} vs {ver_val} {match}")

        ref_measurements = reference['measurements']

        cross_analysis = analyze_cross_hardware(ref_measurements, measurements, TP_SIZES)

        results = {
            'metadata': {
                'model': MODEL_NAME,
                'dtype': 'bfloat16',
                'tp_sizes': TP_SIZES,
                'reference_file': REFERENCE_FILE,
                'reproducibility_test_passed': reproducibility_passed,
                'reference_environments': reference['metadata']['environments'],
                'verifier_environments': {str(k): v for k, v in environments.items()},
                'timestamp': timestamp
            },
            'within_hardware_analysis': within_analysis,
            'cross_hardware_analysis': cross_analysis
        }

        filepath = os.path.join(output_dir, f"tp_cross_hardware_{timestamp}.json")

        with open(filepath, 'w') as f:
            json.dump(results, f, indent=2)

        log_print(f"\nSaved to: {filepath}")


def main():
    """Entry point: set up logging, run the experiment, always close the log."""
    setup_logging()
    try:
        _run_experiment()
    finally:
        # Close (and thereby flush) the log even when the run raises.
        close_logging()

if __name__ == "__main__":
    main()



TENSOR PARALLELISM DETECTABILITY EXPERIMENT

Hostname: 63ddf43f81b3
GPUs: 4x NVIDIA A100-SXM4-80GB
vLLM: 0.11.2
Model: Qwen/Qwen3-14B
TP sizes to test: [1, 2, 4]
Mode: LOCAL

Loading tokenizer...
Found 1 PDF(s)
  Loading: /workspace/Verification-for-International-AI-Governance.pdf
Total source tokens: 120215
Prompt structure: 33 prefix + 10000 snippet + 34 suffix = 10067 tokens
Created 3 prompts

TENSOR PARALLEL = 1
Loading model with TP=1...
INFO 11-26 23:45:21 [utils.py:253] non-default args: {'trust_remote_code': True, 'download_dir': '/workspace/huggingface_cache', 'dtype': 'bfloat16', 'max_model_len': 32768, 'gpu_memory_utilization': 0.6, 'disable_log_stats': True, 'enforce_eager': True, 'model': 'Qwen/Qwen3-14B'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 11-26 23:45:22 [model.py:631] Resolved architecture: Qwen3ForCausalLM
INFO 11-26 23:45:22 [model.py:1745] Using max model len 32768
INFO 11-26 23:45:30 [scheduler.py:216] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 11-26 23:45:30 [vllm.py:500] Cudagraph is disabled under eager mode




[1;36m(EngineCore_DP0 pid=14477)[0;0m INFO 11-26 23:45:38 [core.py:93] Initializing a V1 LLM engine (v0.11.2) with config: model='Qwen/Qwen3-14B', speculative_config=None, tokenizer='Qwen/Qwen3-14B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32768, download_dir='/workspace/huggingface_cache', load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=

Loading safetensors checkpoint shards:   0% Completed | 0/8 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  12% Completed | 1/8 [00:00<00:02,  3.07it/s]
Loading safetensors checkpoint shards:  25% Completed | 2/8 [00:01<00:03,  1.77it/s]
Loading safetensors checkpoint shards:  38% Completed | 3/8 [00:01<00:03,  1.38it/s]
Loading safetensors checkpoint shards:  50% Completed | 4/8 [00:02<00:03,  1.28it/s]
Loading safetensors checkpoint shards:  62% Completed | 5/8 [00:03<00:02,  1.25it/s]
Loading safetensors checkpoint shards:  75% Completed | 6/8 [00:04<00:01,  1.22it/s]
Loading safetensors checkpoint shards:  88% Completed | 7/8 [00:05<00:00,  1.20it/s]
Loading safetensors checkpoint shards: 100% Completed | 8/8 [00:06<00:00,  1.19it/s]
Loading safetensors checkpoint shards: 100% Completed | 8/8 [00:06<00:00,  1.28it/s]
[1;36m(EngineCore_DP0 pid=14477)[0;0m 


[1;36m(EngineCore_DP0 pid=14477)[0;0m INFO 11-26 23:45:47 [default_loader.py:314] Loading weights took 6.49 seconds
[1;36m(EngineCore_DP0 pid=14477)[0;0m INFO 11-26 23:45:47 [gpu_model_runner.py:3338] Model loading took 27.5185 GiB memory and 7.395146 seconds
[1;36m(EngineCore_DP0 pid=14477)[0;0m INFO 11-26 23:45:49 [gpu_worker.py:359] Available KV cache memory: 18.53 GiB
[1;36m(EngineCore_DP0 pid=14477)[0;0m INFO 11-26 23:45:49 [kv_cache_utils.py:1229] GPU KV cache size: 121,424 tokens
[1;36m(EngineCore_DP0 pid=14477)[0;0m INFO 11-26 23:45:49 [kv_cache_utils.py:1234] Maximum concurrency for 32,768 tokens per request: 3.71x
[1;36m(EngineCore_DP0 pid=14477)[0;0m INFO 11-26 23:45:49 [core.py:250] init engine (profile, create kv cache, warmup model) took 2.08 seconds
[1;36m(EngineCore_DP0 pid=14477)[0;0m INFO 11-26 23:45:50 [vllm.py:500] Cudagraph is disabled under eager mode
INFO 11-26 23:45:51 [llm.py:352] Supported tasks: ['generate']

REPRODUCIBILITY TEST
Running same prompt 5 times...

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Run 1: done


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Run 2: done


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Run 3: done


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Run 4: done


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Run 5: done

✓ REPRODUCIBILITY TEST PASSED
  All 5 runs identical
  Max logprob difference: 0.00e+00
  ref_0: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

3 positions captured
  ref_1: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

3 positions captured
  ref_2: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

3 positions captured





TENSOR PARALLEL = 2
Loading model with TP=2...
INFO 11-26 23:46:12 [utils.py:253] non-default args: {'trust_remote_code': True, 'download_dir': '/workspace/huggingface_cache', 'dtype': 'bfloat16', 'max_model_len': 32768, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.6, 'disable_log_stats': True, 'enforce_eager': True, 'model': 'Qwen/Qwen3-14B'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 11-26 23:46:12 [model.py:631] Resolved architecture: Qwen3ForCausalLM
INFO 11-26 23:46:12 [model.py:1745] Using max model len 32768
INFO 11-26 23:46:12 [scheduler.py:216] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 11-26 23:46:12 [vllm.py:500] Cudagraph is disabled under eager mode




[1;36m(EngineCore_DP0 pid=14880)[0;0m INFO 11-26 23:46:20 [core.py:93] Initializing a V1 LLM engine (v0.11.2) with config: model='Qwen/Qwen3-14B', speculative_config=None, tokenizer='Qwen/Qwen3-14B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32768, download_dir='/workspace/huggingface_cache', load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=



INFO 11-26 23:46:28 [parallel_state.py:1208] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:57767 backend=nccl
INFO 11-26 23:46:28 [parallel_state.py:1208] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:57767 backend=nccl
[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
[Gloo] Rank 1 is connected to 1 peer ranks. [Gloo] Rank Expected number of connected peer ranks is : 10
 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
INFO 11-26 23:46:29 [pynccl.py:111] vLLM is using nccl==2.27.5
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected

Loading safetensors checkpoint shards:   0% Completed | 0/8 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  12% Completed | 1/8 [00:00<00:01,  3.84it/s]
Loading safetensors checkpoint shards:  25% Completed | 2/8 [00:01<00:03,  1.53it/s]
Loading safetensors checkpoint shards:  38% Completed | 3/8 [00:02<00:04,  1.23it/s]
Loading safetensors checkpoint shards:  50% Completed | 4/8 [00:03<00:03,  1.13it/s]
Loading safetensors checkpoint shards:  62% Completed | 5/8 [00:04<00:02,  1.09it/s]
Loading safetensors checkpoint shards:  75% Completed | 6/8 [00:05<00:01,  1.06it/s]
Loading safetensors checkpoint shards:  88% Completed | 7/8 [00:06<00:00,  1.03it/s]
Loading safetensors checkpoint shards: 100% Completed | 8/8 [00:06<00:00,  1.11it/s]
Loading safetensors checkpoint shards: 100% Completed | 8/8 [00:06<00:00,  1.15it/s]
[1;36m(Worker_TP0 pid=15014)[0;0m 


[1;36m(Worker_TP0 pid=15014)[0;0m INFO 11-26 23:46:39 [default_loader.py:314] Loading weights took 7.08 seconds
[1;36m(Worker_TP0 pid=15014)[0;0m INFO 11-26 23:46:39 [gpu_model_runner.py:3338] Model loading took 13.8818 GiB memory and 7.995043 seconds
[1;36m(Worker_TP0 pid=15014)[0;0m INFO 11-26 23:46:46 [gpu_worker.py:359] Available KV cache memory: 31.69 GiB
[1;36m(EngineCore_DP0 pid=14880)[0;0m INFO 11-26 23:46:46 [kv_cache_utils.py:1229] GPU KV cache size: 415,360 tokens
[1;36m(EngineCore_DP0 pid=14880)[0;0m INFO 11-26 23:46:46 [kv_cache_utils.py:1234] Maximum concurrency for 32,768 tokens per request: 12.68x
[1;36m(EngineCore_DP0 pid=14880)[0;0m INFO 11-26 23:46:46 [kv_cache_utils.py:1234] Maximum concurrency for 32,768 tokens per request: 12.68x
[1;36m(EngineCore_DP0 pid=14880)[0;0m INFO 11-26 23:46:46 [core.py:250] init engine (profile, create kv cache, warmup model) took 7.00 seconds
[1;36m(EngineCore_DP0 pid=14880)[0;0m INFO 11-26 23:46:48 [vllm.py:500] Cudagra

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

3 positions captured
  ref_1: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

3 positions captured
  ref_2: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

3 positions captured
[1;36m(Worker_TP0 pid=15014)[0;0m INFO 11-26 23:46:53 [multiproc_executor.py:702] Parent process exited, terminating worker
[1;36m(Worker_TP1 pid=15015)[0;0m INFO 11-26 23:46:53 [multiproc_executor.py:702] Parent process exited, terminating worker

TENSOR PARALLEL = 4
Loading model with TP=4...
INFO 11-26 23:46:56 [utils.py:253] non-default args: {'trust_remote_code': True, 'download_dir': '/workspace/huggingface_cache', 'dtype': 'bfloat16', 'max_model_len': 32768, 'tensor_parallel_size': 4, 'gpu_memory_utilization': 0.6, 'disable_log_stats': True, 'enforce_eager': True, 'model': 'Qwen/Qwen3-14B'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 11-26 23:46:56 [model.py:631] Resolved architecture: Qwen3ForCausalLM
INFO 11-26 23:46:56 [model.py:1745] Using max model len 32768
INFO 11-26 23:46:56 [scheduler.py:216] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 11-26 23:46:56 [vllm.py:500] Cudagraph is disabled under eager mode




[1;36m(EngineCore_DP0 pid=15183)[0;0m INFO 11-26 23:47:04 [core.py:93] Initializing a V1 LLM engine (v0.11.2) with config: model='Qwen/Qwen3-14B', speculative_config=None, tokenizer='Qwen/Qwen3-14B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32768, download_dir='/workspace/huggingface_cache', load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=



INFO 11-26 23:47:12 [parallel_state.py:1208] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:53357 backend=nccl
INFO 11-26 23:47:12 [parallel_state.py:1208] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:53357 backend=nccl
INFO 11-26 23:47:12 [parallel_state.py:1208] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:53357 backend=nccl
INFO 11-26 23:47:12 [parallel_state.py:1208] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:53357 backend=nccl
[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3
[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is :

Loading safetensors checkpoint shards:   0% Completed | 0/8 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  12% Completed | 1/8 [00:00<00:01,  6.14it/s]
Loading safetensors checkpoint shards:  25% Completed | 2/8 [00:00<00:02,  2.74it/s]
Loading safetensors checkpoint shards:  38% Completed | 3/8 [00:01<00:02,  2.14it/s]
Loading safetensors checkpoint shards:  50% Completed | 4/8 [00:01<00:02,  1.93it/s]
Loading safetensors checkpoint shards:  62% Completed | 5/8 [00:02<00:01,  1.86it/s]
Loading safetensors checkpoint shards:  75% Completed | 6/8 [00:03<00:01,  1.78it/s]
Loading safetensors checkpoint shards:  88% Completed | 7/8 [00:03<00:00,  1.74it/s]
Loading safetensors checkpoint shards: 100% Completed | 8/8 [00:04<00:00,  1.89it/s]
Loading safetensors checkpoint shards: 100% Completed | 8/8 [00:04<00:00,  1.97it/s]
[1;36m(Worker_TP0 pid=15316)[0;0m 


[1;36m(Worker_TP0 pid=15316)[0;0m INFO 11-26 23:47:20 [default_loader.py:314] Loading weights took 4.15 seconds
[1;36m(Worker_TP0 pid=15316)[0;0m INFO 11-26 23:47:20 [gpu_model_runner.py:3338] Model loading took 6.9456 GiB memory and 5.243519 seconds
[1;36m(Worker_TP0 pid=15316)[0;0m INFO 11-26 23:47:32 [gpu_worker.py:359] Available KV cache memory: 38.63 GiB
[1;36m(EngineCore_DP0 pid=15183)[0;0m INFO 11-26 23:47:32 [kv_cache_utils.py:1229] GPU KV cache size: 1,012,560 tokens
[1;36m(EngineCore_DP0 pid=15183)[0;0m INFO 11-26 23:47:32 [kv_cache_utils.py:1234] Maximum concurrency for 32,768 tokens per request: 30.90x
[1;36m(EngineCore_DP0 pid=15183)[0;0m INFO 11-26 23:47:32 [kv_cache_utils.py:1234] Maximum concurrency for 32,768 tokens per request: 30.90x
[1;36m(EngineCore_DP0 pid=15183)[0;0m INFO 11-26 23:47:32 [kv_cache_utils.py:1234] Maximum concurrency for 32,768 tokens per request: 30.90x
[1;36m(EngineCore_DP0 pid=15183)[0;0m INFO 11-26 23:47:32 [kv_cache_utils.py:123