# In [1]:
#!/usr/bin/env python3
"""
Quantization Format Detection Experiment using vLLM.
Compares INT4 quantization methods: AWQ vs GPTQ, with and without Marlin kernels.

Tests whether quantization format claims can be verified across different GPU architectures
using logprob forensics with vLLM inference engine.

Signal: logprobs only (vLLM limitation - no key vector access)

Workflow:
1. Run on Machine A with TEACHER_FORCING = False
   → Generates tokens, extracts logprobs, saves to JSON

2. Copy JSON to Machine B

3. Run on Machine B with TEACHER_FORCING = True
   → Teacher-forces A's tokens, compares logprobs
"""

import os
os.environ['HF_HOME'] = '/workspace/huggingface_cache'
os.environ['TRANSFORMERS_CACHE'] = '/workspace/huggingface_cache'

from vllm import LLM, SamplingParams
import numpy as np
from datetime import datetime
import json
import socket
import platform
import sys
import glob
import PyPDF2

# ============================================================================
# CONFIGURATION
# ============================================================================

# Run-mode switch. False = generation run (log file tagged "generate");
# True = teacher-forced verification run (log file tagged "verify").
TEACHER_FORCING = False
# Reference JSON path reported in the log when TEACHER_FORCING is True. Per the
# module docstring it is produced on Machine A and copied to Machine B.
REFERENCE_FILE = "/workspace/experiments/quant_reference.json"

# Model configurations - all INT4, different quantization methods + kernels.
# Each entry supplies the 'model_name', 'quantization' and 'dtype' values that
# load_vllm_model() forwards to vllm.LLM.
#
# Available Qwen3-8B INT4 models for vLLM:
#   - Qwen/Qwen3-8B-AWQ: AWQ quantization, supports awq/awq_marlin kernels
#   - AlphaGaO/Qwen3-8B-GPTQ: GPTQ quantization, pre-converted to Marlin format
#   - pytorch/Qwen3-8B-INT4: TorchAO HQQ (requires nightly vllm/torchao)
#
# Note: OpenVINO/Qwen3-8B-int4-ov is NOT vLLM compatible (OpenVINO backend only)
#
MODEL_CONFIGS = {
    'awq': {
        'model_name': 'Qwen/Qwen3-8B-AWQ',
        'quantization': 'awq',
        'dtype': 'float16',
    },
    'awq_marlin': {
        'model_name': 'Qwen/Qwen3-8B-AWQ',
        'quantization': 'awq_marlin',
        'dtype': 'float16',
    },
    'gptq_marlin': {
        'model_name': 'AlphaGaO/Qwen3-8B-GPTQ',
        'quantization': 'gptq_marlin',
        'dtype': 'float16',
    },
    # Uncomment if you have nightly vllm + torchao installed:
    # 'torchao_hqq': {
    #     'model_name': 'pytorch/Qwen3-8B-INT4',
    #     'quantization': 'torchao',
    #     'dtype': 'bfloat16',
    # },
}

# Passed to vllm.LLM as download_dir; same path as the HF_HOME set at import time.
CACHE_DIR = '/workspace/huggingface_cache'

# max_tokens used for every generation request.
MAX_NEW_TOKENS = 100
# Number of document tokens placed in each prompt's excerpt.
TOKENS_PER_SLICE = 8000
# Number of prompts (non-overlapping document slices) built per run.
NUM_REFERENCES = 4
# Value passed as both `logprobs` and `prompt_logprobs` to SamplingParams.
TOP_K_LOGPROBS = 5

# Threshold for considering two configs "equivalent" (same kernel):
# a pair is equivalent when its averaged logprob distance is below this value.
EQUIVALENCE_THRESHOLD = 1e-9

# Reproducibility check settings
# When True, main() runs run_reproducibility_check() before the main generation.
REPRODUCIBILITY_CHECK = True
# Number of identical inference passes per config in that check.
REPRODUCIBILITY_RUNS = 3

# System message embedded in the chat-formatted prompt prefix.
SYSTEM_PROMPT = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."

# ============================================================================
# LOGGING SETUP
# ============================================================================

# Handle of the currently open experiment log; None while logging is inactive.
LOG_FILE = None

def setup_logging(output_dir='/workspace/experiments'):
    """
    Open a fresh timestamped log file and install it as the module-level LOG_FILE.

    The file name encodes the run mode ("verify" when TEACHER_FORCING is set,
    otherwise "generate") and the current local time.

    Args:
        output_dir: Directory that receives the log file; created if missing.

    Returns:
        Path of the newly opened log file.
    """
    global LOG_FILE
    # A repeated call must not leak the handle opened by the previous one.
    if LOG_FILE:
        LOG_FILE.close()
        LOG_FILE = None
    os.makedirs(output_dir, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    mode = "verify" if TEACHER_FORCING else "generate"
    log_path = os.path.join(output_dir, f"quant_experiment_{mode}_{timestamp}.txt")
    # Log lines contain non-ASCII glyphs (✓, ✗, ⚠, →, ×); an explicit UTF-8
    # encoding keeps writes from failing under a non-UTF-8 default locale.
    LOG_FILE = open(log_path, 'w', encoding='utf-8')
    return log_path

def log_print(*args, **kwargs):
    """
    Print to the console and mirror the same message into the open log file.

    Accepts the same arguments as print(). A caller-supplied `file` keyword
    only affects the console print; the mirrored copy always goes to LOG_FILE
    and is flushed immediately.
    """
    print(*args, **kwargs)
    if not LOG_FILE:
        return
    mirrored_kwargs = dict(kwargs)
    mirrored_kwargs.pop('file', None)
    print(*args, file=LOG_FILE, **mirrored_kwargs)
    LOG_FILE.flush()

def close_logging():
    """Close the active log file, if there is one, and reset LOG_FILE to None."""
    global LOG_FILE
    if not LOG_FILE:
        return
    LOG_FILE.close()
    LOG_FILE = None

# ============================================================================
# PDF LOADING
# ============================================================================

def load_pdf_text(pdf_path):
    """
    Return the text of every page of a PDF as one whitespace-stripped string.

    Pages whose extraction yields nothing are skipped; the remaining page
    texts are each followed by a single space before the final strip.
    """
    with open(pdf_path, 'rb') as pdf_file:
        # Extraction must happen while the underlying file is still open.
        extracted = [page.extract_text() for page in PyPDF2.PdfReader(pdf_file).pages]
    return "".join(chunk + " " for chunk in extracted if chunk).strip()

def create_prompts_from_pdf(tokenizer, num_references=NUM_REFERENCES):
    """
    Build chat-formatted prompts from consecutive slices of the available PDFs.

    PDFs are looked up in /workspace first and in the current directory as a
    fallback. Their text is concatenated and tokenized, then cut into
    `num_references` non-overlapping slices of TOKENS_PER_SLICE tokens, each
    wrapped in a fixed prefix and suffix.

    Returns:
        A list of prompts, each a list of token IDs.

    Raises:
        FileNotFoundError: no PDF was found in either location.
        ValueError: the PDFs do not contain enough tokens for all slices.
    """
    pdf_files = sorted(glob.glob("/workspace/*.pdf")) or sorted(glob.glob("*.pdf"))
    if not pdf_files:
        raise FileNotFoundError("No PDF files found")

    log_print(f"Found {len(pdf_files)} PDF(s)")
    for path in pdf_files:
        log_print(f"  Loading: {path}")

    all_text = "".join(load_pdf_text(path) + " " for path in pdf_files)

    content_tokens = tokenizer.encode(all_text)
    available = len(content_tokens)
    log_print(f"Total source tokens: {available}")

    tokens_needed = num_references * TOKENS_PER_SLICE
    if available < tokens_needed:
        raise ValueError(f"Need {tokens_needed} tokens but only have {len(content_tokens)}")

    # Chat template pieces that surround every document excerpt.
    prefix = f"""<|im_start|>system
{SYSTEM_PROMPT}<|im_end|>
<|im_start|>user
Here is an excerpt from a document:

\""""

    suffix = f"""\"

Based on this excerpt, what type of document do you think this is from, and what is its likely subject matter? Explain your reasoning.<|im_end|>
<|im_start|>assistant
"""

    prefix_tokens = tokenizer.encode(prefix)
    suffix_tokens = tokenizer.encode(suffix)

    total_len = len(prefix_tokens) + TOKENS_PER_SLICE + len(suffix_tokens)
    log_print(f"Prompt structure: {len(prefix_tokens)} prefix + {TOKENS_PER_SLICE} snippet + {len(suffix_tokens)} suffix = {total_len} tokens")

    # Slice k covers tokens [k * TOKENS_PER_SLICE, (k + 1) * TOKENS_PER_SLICE).
    return [
        prefix_tokens
        + content_tokens[k * TOKENS_PER_SLICE:(k + 1) * TOKENS_PER_SLICE]
        + suffix_tokens
        for k in range(num_references)
    ]

# ============================================================================
# SYSTEM INFO
# ============================================================================

def collect_system_info():
    """
    Gather host, Python, library and GPU details for the experiment metadata.

    CUDA-related fields fall back to "N/A" (or 0 for the GPU count) when
    torch reports no CUDA device; the vLLM version falls back to "unknown"
    when it cannot be read.
    """
    import torch
    import transformers

    if torch.cuda.is_available():
        cuda_version = torch.version.cuda
        cudnn_version = str(torch.backends.cudnn.version())
        gpu_name = torch.cuda.get_device_name(0)
        gpu_count = torch.cuda.device_count()
    else:
        cuda_version = cudnn_version = gpu_name = "N/A"
        gpu_count = 0

    info = {
        "hostname": socket.gethostname(),
        "platform": platform.platform(),
        "python_version": sys.version.split()[0],
        "torch_version": torch.__version__,
        "cuda_version": cuda_version,
        "cudnn_version": cudnn_version,
        "transformers_version": transformers.__version__,
        "numpy_version": np.__version__,
        "gpu_name": gpu_name,
        "gpu_count": gpu_count,
    }

    try:
        import vllm
        vllm_version = vllm.__version__
    except (ImportError, AttributeError):
        vllm_version = "unknown"
    info["vllm_version"] = vllm_version

    return info

def validate_environment_match(reference_env, verifier_env):
    """
    Compare the reference and verifier environments and log the outcome.

    The vLLM, torch and CUDA versions are expected to match; every difference
    is recorded as a (field, reference_value, verifier_value) tuple. GPU name
    and hostname are expected to differ, and only produce a logged warning
    when they are the same.

    Returns:
        {'valid': bool, 'mismatches': list of mismatch tuples}.
    """
    log_print("\n" + "="*80)
    log_print("ENVIRONMENT VALIDATION")
    log_print("="*80)

    def both_values(field):
        return reference_env.get(field, 'N/A'), verifier_env.get(field, 'N/A')

    mismatches = []

    log_print("\nCritical dependencies:")
    for field in ('vllm_version', 'torch_version', 'cuda_version'):
        ref_val, ver_val = both_values(field)
        if ref_val != ver_val:
            log_print(f"  ✗ {field}: reference={ref_val}, verifier={ver_val}")
            mismatches.append((field, ref_val, ver_val))
        else:
            log_print(f"  ✓ {field}: {ref_val}")

    log_print("\nExpected differences (hardware):")
    for field in ('gpu_name', 'hostname'):
        ref_val, ver_val = both_values(field)
        if ref_val == ver_val:
            log_print(f"  ⚠ {field}: SAME ({ref_val}) - are you on different hardware?")
        else:
            log_print(f"  ✓ {field}: reference={ref_val}, verifier={ver_val}")

    if mismatches:
        log_print("\n⚠ ENVIRONMENT MISMATCHES DETECTED")
        log_print("  Results may be affected by software differences, not just hardware.")
        return {'valid': False, 'mismatches': mismatches}

    log_print("\n✓ ENVIRONMENT VALIDATION PASSED")
    return {'valid': True, 'mismatches': []}

# ============================================================================
# MODEL LOADING
# ============================================================================

def load_vllm_model(config_name):
    """
    Instantiate the vLLM engine for one MODEL_CONFIGS entry.

    Returns:
        (llm, tokenizer): the engine and the tokenizer it exposes.
    """
    cfg = MODEL_CONFIGS[config_name]

    log_print(f"Loading model: {cfg['model_name']}")
    log_print(f"  Quantization: {cfg['quantization']}")
    log_print(f"  Dtype: {cfg['dtype']}")

    engine = LLM(
        model=cfg['model_name'],
        download_dir=CACHE_DIR,
        dtype=cfg['dtype'],
        trust_remote_code=True,
        gpu_memory_utilization=0.7,
        quantization=cfg['quantization'],
    )

    return engine, engine.get_tokenizer()

# ============================================================================
# LOGPROB EXTRACTION
# ============================================================================

def extract_logprobs_from_output(output, positions=(-3, -2, -1)):
    """
    Extract the logprobs recorded for generated tokens at selected positions.

    Args:
        output: A request output whose `outputs[0].logprobs` is either None or
            a sequence with one mapping per generated token; each mapping goes
            from token ID to an object exposing `.logprob`.
        positions: Indices into the generated sequence; negative values count
            from the end. Positions outside the sequence are skipped.

    Returns:
        Dict keyed by "pos_<position>" (the position as requested, not the
        resolved index) with value
        {'logprobs': {'token_ids': [...], 'log_probs': [...]}}.
        Empty when no logprobs were recorded.
    """
    signals = {}

    logprobs_list = output.outputs[0].logprobs
    if logprobs_list is None:
        return signals

    num_generated = len(logprobs_list)

    for pos in positions:
        # Resolve negative positions relative to the end of the generation.
        actual_idx = pos if pos >= 0 else num_generated + pos
        if not 0 <= actual_idx < num_generated:
            continue

        token_logprobs = logprobs_list[actual_idx]
        signals[f"pos_{pos}"] = {
            'logprobs': {
                'token_ids': list(token_logprobs),
                'log_probs': [entry.logprob for entry in token_logprobs.values()],
            }
        }

    return signals

def extract_prompt_logprobs(output, prompt_length, positions=(-3, -2, -1)):
    """
    Extract prompt-side logprobs at selected positions (for prefill analysis).

    Args:
        output: A request output whose `prompt_logprobs` is either None or a
            sequence of per-token mappings (token ID -> object with
            `.logprob`); individual entries may be None.
        prompt_length: Length that negative positions are resolved against.
            It may be shorter than the recorded sequence, in which case
            positions are counted from that boundary, not from the true end.
        positions: Indices to sample; negative values count back from
            `prompt_length`. Out-of-range positions and None entries are
            skipped.

    Returns:
        Dict keyed by "pos_<position>" with value
        {'logprobs': {'token_ids': [...], 'log_probs': [...]}}.
    """
    signals = {}

    prompt_logprobs_list = output.prompt_logprobs
    if prompt_logprobs_list is None:
        return signals

    for pos in positions:
        actual_idx = pos if pos >= 0 else prompt_length + pos
        if not 0 <= actual_idx < len(prompt_logprobs_list):
            continue

        token_logprobs = prompt_logprobs_list[actual_idx]
        if token_logprobs is None:
            continue

        signals[f"pos_{pos}"] = {
            'logprobs': {
                'token_ids': list(token_logprobs),
                'log_probs': [entry.logprob for entry in token_logprobs.values()],
            }
        }

    return signals

# ============================================================================
# GENERATION MODE
# ============================================================================

def run_generation(llm, tokenizer, prompt_ids):
    """
    Greedily generate from a tokenized prompt and collect logprob signals.

    Args:
        llm: vLLM engine used for generation.
        tokenizer: Unused; kept so existing callers keep working.
        prompt_ids: Prompt as a list of token IDs.

    Returns:
        Dict with the prompt IDs, generated IDs, prompt length, number of
        generated tokens, and the prefill / decode signals sampled at the last
        three positions of the prompt and of the generation respectively.
    """
    prompt_length = len(prompt_ids)

    sampling_params = SamplingParams(
        max_tokens=MAX_NEW_TOKENS,
        temperature=0.0,
        logprobs=TOP_K_LOGPROBS,
        prompt_logprobs=TOP_K_LOGPROBS,
    )

    # Hand the token IDs to vLLM directly. Decoding them to text and letting
    # vLLM re-tokenize is not guaranteed to reproduce prompt_ids (for example
    # when a slice boundary splits a multi-byte character), which would make
    # the stored prompt_ids / prompt_length disagree with what was evaluated.
    outputs = llm.generate([{"prompt_token_ids": list(prompt_ids)}], sampling_params)
    output = outputs[0]

    generated_ids = list(output.outputs[0].token_ids)

    prefill_signals = extract_prompt_logprobs(output, prompt_length, positions=[-3, -2, -1])
    decode_signals = extract_logprobs_from_output(output, positions=[-3, -2, -1])

    return {
        'prompt_ids': prompt_ids,
        'generated_ids': generated_ids,
        'prompt_length': prompt_length,
        'prefill_signals': prefill_signals,
        'decode_signals': decode_signals,
        'num_generated': len(generated_ids)
    }

# ============================================================================
# VERIFICATION MODE (TEACHER FORCING)
# ============================================================================

def run_teacher_forced_verification(llm, tokenizer, reference_data, is_diagonal):
    """
    Teacher-forced verification of a reference generation.

    vLLM doesn't support true teacher forcing, so the full reference sequence
    (prompt + generated tokens) is submitted as a single prompt and the
    prompt logprobs are read back at the positions of interest.

    Args:
        llm: vLLM engine for the config being verified.
        tokenizer: Unused; kept so existing callers keep working.
        reference_data: Dict with 'prompt_ids' and 'generated_ids' token lists
            from the reference run.
        is_diagonal: Unused here; both same-config and cross-config
            verification follow the same path.

    Returns:
        Dict with 'prefill_signals', 'decode_signals' and 'num_generated'.
        NOTE(review): assumes vLLM's prompt_logprobs list holds one entry per
        submitted token, so index i describes the token at position i - confirm.
    """
    ref_prompt_ids = reference_data['prompt_ids']
    ref_generated_ids = reference_data['generated_ids']

    prompt_length = len(ref_prompt_ids)
    num_generated = len(ref_generated_ids)

    # Full sequence = prompt + generated
    full_ids = list(ref_prompt_ids) + list(ref_generated_ids)

    log_print(f"      Prompt: {prompt_length}, Gen: {num_generated}", end="")

    sampling_params = SamplingParams(
        max_tokens=1,  # Minimal generation, we just want logprobs
        temperature=0.0,
        prompt_logprobs=TOP_K_LOGPROBS,
        logprobs=TOP_K_LOGPROBS,
    )

    # Submit the reference token IDs themselves. Going through decoded text
    # lets vLLM re-tokenize, which may yield a different token sequence, so
    # the reference tokens would not actually be forced and every index
    # derived from prompt_length / num_generated could be misaligned.
    outputs = llm.generate([{"prompt_token_ids": full_ids}], sampling_params)
    output = outputs[0]

    # Prefill signals: last three positions of the prompt portion.
    prefill_signals = extract_prompt_logprobs(
        output, prompt_length, positions=[-3, -2, -1]
    )
    # Decode signals: last three positions of the full sequence, i.e. the
    # tail of the reference generation.
    decode_signals = extract_prompt_logprobs(
        output, prompt_length + num_generated, positions=[-3, -2, -1]
    )

    log_print(f" → verified")

    return {
        'prefill_signals': prefill_signals,
        'decode_signals': decode_signals,
        'num_generated': num_generated
    }

# ============================================================================
# DISTANCE METRICS
# ============================================================================

def compute_logprob_distance_canonical(logprobs1, logprobs2, canonical_ids):
    """
    L2 distance between two logprob records over a canonical set of token IDs.

    Only canonical IDs present in both records contribute. Returns infinity
    when none of them is shared.
    """
    first = dict(zip(logprobs1['token_ids'], logprobs1['log_probs']))
    second = dict(zip(logprobs2['token_ids'], logprobs2['log_probs']))

    shared_ids = [tid for tid in canonical_ids if tid in first and tid in second]
    if not shared_ids:
        return float('inf')

    lhs = np.array([first[tid] for tid in shared_ids])
    rhs = np.array([second[tid] for tid in shared_ids])
    return float(np.linalg.norm(lhs - rhs))

def compare_signals(signals1, signals2):
    """
    Compare two signal sets position by position.

    For every position label present in both sets, the first five token IDs
    of the first signal are used as the canonical IDs for the distance.
    Infinite distances (no shared token IDs) are left out of the summary.

    Returns:
        {'logprobs_mean': ..., 'logprobs_max': ...}; both are infinity when
        no finite distance is available.
    """
    shared_labels = set(signals1.keys()) & set(signals2.keys())

    distances = []
    for label in shared_labels:
        record1 = signals1[label]['logprobs']
        record2 = signals2[label]['logprobs']
        # Use top 5 for comparison
        distances.append(
            compute_logprob_distance_canonical(record1, record2, record1['token_ids'][:5])
        )

    usable = [d for d in distances if d != float('inf')]
    if not usable:
        return {'logprobs_mean': float('inf'), 'logprobs_max': float('inf')}

    return {'logprobs_mean': np.mean(usable), 'logprobs_max': max(usable)}

# ============================================================================
# EQUIVALENCE DETECTION
# ============================================================================

def find_equivalent_pairs(matrix, config_names, threshold=EQUIVALENCE_THRESHOLD):
    """
    List config pairs whose distance is below `threshold` (same kernel).

    Only the upper triangle of `matrix` is inspected, so each unordered pair
    appears at most once, in config_names order.
    """
    count = len(config_names)
    return [
        (config_names[row], config_names[col])
        for row in range(count)
        for col in range(row + 1, count)
        if matrix[row, col] < threshold
    ]

def format_kernel_classes(equivalent_pairs, config_names):
    """
    Partition configs into equivalence classes implied by the given pairs.

    Returns a list of sets, ordered by the first appearance of a class member
    in `config_names`. Configs without any pair form singleton classes.
    """
    leader = {name: name for name in config_names}

    def root_of(name):
        # First pass: walk up to the representative.
        top = name
        while leader[top] != top:
            top = leader[top]
        # Second pass: point every node on the path straight at it.
        while leader[name] != top:
            following = leader[name]
            leader[name] = top
            name = following
        return top

    for left, right in equivalent_pairs:
        left_root, right_root = root_of(left), root_of(right)
        if left_root != right_root:
            leader[left_root] = right_root

    classes = {}
    for name in config_names:
        classes.setdefault(root_of(name), set()).add(name)

    return list(classes.values())

# ============================================================================
# WITHIN-HARDWARE ANALYSIS
# ============================================================================

def analyze_within_hardware(measurements, config_names, signal_source='decode'):
    """
    Analyze within-hardware quantization effects.

    For each of the NUM_REFERENCES prompts a config-by-config matrix of mean
    logprob distances is built, where cell (i, j) compares config i's signals
    against config j's using config i's top tokens as the canonical set. The
    matrices are logged and averaged, and configs are then grouped into
    equivalence classes based on the averaged matrix.

    Args:
        measurements: measurements[config][ref_idx] is a dict holding
            'prefill_signals' and 'decode_signals'.
        config_names: Configs to compare, in display order.
        signal_source: 'prefill' selects prefill signals; any other value
            selects decode signals.

    Returns:
        Dict with the averaged matrix, per-reference matrices, the mean of the
        finite off-diagonal cells (0 when there are none), the equivalent
        pairs, and the sorted equivalence classes.
    """
    log_print("\n" + "="*80)
    log_print(f"WITHIN-HARDWARE QUANTIZATION EFFECTS ({signal_source.upper()})")
    log_print("="*80)

    n = len(config_names)
    all_matrices = []
    # Both are loop-invariant, so compute them once up front.
    signals_key = 'prefill_signals' if signal_source == 'prefill' else 'decode_signals'
    header = "              " + " ".join(f"{c:>12}" for c in config_names)

    for ref_idx in range(NUM_REFERENCES):
        log_print(f"\n--- ref_{ref_idx} ---")

        matrix = np.zeros((n, n))

        for i, cfg_i in enumerate(config_names):
            for j, cfg_j in enumerate(config_names):
                sig_i = measurements[cfg_i][ref_idx][signals_key]
                sig_j = measurements[cfg_j][ref_idx][signals_key]

                if i == j:
                    matrix[i, j] = 0.0
                else:
                    distances = compare_signals(sig_i, sig_j)
                    matrix[i, j] = distances['logprobs_mean']

        # Display matrix
        log_print(f"\nLogprobs (L2 distance):")
        log_print(header)
        for i, cfg in enumerate(config_names):
            row = f"{cfg:>12} " + " ".join(f"{matrix[i,j]:12.2e}" for j in range(n))
            log_print(row)

        all_matrices.append(matrix)

    # Aggregate
    avg_matrix = np.mean(all_matrices, axis=0)

    log_print(f"\n" + "="*80)
    log_print(f"AGGREGATE (average across references):")
    log_print("="*80)

    log_print(header)
    for i, cfg in enumerate(config_names):
        row = f"{cfg:>12} " + " ".join(f"{avg_matrix[i,j]:12.2e}" for j in range(n))
        log_print(row)

    # Find equivalent pairs
    equivalent_pairs = find_equivalent_pairs(avg_matrix, config_names)
    kernel_classes = format_kernel_classes(equivalent_pairs, config_names)

    # Off-diagonal stats
    off_diag = [avg_matrix[i, j] for i in range(n) for j in range(n) if i != j]
    finite_off_diag = [d for d in off_diag if d != float('inf')]

    log_print(f"\nOff-diagonal stats:")
    if finite_off_diag:
        log_print(f"  Mean: {np.mean(finite_off_diag):.2e}")
        log_print(f"  Range: [{np.min(finite_off_diag):.2e}, {np.max(finite_off_diag):.2e}]")

    # Warn only when there is at least one finite comparison and every one of
    # them is below the threshold. Without the emptiness guard the warning
    # would also fire when there was nothing to compare (single config, or
    # every distance infinite).
    if finite_off_diag and all(d < EQUIVALENCE_THRESHOLD for d in finite_off_diag):
        log_print(f"\n⚠ WARNING: All comparisons are EXACTLY ZERO")
        log_print("  All configs produce identical results (single kernel class)")

    log_print(f"\nKernel equivalence classes:")
    for i, cls in enumerate(kernel_classes):
        log_print(f"  Class {i+1}: {sorted(cls)}")

    if equivalent_pairs:
        log_print(f"\nEquivalent pairs:")
        for cfg1, cfg2 in equivalent_pairs:
            log_print(f"  ({cfg1}, {cfg2})")

    return {
        'matrix': avg_matrix.tolist(),
        'per_reference_matrices': [m.tolist() for m in all_matrices],
        'off_diagonal_mean': float(np.mean(finite_off_diag)) if finite_off_diag else 0,
        'equivalent_pairs': equivalent_pairs,
        'kernel_classes': [sorted(list(cls)) for cls in kernel_classes]
    }

# ============================================================================
# CROSS-HARDWARE ANALYSIS
# ============================================================================

def analyze_cross_hardware(comparison_results, config_names, signal_source='decode',
                           equivalent_pairs=None):
    """
    Build the claimed-vs-verified distance matrices and report the SNR.

    Rows are the claimed config and columns the verifying config. The
    diagonal of the averaged matrix serves as the baseline; off-diagonal
    cells are the signal, reported both over all pairs and over the pairs not
    listed (in either order) in `equivalent_pairs`. Cells with no comparison
    result stay at 0.

    Args:
        comparison_results: Dicts with 'ref_idx', 'claimed_config',
            'verify_config' and 'prefill_distances' / 'decode_distances'.
        config_names: Configs in display order.
        signal_source: 'prefill' selects prefill distances; any other value
            selects decode distances.
        equivalent_pairs: Config pairs to leave out of the "meaningful" signal.

    Returns:
        Dict with the averaged matrix, per-reference matrices, baseline and
        signal means, both SNR values and the exclusion bookkeeping.
    """
    log_print("\n" + "="*80)
    log_print(f"CROSS-HARDWARE QUANTIZATION DETECTABILITY ({signal_source.upper()})")
    log_print("="*80)

    if equivalent_pairs is None:
        equivalent_pairs = []

    # Store both orderings so membership tests do not depend on direction.
    equiv_set = set()
    for first, second in equivalent_pairs:
        equiv_set.update({(first, second), (second, first)})

    dist_key = 'prefill_distances' if signal_source == 'prefill' else 'decode_distances'

    # Group results by reference index, keyed by (claimed, verify).
    by_ref = {}
    for result in comparison_results:
        per_ref = by_ref.setdefault(result['ref_idx'], {})
        per_ref[(result['claimed_config'], result['verify_config'])] = result

    n = len(config_names)
    header = "              " + " ".join(f"{c:>12}" for c in config_names)

    def show_table(table):
        log_print(header)
        for row_idx, row_cfg in enumerate(config_names):
            cells = "".join(f"{table[row_idx,col]:12.2e} " for col in range(n))
            log_print(f"{row_cfg:>12} " + cells)

    # Per-reference matrices
    all_matrices = []
    for ref_idx in sorted(by_ref.keys()):
        log_print(f"\n--- ref_{ref_idx} ---")

        ref_data = by_ref[ref_idx]
        matrix = np.zeros((n, n))
        for i, claimed_cfg in enumerate(config_names):
            for j, verify_cfg in enumerate(config_names):
                entry = ref_data.get((claimed_cfg, verify_cfg))
                if entry is not None:
                    matrix[i, j] = entry[dist_key]['logprobs_mean']

        show_table(matrix)
        all_matrices.append(matrix)

    # Aggregate matrix
    avg_matrix = np.mean(all_matrices, axis=0)

    log_print("\n" + "="*80)
    log_print("AGGREGATE (average across references):")
    log_print("  Rows = claimed config, Cols = verified config")
    log_print("="*80)
    show_table(avg_matrix)

    # Baseline: same config on both sides.
    diagonal = [avg_matrix[i, i] for i in range(n)]

    # Signal: different configs, with equivalent pairs tracked separately.
    off_diagonal_all = []
    off_diagonal_meaningful = []
    excluded_pairs = []
    for i, cfg1 in enumerate(config_names):
        for j, cfg2 in enumerate(config_names):
            if i == j:
                continue
            cell = avg_matrix[i, j]
            off_diagonal_all.append(cell)
            if (cfg1, cfg2) in equiv_set:
                excluded_pairs.append((cfg1, cfg2))
            else:
                off_diagonal_meaningful.append(cell)

    baseline_mean = np.mean(diagonal)
    signal_all_mean = np.mean(off_diagonal_all) if off_diagonal_all else 0.0
    signal_meaningful_mean = np.mean(off_diagonal_meaningful) if off_diagonal_meaningful else 0.0

    if baseline_mean > 0:
        snr_all = signal_all_mean / baseline_mean
        snr_meaningful = signal_meaningful_mean / baseline_mean
    else:
        snr_all = float('inf')
        snr_meaningful = float('inf')

    log_print("\n" + "="*80)
    log_print("SNR ANALYSIS")
    log_print("="*80)

    log_print(f"\nDiagonal (baseline = cross-hardware, same config):")
    log_print(f"  Mean: {baseline_mean:.2e}")

    log_print(f"\nOff-diagonal (all pairs):")
    log_print(f"  Count: {len(off_diagonal_all)}")
    log_print(f"  Mean: {signal_all_mean:.2e}")
    log_print(f"  SNR (all): {snr_all:.2f}×")

    if equivalent_pairs:
        log_print(f"\nExcluded equivalent pairs (same kernel within-hardware):")
        for first, second in equivalent_pairs:
            log_print(f"  ({first}, {second})")
        log_print(f"  Total excluded: {len(excluded_pairs)} cells")

    log_print(f"\nOff-diagonal (meaningful pairs only):")
    log_print(f"  Count: {len(off_diagonal_meaningful)}")
    if not off_diagonal_meaningful:
        log_print("  No meaningful pairs (all configs are equivalent)")
    else:
        log_print(f"  Mean: {signal_meaningful_mean:.2e}")
        log_print(f"  SNR (meaningful): {snr_meaningful:.2f}×")

    return {
        'matrix': avg_matrix.tolist(),
        'per_reference_matrices': [m.tolist() for m in all_matrices],
        'baseline_mean': float(baseline_mean),
        'signal_all_mean': float(signal_all_mean),
        'signal_meaningful_mean': float(signal_meaningful_mean),
        'snr_all': float(snr_all),
        'snr_meaningful': float(snr_meaningful),
        'excluded_pairs': equivalent_pairs,
        'n_excluded_cells': len(excluded_pairs),
        'n_meaningful_pairs': len(off_diagonal_meaningful)
    }

# ============================================================================
# TOKEN CONSISTENCY CHECK
# ============================================================================

def check_token_consistency(measurements, config_names, tokenizer):
    """
    Log, per reference prompt, whether each config generated the same tokens.

    The first entry of `config_names` is the baseline every config (itself
    included) is compared against. For each config the token count, the
    decoded first 30 tokens and the match marker are logged.
    """
    log_print("\n" + "="*80)
    log_print("TOKEN GENERATION CONSISTENCY CHECK")
    log_print("="*80)

    for ref_idx in range(NUM_REFERENCES):
        log_print(f"\n--- ref_{ref_idx} ---")

        tokens_by_cfg = {
            cfg: measurements[cfg][ref_idx]['generated_ids'] for cfg in config_names
        }
        baseline_tokens = tokens_by_cfg[config_names[0]]

        for cfg in config_names:
            tokens = tokens_by_cfg[cfg]
            if tokens == baseline_tokens:
                match_str = "✓"
            else:
                match_str = "✗ DIFFERENT"
            preview = tokenizer.decode(tokens[:30])
            log_print(f"  {cfg}:")
            log_print(f"    Tokens: {len(tokens)}")
            log_print(f"    First 30: {repr(preview)}...")
            log_print(f"    {match_str}")

# ============================================================================
# REPRODUCIBILITY CHECK
# ============================================================================

def run_reproducibility_check(config_names, prompts):
    """
    Estimate the per-config noise floor from repeated identical runs.

    Each config's model is loaded, the first prompt is generated
    REPRODUCIBILITY_RUNS times, and every pair of runs is compared on both
    decode and prefill signals. The noise floor of a config is the mean of
    its finite pairwise distances (0 when there are none).

    Returns:
        {config_name: {'decode': float, 'prefill': float}}.
    """
    import torch

    log_print("\n" + "="*80)
    log_print("REPRODUCIBILITY CHECK (NOISE FLOOR MEASUREMENT)")
    log_print("="*80)
    log_print(f"Running {REPRODUCIBILITY_RUNS} identical inference passes per config")
    log_print("Measures within-format variance from atomics/non-deterministic kernels.\n")

    test_prompt = prompts[0]  # Use first prompt for reproducibility check
    # Every unordered pair of run indices, in ascending order.
    run_pairs = [
        (a, b)
        for a in range(REPRODUCIBILITY_RUNS)
        for b in range(a + 1, REPRODUCIBILITY_RUNS)
    ]
    noise_floors = {}

    for cfg_name in config_names:
        log_print(f"\n--- Checking: {cfg_name} ---")

        engine, tokenizer = load_vllm_model(cfg_name)

        # Repeat the same request and keep only what the comparison needs.
        run_signals = []
        for run_idx in range(REPRODUCIBILITY_RUNS):
            log_print(f"  Run {run_idx + 1}: ", end="")
            gen_data = run_generation(engine, tokenizer, test_prompt)
            run_signals.append({
                key: gen_data[key]
                for key in ('generated_ids', 'decode_signals', 'prefill_signals')
            })
            log_print(f"{gen_data['num_generated']} tokens")

        # Compute pairwise distances between all runs
        log_print(f"\n  Pairwise distances:")
        decode_dists = []
        prefill_dists = []

        for a, b in run_pairs:
            decode_dist = compare_signals(
                run_signals[a]['decode_signals'], run_signals[b]['decode_signals']
            )
            prefill_dist = compare_signals(
                run_signals[a]['prefill_signals'], run_signals[b]['prefill_signals']
            )
            decode_dists.append(decode_dist['logprobs_mean'])
            prefill_dists.append(prefill_dist['logprobs_mean'])

            log_print(f"    Run {a+1} vs {b+1}: decode={decode_dist['logprobs_mean']:.2e}, prefill={prefill_dist['logprobs_mean']:.2e}")

        # Noise floor = mean of the finite distances.
        finite_decode = [d for d in decode_dists if d != float('inf')]
        finite_prefill = [d for d in prefill_dists if d != float('inf')]

        decode_noise = np.mean(finite_decode) if finite_decode else 0
        prefill_noise = np.mean(finite_prefill) if finite_prefill else 0

        noise_floors[cfg_name] = {'decode': decode_noise, 'prefill': prefill_noise}

        if decode_noise >= EQUIVALENCE_THRESHOLD:
            log_print(f"\n  {cfg_name} noise floor: decode={decode_noise:.2e}, prefill={prefill_noise:.2e}")
        else:
            log_print(f"\n  {cfg_name} noise floor: DETERMINISTIC (decode={decode_noise:.2e})")

        # Drop the engine before the next config is loaded.
        del engine
        torch.cuda.empty_cache()

    log_print("\n" + "-"*40)
    log_print("NOISE FLOOR SUMMARY")
    log_print("-"*40)
    for cfg_name, floor in noise_floors.items():
        log_print(f"  {cfg_name}: decode={floor['decode']:.2e}, prefill={floor['prefill']:.2e}")
    log_print("\nCross-format signal must exceed this noise floor to be detectable.")
    log_print("="*80)

    return noise_floors

# ============================================================================
# MAIN
# ============================================================================

def main():
    """Run the quantization-format detection experiment end to end.

    Sets up logging, logs the system and model configuration, then dispatches
    on the module-level ``TEACHER_FORCING`` flag:

    * ``False`` -> generation mode (``_run_generation_mode``)
    * ``True``  -> verification mode (``_run_verification_mode``)

    Logging is always closed on the way out, including when a step raises.
    """
    setup_logging()
    try:
        system_info = collect_system_info()
        config_names = list(MODEL_CONFIGS)

        mode = "VERIFICATION (teacher-forcing)" if TEACHER_FORCING else "GENERATION"
        log_print("="*80)
        log_print(f"QUANTIZATION FORMAT DETECTION EXPERIMENT - {mode}")
        log_print("="*80)

        log_print(f"\nSystem: {system_info['hostname']}")
        log_print(f"GPU: {system_info['gpu_name']}")
        log_print(f"vLLM: {system_info['vllm_version']}")
        log_print(f"PyTorch: {system_info['torch_version']}")
        log_print(f"CUDA: {system_info['cuda_version']}")

        log_print("\nConfigurations:")
        for cfg_name, cfg in MODEL_CONFIGS.items():
            log_print(f"  {cfg_name}: {cfg['model_name']} (quant={cfg['quantization']})")

        if TEACHER_FORCING:
            log_print(f"\nReference file: {REFERENCE_FILE}")
        log_print()

        output_dir = '/workspace/experiments'
        os.makedirs(output_dir, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        if TEACHER_FORCING:
            _run_verification_mode(system_info, config_names, output_dir, timestamp)
        else:
            _run_generation_mode(system_info, config_names, output_dir, timestamp)

        log_print(f"\n{'='*80}")
        log_print("EXPERIMENT COMPLETE")
        log_print("="*80 + "\n")
    finally:
        # Close logging even if a model load or inference step raised.
        close_logging()


def _run_generation_mode(system_info, config_names, output_dir, timestamp):
    """Generate with every config, analyze the results and save them as JSON.

    Args:
        system_info: Mapping returned by ``collect_system_info()``; stored
            verbatim under ``metadata.environment``.
        config_names: Keys of ``MODEL_CONFIGS`` to run, in order.
        output_dir: Existing directory that receives ``quant_generate_<ts>.json``.
        timestamp: Timestamp string used in the output filename and metadata.
    """
    import torch

    # A tokenizer is needed to build the prompts, so the first model is loaded
    # only for that purpose and released again right away.
    log_print("Loading first model to create prompts...")
    first_llm, tokenizer = load_vllm_model(config_names[0])
    prompts = create_prompts_from_pdf(tokenizer, NUM_REFERENCES)
    log_print(f"Created {len(prompts)} prompts\n")
    del first_llm
    torch.cuda.empty_cache()

    # Optional reproducibility check; stays empty when the check is disabled.
    noise_floors = {}
    if REPRODUCIBILITY_CHECK:
        noise_floors = run_reproducibility_check(config_names, prompts)

    log_print("\n" + "="*80)
    log_print("GENERATION MODE")
    log_print("="*80)

    results = {
        'metadata': {
            'environment': system_info,
            'model_configs': dict(MODEL_CONFIGS),
            'max_new_tokens': MAX_NEW_TOKENS,
            'tokens_per_slice': TOKENS_PER_SLICE,
            'num_references': NUM_REFERENCES,
            'top_k_logprobs': TOP_K_LOGPROBS,
            'timestamp': timestamp,
        },
        'generations': {},
    }
    measurements = {}

    # Fields copied from each generation into the saved reference / the
    # in-memory measurements used by the within-hardware analysis.
    saved_fields = (
        'prompt_ids', 'generated_ids', 'prompt_length',
        'prefill_signals', 'decode_signals', 'num_generated',
    )
    measured_fields = ('generated_ids', 'prefill_signals', 'decode_signals')

    for cfg_name in config_names:
        log_print(f"\n--- Config: {cfg_name} ---")

        llm, tokenizer = load_vllm_model(cfg_name)

        saved = results['generations'][cfg_name] = []
        measured = measurements[cfg_name] = []

        for ref_idx, prompt_ids in enumerate(prompts):
            log_print(f"  ref_{ref_idx}: ", end="")
            gen_data = run_generation(llm, tokenizer, prompt_ids)

            record = {'ref_idx': ref_idx}
            record.update({key: gen_data[key] for key in saved_fields})
            saved.append(record)
            measured.append({key: gen_data[key] for key in measured_fields})

            log_print(f"{gen_data['num_generated']} tokens")
            decoded = tokenizer.decode(gen_data['generated_ids'][:20])
            log_print(f"    -> {decoded}...")

        # Drop the engine before the next config is loaded.
        del llm
        torch.cuda.empty_cache()

    # Uses the tokenizer of the last loaded config, as the loop leaves it bound.
    check_token_consistency(measurements, config_names, tokenizer)

    prefill_sanity = analyze_within_hardware(measurements, config_names, 'prefill')
    decode_sanity = analyze_within_hardware(measurements, config_names, 'decode')

    results['prefill_sanity_check'] = prefill_sanity
    results['decode_sanity_check'] = decode_sanity

    if noise_floors:
        results['noise_floors'] = noise_floors

        # Signal-to-noise: cross-format distance relative to the mean
        # within-format decode noise floor.
        avg_noise_decode = np.mean([nf['decode'] for nf in noise_floors.values()])
        cross_format_decode = decode_sanity.get('off_diagonal_mean', 0)

        if avg_noise_decode > 0 and cross_format_decode:
            snr_decode = cross_format_decode / avg_noise_decode
            log_print("\n" + "="*80)
            log_print("SIGNAL-TO-NOISE ANALYSIS")
            log_print("="*80)
            log_print(f"\nNoise floor (within-format variance): {avg_noise_decode:.2e}")
            log_print(f"Cross-format distance: {cross_format_decode:.2e}")
            log_print(f"SNR: {snr_decode:.1f}×")

            if snr_decode > 10:
                log_print("→ Quantization format is DETECTABLE (SNR > 10)")
            elif snr_decode > 3:
                log_print("→ Quantization format is MARGINALLY detectable (3 < SNR < 10)")
            else:
                log_print("→ Quantization format is NOT reliably detectable (SNR < 3)")

            results['snr_analysis'] = {
                'noise_floor': avg_noise_decode,
                'signal': cross_format_decode,
                'snr': snr_decode,
            }
        elif avg_noise_decode == 0 and cross_format_decode:
            # The ratio would divide by zero; say so instead of silently
            # omitting the analysis. No 'snr_analysis' entry is written.
            log_print(
                "\nMeasured noise floor is 0; SNR is undefined "
                "(division by zero), skipping SNR analysis."
            )

    filepath = os.path.join(output_dir, f"quant_generate_{timestamp}.json")
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    log_print(f"\n✓ Generation results saved to: {filepath}")
    log_print(f"\nNext step: Copy {filepath} to verifier machine")
    log_print("Then set TEACHER_FORCING = True and REFERENCE_FILE = '<path>'")

    file_size_mb = os.path.getsize(filepath) / (1024 * 1024)
    log_print(f"File size: {file_size_mb:.1f} MB")


def _run_verification_mode(system_info, config_names, output_dir, timestamp):
    """Teacher-force the reference generations with every config and save JSON.

    Args:
        system_info: Mapping returned by ``collect_system_info()`` for this
            (verifier) machine.
        config_names: Keys of ``MODEL_CONFIGS``; used both as the verifying
            config and as the claimed config looked up in the reference file.
        output_dir: Existing directory that receives ``quant_verify_<ts>.json``.
        timestamp: Timestamp string used in the output filename and metadata.

    Raises:
        KeyError: If the reference file lacks an expected key, e.g. a config
            name missing from ``reference['generations']``.
    """
    import torch

    log_print("Loading reference file...")
    # json.load already accepts the non-standard Infinity / -Infinity / NaN
    # tokens that json.dump emits, so no textual pre-processing is needed
    # (and string values containing "Infinity" are left intact).
    with open(REFERENCE_FILE, 'r', encoding='utf-8') as f:
        reference = json.load(f)

    ref_env = reference['metadata']['environment']
    ref_gpu = ref_env['gpu_name']
    log_print(f"Reference GPU: {ref_gpu}")
    log_print(f"Verifier GPU:  {system_info['gpu_name']}")

    env_validation = validate_environment_match(ref_env, system_info)

    # JSON stores the pairs as lists; convert them back to tuples.
    prefill_equiv_pairs = [
        tuple(p)
        for p in reference.get('prefill_sanity_check', {}).get('equivalent_pairs', [])
    ]
    decode_equiv_pairs = [
        tuple(p)
        for p in reference.get('decode_sanity_check', {}).get('equivalent_pairs', [])
    ]

    log_print("\nLoaded equivalent pairs from reference:")
    log_print(f"  Prefill: {prefill_equiv_pairs}")
    log_print(f"  Decode: {decode_equiv_pairs}")

    comparison_results = []

    for verify_cfg in config_names:
        log_print(f"\n{'='*80}")
        log_print(f"VERIFYING WITH: {verify_cfg}")
        log_print("="*80)

        llm, tokenizer = load_vllm_model(verify_cfg)

        for claimed_cfg in config_names:
            log_print(f"\n  Claimed config: {claimed_cfg}")
            # Diagonal = the verifier runs the same config that was claimed.
            is_diagonal = (claimed_cfg == verify_cfg)

            for ref_idx, gen_data in enumerate(reference['generations'][claimed_cfg]):
                log_print(f"    ref_{ref_idx} ({'diag' if is_diagonal else 'off'}):", end="")

                verify_result = run_teacher_forced_verification(
                    llm, tokenizer, gen_data, is_diagonal
                )

                prefill_distances = compare_signals(
                    gen_data['prefill_signals'],
                    verify_result['prefill_signals']
                )
                decode_distances = compare_signals(
                    gen_data['decode_signals'],
                    verify_result['decode_signals']
                )

                log_print(f"      Prefill: {prefill_distances['logprobs_mean']:.2e}, Decode: {decode_distances['logprobs_mean']:.2e}")

                comparison_results.append({
                    'ref_idx': ref_idx,
                    'claimed_config': claimed_cfg,
                    'verify_config': verify_cfg,
                    'is_diagonal': is_diagonal,
                    'prefill_distances': prefill_distances,
                    'decode_distances': decode_distances,
                })

        # Drop the engine before the next config is loaded.
        del llm
        torch.cuda.empty_cache()

    prefill_analysis = analyze_cross_hardware(
        comparison_results, config_names, signal_source='prefill',
        equivalent_pairs=prefill_equiv_pairs
    )
    decode_analysis = analyze_cross_hardware(
        comparison_results, config_names, signal_source='decode',
        equivalent_pairs=decode_equiv_pairs
    )

    log_print("\n" + "="*80)
    log_print("PREFILL vs DECODE COMPARISON")
    log_print("="*80)
    log_print(f"Prefill SNR (meaningful): {prefill_analysis['snr_meaningful']:.2f}×")
    log_print(f"Decode SNR (meaningful):  {decode_analysis['snr_meaningful']:.2f}×")

    results = {
        'metadata': {
            'reference_gpu': ref_gpu,
            'verifier_gpu': system_info['gpu_name'],
            'reference_file': REFERENCE_FILE,
            'reference_environment': ref_env,
            'verifier_environment': system_info,
            'environment_validation': env_validation,
            'model_configs': dict(MODEL_CONFIGS),
            'timestamp': timestamp,
            'prefill_equivalent_pairs': prefill_equiv_pairs,
            'decode_equivalent_pairs': decode_equiv_pairs,
        },
        'comparisons': comparison_results,
        'prefill_analysis': prefill_analysis,
        'decode_analysis': decode_analysis,
    }

    filepath = os.path.join(output_dir, f"quant_verify_{timestamp}.json")
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    log_print(f"\n✓ Results saved to: {filepath}")

# Run the experiment only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()



QUANTIZATION FORMAT DETECTION EXPERIMENT - GENERATION

System: 637c3157512e
GPU: NVIDIA A100-SXM4-80GB
vLLM: 0.11.2
PyTorch: 2.9.1+cu128
CUDA: 12.8

Configurations:
  awq: Qwen/Qwen3-8B-AWQ (quant=awq)
  awq_marlin: Qwen/Qwen3-8B-AWQ (quant=awq_marlin)
  gptq_marlin: AlphaGaO/Qwen3-8B-GPTQ (quant=gptq_marlin)

Loading first model to create prompts...
Loading model: Qwen/Qwen3-8B-AWQ
  Quantization: awq
  Dtype: float16
INFO 11-30 09:24:50 [utils.py:253] non-default args: {'trust_remote_code': True, 'download_dir': '/workspace/huggingface_cache', 'dtype': 'float16', 'gpu_memory_utilization': 0.7, 'disable_log_stats': True, 'quantization': 'awq', 'model': 'Qwen/Qwen3-8B-AWQ'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 11-30 09:24:50 [model.py:631] Resolved architecture: Qwen3ForCausalLM
INFO 11-30 09:24:50 [model.py:1745] Using max model len 40960
INFO 11-30 09:24:51 [awq_marlin.py:166] Detected that the model can run with awq_marlin, however you specified quantization=awq explicitly, so forcing awq. Use quantization=awq_marlin for faster inference
INFO 11-30 09:24:51 [scheduler.py:216] Chunked prefill is enabled with max_num_batched_tokens=8192.


Parse safetensors files:   0%|          | 0/2 [00:00<?, ?it/s]





[1;36m(EngineCore_DP0 pid=5273)[0;0m INFO 11-30 09:24:58 [core.py:93] Initializing a V1 LLM engine (v0.11.2) with config: model='Qwen/Qwen3-8B-AWQ', speculative_config=None, tokenizer='Qwen/Qwen3-8B-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=40960, download_dir='/workspace/huggingface_cache', load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_n

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:03<00:03,  3.66s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:04<00:00,  1.85s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:04<00:00,  2.12s/it]
[1;36m(EngineCore_DP0 pid=5273)[0;0m 


[1;36m(EngineCore_DP0 pid=5273)[0;0m INFO 11-30 09:25:03 [default_loader.py:314] Loading weights took 4.29 seconds
[1;36m(EngineCore_DP0 pid=5273)[0;0m INFO 11-30 09:25:04 [gpu_model_runner.py:3338] Model loading took 5.7086 GiB memory and 4.842724 seconds
[1;36m(EngineCore_DP0 pid=5273)[0;0m INFO 11-30 09:25:11 [backends.py:631] Using cache directory: /root/.cache/vllm/torch_compile_cache/951c718543/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_DP0 pid=5273)[0;0m INFO 11-30 09:25:11 [backends.py:647] Dynamo bytecode transform time: 6.64 s
[1;36m(EngineCore_DP0 pid=5273)[0;0m INFO 11-30 09:25:18 [backends.py:210] Directly load the compiled graph(s) for dynamic shape from the cache, took 6.676 s
[1;36m(EngineCore_DP0 pid=5273)[0;0m INFO 11-30 09:25:19 [monitor.py:34] torch.compile takes 13.31 s in total
[1;36m(EngineCore_DP0 pid=5273)[0;0m INFO 11-30 09:25:21 [gpu_worker.py:359] Available KV cache memory: 48.28 GiB
[1;36m(EngineCore_DP0 pid=5273)[0;0m INFO

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:04<00:00, 12.39it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:02<00:00, 13.58it/s]


[1;36m(EngineCore_DP0 pid=5273)[0;0m INFO 11-30 09:25:28 [gpu_model_runner.py:4244] Graph capturing finished in 7 secs, took 0.95 GiB
[1;36m(EngineCore_DP0 pid=5273)[0;0m INFO 11-30 09:25:28 [core.py:250] init engine (profile, create kv cache, warmup model) took 24.35 seconds
INFO 11-30 09:25:29 [llm.py:352] Supported tasks: ['generate']
Found 1 PDF(s)
  Loading: /workspace/Verification-for-International-AI-Governance.pdf
Total source tokens: 120215
Prompt structure: 33 prefix + 8000 snippet + 34 suffix = 8067 tokens
Created 4 prompts






REPRODUCIBILITY CHECK (NOISE FLOOR MEASUREMENT)
Running 3 identical inference passes per config
Measures within-format variance from atomics/non-deterministic kernels.


--- Checking: awq ---
Loading model: Qwen/Qwen3-8B-AWQ
  Quantization: awq
  Dtype: float16
INFO 11-30 09:25:36 [utils.py:253] non-default args: {'trust_remote_code': True, 'download_dir': '/workspace/huggingface_cache', 'dtype': 'float16', 'gpu_memory_utilization': 0.7, 'disable_log_stats': True, 'quantization': 'awq', 'model': 'Qwen/Qwen3-8B-AWQ'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 11-30 09:25:36 [model.py:631] Resolved architecture: Qwen3ForCausalLM
INFO 11-30 09:25:36 [model.py:1745] Using max model len 40960
INFO 11-30 09:25:36 [awq_marlin.py:166] Detected that the model can run with awq_marlin, however you specified quantization=awq explicitly, so forcing awq. Use quantization=awq_marlin for faster inference
INFO 11-30 09:25:36 [scheduler.py:216] Chunked prefill is enabled with max_num_batched_tokens=8192.


Parse safetensors files:   0%|          | 0/2 [00:00<?, ?it/s]



[1;36m(EngineCore_DP0 pid=5563)[0;0m INFO 11-30 09:25:42 [core.py:93] Initializing a V1 LLM engine (v0.11.2) with config: model='Qwen/Qwen3-8B-AWQ', speculative_config=None, tokenizer='Qwen/Qwen3-8B-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=40960, download_dir='/workspace/huggingface_cache', load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_n

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:03<00:03,  3.70s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:04<00:00,  1.88s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:04<00:00,  2.15s/it]
[1;36m(EngineCore_DP0 pid=5563)[0;0m 


[1;36m(EngineCore_DP0 pid=5563)[0;0m INFO 11-30 09:25:48 [default_loader.py:314] Loading weights took 4.34 seconds
[1;36m(EngineCore_DP0 pid=5563)[0;0m INFO 11-30 09:25:48 [gpu_model_runner.py:3338] Model loading took 5.7086 GiB memory and 4.860261 seconds
[1;36m(EngineCore_DP0 pid=5563)[0;0m INFO 11-30 09:25:55 [backends.py:631] Using cache directory: /root/.cache/vllm/torch_compile_cache/951c718543/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_DP0 pid=5563)[0;0m INFO 11-30 09:25:55 [backends.py:647] Dynamo bytecode transform time: 6.72 s
[1;36m(EngineCore_DP0 pid=5563)[0;0m INFO 11-30 09:26:03 [backends.py:210] Directly load the compiled graph(s) for dynamic shape from the cache, took 6.789 s
[1;36m(EngineCore_DP0 pid=5563)[0;0m INFO 11-30 09:26:04 [monitor.py:34] torch.compile takes 13.51 s in total
[1;36m(EngineCore_DP0 pid=5563)[0;0m INFO 11-30 09:26:05 [gpu_worker.py:359] Available KV cache memory: 48.28 GiB
[1;36m(EngineCore_DP0 pid=5563)[0;0m INFO

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:04<00:00, 12.38it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:02<00:00, 13.57it/s]


[1;36m(EngineCore_DP0 pid=5563)[0;0m INFO 11-30 09:26:13 [gpu_model_runner.py:4244] Graph capturing finished in 7 secs, took 0.95 GiB
[1;36m(EngineCore_DP0 pid=5563)[0;0m INFO 11-30 09:26:13 [core.py:250] init engine (profile, create kv cache, warmup model) took 24.60 seconds
INFO 11-30 09:26:14 [llm.py:352] Supported tasks: ['generate']
  Run 1: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

100 tokens
  Run 2: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

100 tokens
  Run 3: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

100 tokens

  Pairwise distances:
    Run 1 vs 2: decode=0.00e+00, prefill=0.00e+00
    Run 1 vs 3: decode=0.00e+00, prefill=0.00e+00
    Run 2 vs 3: decode=0.00e+00, prefill=0.00e+00

  awq noise floor: DETERMINISTIC (decode=0.00e+00)





--- Checking: awq_marlin ---
Loading model: Qwen/Qwen3-8B-AWQ
  Quantization: awq_marlin
  Dtype: float16
INFO 11-30 09:26:32 [utils.py:253] non-default args: {'trust_remote_code': True, 'download_dir': '/workspace/huggingface_cache', 'dtype': 'float16', 'gpu_memory_utilization': 0.7, 'disable_log_stats': True, 'quantization': 'awq_marlin', 'model': 'Qwen/Qwen3-8B-AWQ'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 11-30 09:26:32 [model.py:631] Resolved architecture: Qwen3ForCausalLM
INFO 11-30 09:26:33 [model.py:1745] Using max model len 40960
INFO 11-30 09:26:33 [awq_marlin.py:162] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 11-30 09:26:33 [scheduler.py:216] Chunked prefill is enabled with max_num_batched_tokens=8192.


Parse safetensors files:   0%|          | 0/2 [00:00<?, ?it/s]



[1;36m(EngineCore_DP0 pid=5843)[0;0m INFO 11-30 09:26:39 [core.py:93] Initializing a V1 LLM engine (v0.11.2) with config: model='Qwen/Qwen3-8B-AWQ', speculative_config=None, tokenizer='Qwen/Qwen3-8B-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=40960, download_dir='/workspace/huggingface_cache', load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:03<00:03,  3.46s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:04<00:00,  1.81s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:04<00:00,  2.06s/it]
[1;36m(EngineCore_DP0 pid=5843)[0;0m 


[1;36m(EngineCore_DP0 pid=5843)[0;0m INFO 11-30 09:26:44 [default_loader.py:314] Loading weights took 4.16 seconds
[1;36m(EngineCore_DP0 pid=5843)[0;0m INFO 11-30 09:26:45 [gpu_model_runner.py:3338] Model loading took 5.7087 GiB memory and 5.078252 seconds
[1;36m(EngineCore_DP0 pid=5843)[0;0m INFO 11-30 09:26:52 [backends.py:631] Using cache directory: /root/.cache/vllm/torch_compile_cache/b8c5808a6c/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_DP0 pid=5843)[0;0m INFO 11-30 09:26:52 [backends.py:647] Dynamo bytecode transform time: 6.92 s
[1;36m(EngineCore_DP0 pid=5843)[0;0m INFO 11-30 09:26:58 [backends.py:210] Directly load the compiled graph(s) for dynamic shape from the cache, took 5.383 s
[1;36m(EngineCore_DP0 pid=5843)[0;0m INFO 11-30 09:27:00 [monitor.py:34] torch.compile takes 12.31 s in total
[1;36m(EngineCore_DP0 pid=5843)[0;0m INFO 11-30 09:27:01 [gpu_worker.py:359] Available KV cache memory: 48.28 GiB
[1;36m(EngineCore_DP0 pid=5843)[0;0m INFO

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:02<00:00, 21.21it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:01<00:00, 24.86it/s]


[1;36m(EngineCore_DP0 pid=5843)[0;0m INFO 11-30 09:27:06 [gpu_model_runner.py:4244] Graph capturing finished in 4 secs, took 0.66 GiB
[1;36m(EngineCore_DP0 pid=5843)[0;0m INFO 11-30 09:27:06 [core.py:250] init engine (profile, create kv cache, warmup model) took 20.93 seconds
INFO 11-30 09:27:07 [llm.py:352] Supported tasks: ['generate']
  Run 1: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

100 tokens
  Run 2: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

100 tokens
  Run 3: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

100 tokens

  Pairwise distances:
    Run 1 vs 2: decode=0.00e+00, prefill=0.00e+00
    Run 1 vs 3: decode=0.00e+00, prefill=0.00e+00
    Run 2 vs 3: decode=0.00e+00, prefill=0.00e+00

  awq_marlin noise floor: DETERMINISTIC (decode=0.00e+00)





--- Checking: gptq_marlin ---
Loading model: AlphaGaO/Qwen3-8B-GPTQ
  Quantization: gptq_marlin
  Dtype: float16
INFO 11-30 09:27:13 [utils.py:253] non-default args: {'trust_remote_code': True, 'download_dir': '/workspace/huggingface_cache', 'dtype': 'float16', 'gpu_memory_utilization': 0.7, 'disable_log_stats': True, 'quantization': 'gptq_marlin', 'model': 'AlphaGaO/Qwen3-8B-GPTQ'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


config.json: 0.00B [00:00, ?B/s]

INFO 11-30 09:27:13 [model.py:631] Resolved architecture: Qwen3ForCausalLM
INFO 11-30 09:27:13 [model.py:1745] Using max model len 40960
INFO 11-30 09:27:13 [gptq_marlin.py:228] The model is convertible to gptq_marlin during runtime. Using gptq_marlin kernel.
INFO 11-30 09:27:13 [scheduler.py:216] Chunked prefill is enabled with max_num_batched_tokens=8192.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Parse safetensors files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]



[1;36m(EngineCore_DP0 pid=6140)[0;0m INFO 11-30 09:27:20 [core.py:93] Initializing a V1 LLM engine (v0.11.2) with config: model='AlphaGaO/Qwen3-8B-GPTQ', speculative_config=None, tokenizer='AlphaGaO/Qwen3-8B-GPTQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=40960, download_dir='/workspace/huggingface_cache', load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=gptq_marlin, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.26it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.37it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.35it/s]
[1;36m(EngineCore_DP0 pid=6140)[0;0m 


[1;36m(EngineCore_DP0 pid=6140)[0;0m INFO 11-30 09:27:32 [default_loader.py:314] Loading weights took 1.56 seconds
[1;36m(EngineCore_DP0 pid=6140)[0;0m INFO 11-30 09:27:33 [gpu_model_runner.py:3338] Model loading took 5.6900 GiB memory and 10.996772 seconds
[1;36m(EngineCore_DP0 pid=6140)[0;0m INFO 11-30 09:27:40 [backends.py:631] Using cache directory: /root/.cache/vllm/torch_compile_cache/95bc7cde1c/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_DP0 pid=6140)[0;0m INFO 11-30 09:27:40 [backends.py:647] Dynamo bytecode transform time: 7.29 s
[1;36m(EngineCore_DP0 pid=6140)[0;0m INFO 11-30 09:27:42 [backends.py:251] Cache the graph for dynamic shape for later use
[1;36m(EngineCore_DP0 pid=6140)[0;0m INFO 11-30 09:27:51 [backends.py:282] Compiling a graph for dynamic shape takes 9.83 s
[1;36m(EngineCore_DP0 pid=6140)[0;0m INFO 11-30 09:27:53 [monitor.py:34] torch.compile takes 17.12 s in total
[1;36m(EngineCore_DP0 pid=6140)[0;0m INFO 11-30 09:27:55 [gpu_wor

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:02<00:00, 21.16it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:01<00:00, 25.26it/s]


[1;36m(EngineCore_DP0 pid=6140)[0;0m INFO 11-30 09:27:59 [gpu_model_runner.py:4244] Graph capturing finished in 5 secs, took 0.76 GiB
[1;36m(EngineCore_DP0 pid=6140)[0;0m INFO 11-30 09:27:59 [core.py:250] init engine (profile, create kv cache, warmup model) took 26.80 seconds
INFO 11-30 09:28:00 [llm.py:352] Supported tasks: ['generate']
  Run 1: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

100 tokens
  Run 2: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

100 tokens
  Run 3: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

100 tokens

  Pairwise distances:
    Run 1 vs 2: decode=0.00e+00, prefill=0.00e+00
    Run 1 vs 3: decode=0.00e+00, prefill=0.00e+00
    Run 2 vs 3: decode=0.00e+00, prefill=0.00e+00

  gptq_marlin noise floor: DETERMINISTIC (decode=0.00e+00)





----------------------------------------
NOISE FLOOR SUMMARY
----------------------------------------
  awq: decode=0.00e+00, prefill=0.00e+00
  awq_marlin: decode=0.00e+00, prefill=0.00e+00
  gptq_marlin: decode=0.00e+00, prefill=0.00e+00

Cross-format signal must exceed this noise floor to be detectable.

GENERATION MODE

--- Config: awq ---
Loading model: Qwen/Qwen3-8B-AWQ
  Quantization: awq
  Dtype: float16
INFO 11-30 09:28:06 [utils.py:253] non-default args: {'trust_remote_code': True, 'download_dir': '/workspace/huggingface_cache', 'dtype': 'float16', 'gpu_memory_utilization': 0.7, 'disable_log_stats': True, 'quantization': 'awq', 'model': 'Qwen/Qwen3-8B-AWQ'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 11-30 09:28:06 [model.py:631] Resolved architecture: Qwen3ForCausalLM
INFO 11-30 09:28:06 [model.py:1745] Using max model len 40960
INFO 11-30 09:28:06 [awq_marlin.py:166] Detected that the model can run with awq_marlin, however you specified quantization=awq explicitly, so forcing awq. Use quantization=awq_marlin for faster inference
INFO 11-30 09:28:06 [scheduler.py:216] Chunked prefill is enabled with max_num_batched_tokens=8192.


Parse safetensors files:   0%|          | 0/2 [00:00<?, ?it/s]



[1;36m(EngineCore_DP0 pid=6453)[0;0m INFO 11-30 09:28:13 [core.py:93] Initializing a V1 LLM engine (v0.11.2) with config: model='Qwen/Qwen3-8B-AWQ', speculative_config=None, tokenizer='Qwen/Qwen3-8B-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=40960, download_dir='/workspace/huggingface_cache', load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_n

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:03<00:03,  3.25s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:03<00:00,  1.68s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:03<00:00,  1.91s/it]
[1;36m(EngineCore_DP0 pid=6453)[0;0m 


[1;36m(EngineCore_DP0 pid=6453)[0;0m INFO 11-30 09:28:18 [default_loader.py:314] Loading weights took 3.87 seconds
[1;36m(EngineCore_DP0 pid=6453)[0;0m INFO 11-30 09:28:18 [gpu_model_runner.py:3338] Model loading took 5.7086 GiB memory and 4.392062 seconds
[1;36m(EngineCore_DP0 pid=6453)[0;0m INFO 11-30 09:28:25 [backends.py:631] Using cache directory: /root/.cache/vllm/torch_compile_cache/951c718543/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_DP0 pid=6453)[0;0m INFO 11-30 09:28:25 [backends.py:647] Dynamo bytecode transform time: 6.64 s
[1;36m(EngineCore_DP0 pid=6453)[0;0m INFO 11-30 09:28:32 [backends.py:210] Directly load the compiled graph(s) for dynamic shape from the cache, took 6.801 s
[1;36m(EngineCore_DP0 pid=6453)[0;0m INFO 11-30 09:28:34 [monitor.py:34] torch.compile takes 13.44 s in total
[1;36m(EngineCore_DP0 pid=6453)[0;0m INFO 11-30 09:28:35 [gpu_worker.py:359] Available KV cache memory: 48.28 GiB
[1;36m(EngineCore_DP0 pid=6453)[0;0m INFO

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:04<00:00, 12.40it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:02<00:00, 13.55it/s]


[1;36m(EngineCore_DP0 pid=6453)[0;0m INFO 11-30 09:28:43 [gpu_model_runner.py:4244] Graph capturing finished in 7 secs, took 0.95 GiB
[1;36m(EngineCore_DP0 pid=6453)[0;0m INFO 11-30 09:28:43 [core.py:250] init engine (profile, create kv cache, warmup model) took 24.48 seconds
INFO 11-30 09:28:44 [llm.py:352] Supported tasks: ['generate']
  ref_0: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

100 tokens
    -> <think>
Okay, let me try to figure out what kind of document this is and what it's...
  ref_1: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

100 tokens
    -> <think>
Okay, let me try to figure out what kind of document this is and what it's...
  ref_2: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

100 tokens
    -> <think>
Okay, let me try to figure out what kind of document this is and what it's...
  ref_3: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

100 tokens
    -> <think>
Okay, let's tackle this query. The user provided a lengthy excerpt from a document and...





--- Config: awq_marlin ---
Loading model: Qwen/Qwen3-8B-AWQ
  Quantization: awq_marlin
  Dtype: float16
INFO 11-30 09:29:08 [utils.py:253] non-default args: {'trust_remote_code': True, 'download_dir': '/workspace/huggingface_cache', 'dtype': 'float16', 'gpu_memory_utilization': 0.7, 'disable_log_stats': True, 'quantization': 'awq_marlin', 'model': 'Qwen/Qwen3-8B-AWQ'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 11-30 09:29:08 [model.py:631] Resolved architecture: Qwen3ForCausalLM
INFO 11-30 09:29:08 [model.py:1745] Using max model len 40960
INFO 11-30 09:29:08 [awq_marlin.py:162] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 11-30 09:29:08 [scheduler.py:216] Chunked prefill is enabled with max_num_batched_tokens=8192.


Parse safetensors files:   0%|          | 0/2 [00:00<?, ?it/s]



[1;36m(EngineCore_DP0 pid=6732)[0;0m INFO 11-30 09:29:14 [core.py:93] Initializing a V1 LLM engine (v0.11.2) with config: model='Qwen/Qwen3-8B-AWQ', speculative_config=None, tokenizer='Qwen/Qwen3-8B-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=40960, download_dir='/workspace/huggingface_cache', load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:03<00:03,  3.39s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:03<00:00,  1.73s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:03<00:00,  1.98s/it]
[1;36m(EngineCore_DP0 pid=6732)[0;0m 


[1;36m(EngineCore_DP0 pid=6732)[0;0m INFO 11-30 09:29:20 [default_loader.py:314] Loading weights took 4.01 seconds
[1;36m(EngineCore_DP0 pid=6732)[0;0m INFO 11-30 09:29:21 [gpu_model_runner.py:3338] Model loading took 5.7087 GiB memory and 4.940562 seconds
[1;36m(EngineCore_DP0 pid=6732)[0;0m INFO 11-30 09:29:28 [backends.py:631] Using cache directory: /root/.cache/vllm/torch_compile_cache/b8c5808a6c/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_DP0 pid=6732)[0;0m INFO 11-30 09:29:28 [backends.py:647] Dynamo bytecode transform time: 7.00 s
[1;36m(EngineCore_DP0 pid=6732)[0;0m INFO 11-30 09:29:34 [backends.py:210] Directly load the compiled graph(s) for dynamic shape from the cache, took 5.545 s
[1;36m(EngineCore_DP0 pid=6732)[0;0m INFO 11-30 09:29:35 [monitor.py:34] torch.compile takes 12.54 s in total
[1;36m(EngineCore_DP0 pid=6732)[0;0m INFO 11-30 09:29:37 [gpu_worker.py:359] Available KV cache memory: 48.28 GiB
[1;36m(EngineCore_DP0 pid=6732)[0;0m INFO

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:02<00:00, 20.26it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:01<00:00, 23.98it/s]


[1;36m(EngineCore_DP0 pid=6732)[0;0m INFO 11-30 09:29:42 [gpu_model_runner.py:4244] Graph capturing finished in 5 secs, took 0.66 GiB
[1;36m(EngineCore_DP0 pid=6732)[0;0m INFO 11-30 09:29:42 [core.py:250] init engine (profile, create kv cache, warmup model) took 21.46 seconds
INFO 11-30 09:29:43 [llm.py:352] Supported tasks: ['generate']
  ref_0: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

100 tokens
    -> <think>
Okay, let me try to figure out what kind of document this is and what it's...
  ref_1: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

100 tokens
    -> <think>
Okay, let me try to figure out what kind of document this is and what it's...
  ref_2: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

100 tokens
    -> <think>
Okay, let me try to figure out what kind of document this is and what it's...
  ref_3: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

100 tokens
    -> <think>
Okay, let's tackle this query. The user provided a lengthy excerpt from a document and...





--- Config: gptq_marlin ---
Loading model: AlphaGaO/Qwen3-8B-GPTQ
  Quantization: gptq_marlin
  Dtype: float16
INFO 11-30 09:29:50 [utils.py:253] non-default args: {'trust_remote_code': True, 'download_dir': '/workspace/huggingface_cache', 'dtype': 'float16', 'gpu_memory_utilization': 0.7, 'disable_log_stats': True, 'quantization': 'gptq_marlin', 'model': 'AlphaGaO/Qwen3-8B-GPTQ'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 11-30 09:29:50 [model.py:631] Resolved architecture: Qwen3ForCausalLM
INFO 11-30 09:29:50 [model.py:1745] Using max model len 40960
INFO 11-30 09:29:50 [gptq_marlin.py:228] The model is convertible to gptq_marlin during runtime. Using gptq_marlin kernel.
INFO 11-30 09:29:50 [scheduler.py:216] Chunked prefill is enabled with max_num_batched_tokens=8192.


Parse safetensors files:   0%|          | 0/2 [00:00<?, ?it/s]



[1;36m(EngineCore_DP0 pid=7005)[0;0m INFO 11-30 09:29:56 [core.py:93] Initializing a V1 LLM engine (v0.11.2) with config: model='AlphaGaO/Qwen3-8B-GPTQ', speculative_config=None, tokenizer='AlphaGaO/Qwen3-8B-GPTQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=40960, download_dir='/workspace/huggingface_cache', load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=gptq_marlin, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.28it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.26it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.26it/s]
[1;36m(EngineCore_DP0 pid=7005)[0;0m 


[1;36m(EngineCore_DP0 pid=7005)[0;0m INFO 11-30 09:30:00 [default_loader.py:314] Loading weights took 1.76 seconds
[1;36m(EngineCore_DP0 pid=7005)[0;0m INFO 11-30 09:30:00 [gpu_model_runner.py:3338] Model loading took 5.6900 GiB memory and 2.412623 seconds
[1;36m(EngineCore_DP0 pid=7005)[0;0m INFO 11-30 09:30:08 [backends.py:631] Using cache directory: /root/.cache/vllm/torch_compile_cache/95bc7cde1c/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_DP0 pid=7005)[0;0m INFO 11-30 09:30:08 [backends.py:647] Dynamo bytecode transform time: 7.39 s
[1;36m(EngineCore_DP0 pid=7005)[0;0m INFO 11-30 09:30:14 [backends.py:210] Directly load the compiled graph(s) for dynamic shape from the cache, took 5.463 s
[1;36m(EngineCore_DP0 pid=7005)[0;0m INFO 11-30 09:30:16 [monitor.py:34] torch.compile takes 12.85 s in total
[1;36m(EngineCore_DP0 pid=7005)[0;0m INFO 11-30 09:30:17 [gpu_worker.py:359] Available KV cache memory: 48.30 GiB
[1;36m(EngineCore_DP0 pid=7005)[0;0m INFO

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 51/51 [00:02<00:00, 20.23it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████| 35/35 [00:01<00:00, 23.80it/s]


[1;36m(EngineCore_DP0 pid=7005)[0;0m INFO 11-30 09:30:22 [gpu_model_runner.py:4244] Graph capturing finished in 5 secs, took 0.76 GiB
[1;36m(EngineCore_DP0 pid=7005)[0;0m INFO 11-30 09:30:22 [core.py:250] init engine (profile, create kv cache, warmup model) took 22.01 seconds
INFO 11-30 09:30:23 [llm.py:352] Supported tasks: ['generate']
  ref_0: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

100 tokens
    -> <think>
Okay, let's see. The user provided an excerpt from a document and wants to know...
  ref_1: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

100 tokens
    -> <think>
Okay, let's see. The user provided an excerpt from a document and wants to know...
  ref_2: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

100 tokens
    -> <think>
Okay, let me try to figure out what kind of document this is and what it's...
  ref_3: 

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

100 tokens
    -> <think>
Okay, let's see. The user provided an excerpt from a document and wants to know...





TOKEN GENERATION CONSISTENCY CHECK

--- ref_0 ---
  awq:
    Tokens: 100
    First 30: "<think>\nOkay, let me try to figure out what kind of document this is and what it's about. The user provided an excerpt from a document"...
    ✓
  awq_marlin:
    Tokens: 100
    First 30: "<think>\nOkay, let me try to figure out what kind of document this is and what it's about. The user provided an excerpt from a document"...
    ✓
  gptq_marlin:
    Tokens: 100
    First 30: "<think>\nOkay, let's see. The user provided an excerpt from a document and wants to know what type of document it is and its subject matter"...
    ✗ DIFFERENT

--- ref_1 ---
  awq:
    Tokens: 100
    First 30: "<think>\nOkay, let me try to figure out what kind of document this is and what it's about. The user provided a long excerpt from a"...
    ✓
  awq_marlin:
    Tokens: 100
    First 30: "<think>\nOkay, let me try to figure out what kind of document this is and what it's about. The user provided a long excerpt from 