In [None]:
#!/usr/bin/env python3
"""
vLLM Logprobs Extraction Test - DeepSeek-Coder-V2-Lite-Instruct
Tests bit-exact reproducibility across multiple runs
Automatically finds txt/pdf files in current directory
Uses standard OpenAI-compatible message format
"""

# ============================================================================
# SUPPRESS VERBOSE LOGGING
# ============================================================================
import os

# Point the Hugging Face caches at /tmp and set the vLLM / tokenizers
# verbosity switches in one place.
os.environ.update({
    'HF_HOME': '/tmp/hf_cache',
    'TRANSFORMERS_CACHE': '/tmp/hf_cache',
    # Suppress vLLM verbose output
    'VLLM_LOGGING_LEVEL': 'WARNING',
    'VLLM_CONFIGURE_LOGGING': '0',
    'TOKENIZERS_PARALLELISM': 'false',
})

import warnings
warnings.filterwarnings('ignore')

import logging

# Loggers that should only report errors (vLLM and related libraries).
for _quiet_logger in (
    'vllm',
    'vllm.engine',
    'vllm.worker',
    'vllm.executor',
    'transformers',
    'torch',
):
    logging.getLogger(_quiet_logger).setLevel(logging.ERROR)

# Keep download progress - only allow INFO for huggingface_hub downloads
for _download_logger in ('huggingface_hub', 'huggingface_hub.file_download'):
    logging.getLogger(_download_logger).setLevel(logging.INFO)

# ============================================================================
# IMPORTS
# ============================================================================

from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
import numpy as np
from datetime import datetime
import json
import torch
import glob

# ============================================================================
# CONFIGURATION
# ============================================================================

# --- Model / engine settings -------------------------------------------------
MODEL_NAME = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
TENSOR_PARALLEL_SIZE = 2
MAX_MODEL_LEN = 128_000
GPU_MEMORY_UTILIZATION = 0.9

# --- Generation settings -----------------------------------------------------
MAX_TOKENS = 40
NUM_REPETITIONS = 20
TEMPERATURE = 0.0  # Greedy decoding
SEED = 42
TOP_LOGPROBS = 10

# --- Prompt source -----------------------------------------------------------
# When True, the first txt or pdf found in the current directory is used;
# set to False to use the hardcoded content below instead.
AUTO_FIND_FILE = True

# Instruction placed in front of the document text.
USER_TASK = "Please provide a detailed summary of the following text."

# Fallback document (used if AUTO_FIND_FILE=False or no files found)
HARDCODED_CONTENT = (
    "The development of large language models has fundamentally transformed "
    "natural language processing and artificial intelligence more broadly. "
    "These models, trained on vast corpora of text data, have demonstrated "
    "remarkable capabilities across a wide range of tasks, from translation "
    "and summarization to question answering and creative writing."
)

# ============================================================================
# FILE LOADING UTILITIES
# ============================================================================

def find_prompt_file():
    """Return the path of the prompt document in the current directory.

    ``*.txt`` files take priority over ``*.pdf`` files. Within one extension
    the alphabetically first path is chosen, so the same directory contents
    always select the same file (``glob`` itself returns matches in
    arbitrary, filesystem-dependent order).

    Returns:
        The absolute path of the selected file, or ``None`` when the current
        directory contains no regular ``.txt`` or ``.pdf`` file.
    """
    # Escape the directory part so characters such as '[' in the cwd are not
    # interpreted as glob wildcards.
    base_dir = glob.escape(os.getcwd())

    # Look for txt files first, then pdf
    for pattern in ("*.txt", "*.pdf"):
        candidates = sorted(
            path
            for path in glob.glob(os.path.join(base_dir, pattern))
            if os.path.isfile(path)  # skip directories that merely match the pattern
        )
        if candidates:
            return candidates[0]

    return None

def load_text_from_file(filepath):
    """Load the text content of a ``.txt`` or ``.pdf`` file.

    The extension is matched case-insensitively, so ``REPORT.PDF`` is handled
    like ``report.pdf``.

    Args:
        filepath: Path to the file to read.

    Returns:
        The file's text. For PDFs, the extracted text of every page joined
        with newlines; pages that yield no text contribute an empty string.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        ImportError: If a PDF is requested but PyPDF2 is not installed.
        ValueError: If the extension is neither ``.txt`` nor ``.pdf``.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: {filepath}")

    lowered_path = filepath.lower()

    if lowered_path.endswith('.txt'):
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
        print(f"Loaded {len(text)} characters from txt file")
        return text

    if lowered_path.endswith('.pdf'):
        try:
            import PyPDF2
        except ImportError as err:
            raise ImportError(
                "PyPDF2 required for PDF loading. Install with: pip install PyPDF2"
            ) from err

        page_texts = []
        with open(filepath, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            num_pages = len(pdf_reader.pages)
            print(f"Loading {num_pages} pages from PDF...")

            for page_num, page in enumerate(pdf_reader.pages, 1):
                # Guard against a page yielding None (e.g. image-only pages in
                # some PyPDF2 releases), which would break the join below.
                page_texts.append(page.extract_text() or "")
                if page_num % 10 == 0:
                    print(f"  Processed {page_num}/{num_pages} pages")

        full_text = '\n'.join(page_texts)
        print(f"Loaded {len(full_text)} characters from PDF ({num_pages} pages)")
        return full_text

    raise ValueError(f"Unsupported file type: {filepath}. Use .txt or .pdf")

# ============================================================================
# PROMPT LOADING
# ============================================================================

_banner = "=" * 80
print(_banner)
print("vLLM LOGPROBS EXTRACTION TEST")
print(_banner)
print()

# Pick the document: an auto-discovered file when enabled and available,
# otherwise the hardcoded fallback text.
prompt_file = find_prompt_file() if AUTO_FIND_FILE else None

if prompt_file:
    print(f"Found file: {os.path.basename(prompt_file)}")
    DOCUMENT_CONTENT = load_text_from_file(prompt_file)
else:
    # Only mention the failed search when a search was actually attempted.
    if AUTO_FIND_FILE:
        print("No txt/pdf files found in current directory")
    print("Using hardcoded content")
    DOCUMENT_CONTENT = HARDCODED_CONTENT
print()

# Single user turn in standard OpenAI message format: task, blank line, document.
# The chat template is applied to this list later to build the string prompt.
user_turn = {"role": "user", "content": f"{USER_TASK}\n\n{DOCUMENT_CONTENT}"}
messages = [user_turn]

print("Using standard OpenAI-compatible message format")
print(f"Message content length: {len(messages[0]['content'])} characters")
print("Note: Chat template will be applied to convert messages to string prompt")
print()

# ============================================================================
# TOKENIZER PRECHECK
# ============================================================================

# Load only the tokenizer first so an over-long prompt is rejected before the
# (much more expensive) model load is attempted.
print("Loading tokenizer to validate prompt length...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    cache_dir='/tmp/hf_cache',
    trust_remote_code=True
)

# Apply chat template to convert messages to string prompt
# This is the actual prompt that will be used for generation
prompt_text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

print("Chat template applied successfully")
print()

# Tokenize prompt
# NOTE(review): encode() uses the tokenizer's default special-token handling;
# if the chat template already emits a BOS token the count may be off by one
# — confirm against the tokenizer configuration.
prompt_tokens = tokenizer.encode(prompt_text)
prompt_length = len(prompt_tokens)
total_required = prompt_length + MAX_TOKENS

print("Prompt statistics:")
print(f"  Characters: {len(prompt_text):,}")
print(f"  Tokens: {prompt_length:,}")
print(f"  Max model length: {MAX_MODEL_LEN:,}")
print(f"  Generation tokens: {MAX_TOKENS}")
print(f"  Total required: {total_required:,}")
print()

# Validate length
if prompt_length > MAX_MODEL_LEN:
    print("❌ ERROR: Prompt is too long!")
    print(f"  Prompt has {prompt_length:,} tokens")
    print(f"  Model max is {MAX_MODEL_LEN:,} tokens")
    print(f"  Exceeds by {prompt_length - MAX_MODEL_LEN:,} tokens")
    print()
    print("Solutions:")
    print(f"  1. Increase MAX_MODEL_LEN to at least {total_required}")
    print("  2. Truncate/reduce the prompt")
    # Raise SystemExit directly instead of calling the interactive `exit`
    # helper: the helper is injected by `site`/IPython, is not guaranteed to
    # exist, and inside a notebook kernel does not reliably abort the cell,
    # which would let the model load proceed with an invalid prompt.
    raise SystemExit(1)

if total_required > MAX_MODEL_LEN:
    # The prompt itself fits, but there is not room for all MAX_TOKENS.
    print("⚠️  WARNING: Prompt + generation may exceed context")
    print(f"  Prompt: {prompt_length:,} tokens")
    print(f"  Generation: {MAX_TOKENS} tokens")
    print(f"  Total: {total_required:,} tokens")
    print(f"  Model max: {MAX_MODEL_LEN:,} tokens")
    print(f"  Consider increasing MAX_MODEL_LEN to {total_required + 100}")
    print()
else:
    print("✓ Prompt length validation passed")
    print(f"  Remaining capacity: {MAX_MODEL_LEN - total_required:,} tokens")
    print()

# ============================================================================
# MODEL LOADING
# ============================================================================

# Echo the effective run configuration before the slow model load starts.
print("Configuration:")
for _config_line in (
    f"  Model: {MODEL_NAME}",
    f"  Tensor parallel: {TENSOR_PARALLEL_SIZE}",
    f"  Max model len: {MAX_MODEL_LEN:,}",
    f"  Max tokens: {MAX_TOKENS}",
    f"  Temperature: {TEMPERATURE}",
    f"  Seed: {SEED}",
    f"  Repetitions: {NUM_REPETITIONS}",
):
    print(_config_line)
print()

print("Loading model...")
print()

# Engine options are collected first so they can be read at a glance.
engine_options = dict(
    model=MODEL_NAME,
    tensor_parallel_size=TENSOR_PARALLEL_SIZE,
    max_model_len=MAX_MODEL_LEN,
    gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
    trust_remote_code=True,
    seed=SEED,
    enforce_eager=True,  # Disable cudagraph for determinism
    enable_prefix_caching=False,  # Disable prefix caching for clean experiment
)
llm = LLM(**engine_options)

print("Model loaded successfully!")
print()

# ============================================================================
# SAMPLING CONFIGURATION
# ============================================================================

# One SamplingParams object is shared by the warmup and every repetition.
# Special tokens are kept in the decoded text (skip_special_tokens=False).
sampling_options = dict(
    temperature=TEMPERATURE,
    max_tokens=MAX_TOKENS,
    seed=SEED,
    logprobs=TOP_LOGPROBS,
    skip_special_tokens=False,
)
sampling_params = SamplingParams(**sampling_options)

print("Sampling parameters:")
for _sampling_line in (
    f"  Temperature: {TEMPERATURE}",
    f"  Max tokens: {MAX_TOKENS}",
    f"  Seed: {SEED}",
    f"  Top logprobs: {TOP_LOGPROBS}",
):
    print(_sampling_line)
print()

# ============================================================================
# WARMUP
# ============================================================================

print("Running warmup...")
# One untimed, unrecorded generation with the exact experiment prompt.
# LLM.generate() is given the string prompt (chat template already applied),
# not the message dicts.
warmup_output = llm.generate(prompt_text, sampling_params=sampling_params)
warmup_completion = warmup_output[0].outputs[0]
print(f"Warmup complete - generated {len(warmup_completion.token_ids)} tokens")
print()

# ============================================================================
# MAIN EXPERIMENT
# ============================================================================

print("=" * 80)
print("RUNNING EXPERIMENT")
print("=" * 80)
print()

# Per-repetition accumulators; index i of each list belongs to repetition i.
results_tokens = []         # generated token ids
results_logprobs = []       # np.array of the logprob of each generated token
results_texts = []          # decoded text
results_distributions = []  # per position: [(token_id, logprob), ...] best first

for rep in range(NUM_REPETITIONS):
    print(f"Repetition {rep + 1}/{NUM_REPETITIONS}...")

    # Same string prompt every time (chat template already applied).
    completion = llm.generate(prompt_text, sampling_params=sampling_params)[0].outputs[0]
    token_ids = completion.token_ids
    logprobs_data = completion.logprobs

    results_tokens.append(token_ids)
    results_texts.append(completion.text)

    # Logprob of the token that was actually emitted at each position.
    chosen_logprobs = []
    for position, candidates in enumerate(logprobs_data):
        chosen_logprobs.append(candidates[token_ids[position]].logprob)
    results_logprobs.append(np.array(chosen_logprobs))

    # Top-k candidates at each position, ordered by descending logprob.
    per_position = []
    for candidates in logprobs_data:
        ranked = list(candidates.items())
        ranked.sort(key=lambda item: item[1].logprob, reverse=True)
        per_position.append(
            [(tok, entry.logprob) for tok, entry in ranked[:TOP_LOGPROBS]]
        )
    results_distributions.append(per_position)

    print(f"  Generated {len(token_ids)} tokens")

print()
print("All repetitions complete!")
print()

# ============================================================================
# ANALYSIS
# ============================================================================

print("=" * 80)
print("ANALYSIS")
print("=" * 80)
print()

# Repetition 0 is the reference that every other repetition is compared with.
# All comparisons below are exact (no tolerance), so "bit-exact" in the
# printed results means what it says, and they tolerate repetitions of
# different lengths instead of crashing on them.

# Check token sequence identity
print("Checking token sequences...")
reference_tokens = results_tokens[0]
tokens_identical = all(seq == reference_tokens for seq in results_tokens[1:])
print(f"Token sequences identical: {tokens_identical}")

if not tokens_identical:
    print("\n⚠️  Token sequences differ!")
    for i, seq in enumerate(results_tokens[1:], start=1):
        if seq == reference_tokens:
            continue
        # zip() stops at the shorter sequence, so this counts differences in
        # the shared prefix; a length difference is reported separately.
        diff_positions = [
            j for j, (ref_tok, tok) in enumerate(zip(reference_tokens, seq))
            if ref_tok != tok
        ]
        print(f"  Rep 0 vs Rep {i}: {len(diff_positions)} positions differ")
        if diff_positions:
            print(f"    First difference at position {diff_positions[0]}")
        if len(seq) != len(reference_tokens):
            print(f"    Lengths differ: {len(reference_tokens)} vs {len(seq)} tokens")

# Check logprobs for selected tokens
print("\nChecking selected token logprobs...")
first_logprobs = results_logprobs[0]
# np.array_equal is an exact element-wise comparison and simply returns False
# when the shapes differ.
logprobs_exact = all(
    np.array_equal(first_logprobs, other) for other in results_logprobs[1:]
)
print(f"Selected token logprobs bit-exact: {logprobs_exact}")

# Check top-k distributions
print("\nChecking full top-k distributions...")
distribution_mismatches = []

first_dist = results_distributions[0]
for rep_idx in range(1, len(results_distributions)):
    rep_dist = results_distributions[rep_idx]
    for pos_idx in range(max(len(first_dist), len(rep_dist))):
        # A position that exists in only one of the two runs is a mismatch.
        if pos_idx >= len(first_dist) or pos_idx >= len(rep_dist):
            distribution_mismatches.append((rep_idx, pos_idx))
        # Each entry is a list of (token_id, logprob) tuples, so list equality
        # checks token ids, their order and the logprob values exactly.
        elif first_dist[pos_idx] != rep_dist[pos_idx]:
            distribution_mismatches.append((rep_idx, pos_idx))

distributions_exact = not distribution_mismatches
print(f"Top-k distributions bit-exact: {distributions_exact}")

if not distributions_exact:
    print(f"\n⚠️  Found {len(distribution_mismatches)} position mismatches in distributions")
    if len(distribution_mismatches) <= 5:
        for rep_idx, pos_idx in distribution_mismatches:
            print(f"  Rep 0 vs Rep {rep_idx}, position {pos_idx}")
    else:
        print(f"  First 5: {distribution_mismatches[:5]}")

# Filled only when the logprobs differ; read again by the verdict section.
l2_distances = []
if not logprobs_exact:
    # Vectors of different lengths cannot be subtracted, so the statistics
    # are restricted to the positions every repetition has.
    shared_len = min(len(vec) for vec in results_logprobs)
    if any(len(vec) != shared_len for vec in results_logprobs):
        print(f"\nNote: logprob vectors differ in length; "
              f"statistics use the first {shared_len} positions")

    print("\nL2 distances:")
    reference_prefix = first_logprobs[:shared_len]
    for i, vec in enumerate(results_logprobs[1:], start=1):
        l2 = np.linalg.norm(reference_prefix - vec[:shared_len])
        l2_distances.append(l2)
        print(f"  Rep 0 vs Rep {i}: L2 = {l2:.6e}")

    print(f"\nMax L2: {max(l2_distances):.6e}")
    print(f"Mean L2: {np.mean(l2_distances):.6e}")

    # Element-wise statistics (skipped when there is no shared position,
    # because reductions over an empty array are undefined).
    if shared_len > 0:
        all_logprobs = np.array([vec[:shared_len] for vec in results_logprobs])
        std_per_token = all_logprobs.std(axis=0)
        print("\nPer-token std statistics:")
        print(f"  Mean: {std_per_token.mean():.6e}")
        print(f"  Max: {std_per_token.max():.6e}")
        print(f"  Median: {np.median(std_per_token):.6e}")

print()

# ============================================================================
# VERDICT
# ============================================================================

print("=" * 80)
print("VERDICT")
print("=" * 80)
print()

# Checks are ordered from the most severe failure to full success, so each
# branch can assume all earlier checks passed.
if not tokens_identical:
    print("❌ TOKEN SEQUENCES DIFFER")
    print("  - This should NOT happen with temperature=0")
    print("  → Something is wrong, investigate")
elif not logprobs_exact:
    print("⚠️  TOKENS IDENTICAL, LOGPROBS VARY")
    print("  - Token sequences: bit-exact")
    print("  - Logprobs: small numerical variation")
    # l2_distances is filled by the analysis whenever the logprobs differ.
    max_l2 = max(l2_distances)
    if max_l2 < 1e-6:
        print(f"  - Variation very small (L2={max_l2:.2e})")
        print("  → Likely acceptable for forensics")
    else:
        print(f"  - Variation notable (L2={max_l2:.2e})")
        print("  → Investigate noise source")
elif not distributions_exact:
    print("⚠️  SELECTED TOKENS EXACT, DISTRIBUTIONS VARY")
    print("  - Token sequences: bit-exact")
    print("  - Selected token logprobs: bit-exact")
    print("  - Top-k distributions: numerical variation detected")
    print("  → May indicate computational instability in non-selected paths")
else:
    print("✓ PERFECT REPRODUCIBILITY")
    print("  - Token sequences: bit-exact")
    print("  - Selected token logprobs: bit-exact")
    print("  - Full top-k distributions: bit-exact")
    print("  - DeepSeek-Coder-V2-Lite-Instruct with vLLM is deterministic for this config")

print()

# ============================================================================
# SAVE RESULTS
# ============================================================================

# Convert the collected top-k distributions to plain ints/floats for JSON.
serializable_distributions = []
for rep_dists in results_distributions:
    serializable_distributions.append(
        [[(int(tok), float(prob)) for tok, prob in dist] for dist in rep_dists]
    )

# Everything needed to re-run or audit the experiment goes into one JSON
# document: the exact prompt, the configuration, the summary flags and the
# raw per-repetition data.
output_data = {
    "experiment": "deepseek_coder_v2_lite_instruct_logprobs_test",
    "timestamp": datetime.now().isoformat(),
    "prompt_source": "file" if prompt_file else "hardcoded",
    "prompt_file": os.path.basename(prompt_file) if prompt_file else None,
    "prompt_text": prompt_text,  # Save actual prompt for reproducibility
    "prompt_length_chars": len(prompt_text),
    "prompt_length_tokens": prompt_length,
    "message_format": "openai_compatible",
    "config": dict([
        ("model", MODEL_NAME),
        ("tensor_parallel", TENSOR_PARALLEL_SIZE),
        ("max_model_len", MAX_MODEL_LEN),
        ("max_tokens", MAX_TOKENS),
        ("repetitions", NUM_REPETITIONS),
        ("temperature", TEMPERATURE),
        ("seed", SEED),
        ("warmup_enabled", True),
        ("prefix_caching_disabled", True),
        ("enforce_eager", True),
        ("top_logprobs", TOP_LOGPROBS),
    ]),
    "results": dict([
        ("tokens_identical", tokens_identical),
        ("logprobs_exact", logprobs_exact),
        ("distributions_exact", distributions_exact),
        ("perfect_reproducibility",
         tokens_identical and logprobs_exact and distributions_exact),
    ]),
    "token_sequences": results_tokens,
    "logprobs_vectors": [lp.tolist() for lp in results_logprobs],
    "generated_texts": results_texts,
    "top_k_distributions": serializable_distributions,
}

# The file name carries a second-resolution timestamp so runs do not
# overwrite each other.
file_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_file = f"vllm_deepseek_coder_v2_lite_test_{file_stamp}.json"
with open(output_file, "w") as results_handle:
    json.dump(output_data, results_handle, indent=2)

print(f"Results saved to: {output_file}")
print()
print("=" * 80)
print("TEST COMPLETE")
print("=" * 80)
print()

vLLM LOGPROBS EXTRACTION TEST

Found file: Verification-for-International-AI-Governance.pdf
Loading 172 pages from PDF...
  Processed 10/172 pages
  Processed 20/172 pages
  Processed 30/172 pages
  Processed 40/172 pages
  Processed 50/172 pages
  Processed 60/172 pages
  Processed 70/172 pages
  Processed 80/172 pages
  Processed 90/172 pages
  Processed 100/172 pages
  Processed 110/172 pages
  Processed 120/172 pages
  Processed 130/172 pages
  Processed 140/172 pages
  Processed 150/172 pages
  Processed 160/172 pages
  Processed 170/172 pages
Loaded 535619 characters from PDF (172 pages)

Using standard OpenAI-compatible message format
Message content length: 535677 characters
Note: Chat template will be applied to convert messages to string prompt

Loading tokenizer to validate prompt length...


Downloading 'tokenizer_config.json' to '/tmp/hf_cache/models--deepseek-ai--DeepSeek-Coder-V2-Lite-Instruct/blobs/685b4b63ad1aedad8f98824ab21d2ff422d9e1b1.incomplete'


tokenizer_config.json: 0.00B [00:00, ?B/s]

Download complete. Moving file to /tmp/hf_cache/models--deepseek-ai--DeepSeek-Coder-V2-Lite-Instruct/blobs/685b4b63ad1aedad8f98824ab21d2ff422d9e1b1
Downloading 'tokenizer.json' to '/tmp/hf_cache/models--deepseek-ai--DeepSeek-Coder-V2-Lite-Instruct/blobs/ae8c6f0b4e0cdb2102a045267678e8f3d3d54ceb.incomplete'


tokenizer.json: 0.00B [00:00, ?B/s]

Download complete. Moving file to /tmp/hf_cache/models--deepseek-ai--DeepSeek-Coder-V2-Lite-Instruct/blobs/ae8c6f0b4e0cdb2102a045267678e8f3d3d54ceb


Chat template applied successfully



Token indices sequence length is longer than the specified maximum sequence length for this model (124366 > 16384). Running this sequence through the model will result in indexing errors
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Downloading 'config.json' to '/tmp/hf_cache/models--deepseek-ai--DeepSeek-Coder-V2-Lite-Instruct/blobs/3b1ccb3ddc847f2475a8e1b86e18ab1505fd87da.incomplete'


Prompt statistics:
  Characters: 535,716
  Tokens: 124,366
  Max model length: 128,000
  Generation tokens: 40
  Total required: 124,406

✓ Prompt length validation passed
  Remaining capacity: 3,594 tokens

Configuration:
  Model: deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct
  Tensor parallel: 2
  Max model len: 128,000
  Max tokens: 40
  Temperature: 0.0
  Seed: 42
  Repetitions: 20

Loading model...



config.json: 0.00B [00:00, ?B/s]

Download complete. Moving file to /tmp/hf_cache/models--deepseek-ai--DeepSeek-Coder-V2-Lite-Instruct/blobs/3b1ccb3ddc847f2475a8e1b86e18ab1505fd87da
Downloading 'configuration_deepseek.py' to '/tmp/hf_cache/models--deepseek-ai--DeepSeek-Coder-V2-Lite-Instruct/blobs/82e0f5d9d33620a66e328fdeae0b8dc12e2cff7c.incomplete'


configuration_deepseek.py: 0.00B [00:00, ?B/s]

Download complete. Moving file to /tmp/hf_cache/models--deepseek-ai--DeepSeek-Coder-V2-Lite-Instruct/blobs/82e0f5d9d33620a66e328fdeae0b8dc12e2cff7c
A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct:
- configuration_deepseek.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`torch_dtype` is deprecated! Use `dtype` instead!
Downloading 'generation_config.json' to '/tmp/hf_cache/models--deepseek-ai--DeepSeek-Coder-V2-Lite-Instruct/blobs/458e1d985ba3fbaaf62a4d1a9dd6ff795a451f7e.incomplete'


generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Download complete. Moving file to /tmp/hf_cache/models--deepseek-ai--DeepSeek-Coder-V2-Lite-Instruct/blobs/458e1d985ba3fbaaf62a4d1a9dd6ff795a451f7e


[Gloo] Rank [Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 10
 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
[Gloo] Rank [Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1

[1;36m(EngineCore_DP0 pid=919)[0;0m [1;36m(Worker_TP1 pid=927)[0;0m Downloading 'model-00001-of-000004.safetensors' to '/tmp/hf_cache/hub/models--deepseek-ai--DeepSeek-Coder-V2-Lite-Instruct/blobs/75d08ddaf92b68f751c95e1b4a51dbf5c011d5692f97cc0d71bd32587a3ea8d9.incomplete'


model-00001-of-000004.safetensors:   0%|          | 0.00/8.59G [00:00<?, ?B/s]

[1;36m(EngineCore_DP0 pid=919)[0;0m [1;36m(Worker_TP1 pid=927)[0;0m Download complete. Moving file to /tmp/hf_cache/hub/models--deepseek-ai--DeepSeek-Coder-V2-Lite-Instruct/blobs/75d08ddaf92b68f751c95e1b4a51dbf5c011d5692f97cc0d71bd32587a3ea8d9
[1;36m(EngineCore_DP0 pid=919)[0;0m [1;36m(Worker_TP1 pid=927)[0;0m Downloading 'model-00002-of-000004.safetensors' to '/tmp/hf_cache/hub/models--deepseek-ai--DeepSeek-Coder-V2-Lite-Instruct/blobs/7bf22dfa271527f7a0b8dbd56592722cd8fdcfeb6aad32ebb1110d21882eb1d8.incomplete'


model-00002-of-000004.safetensors:   0%|          | 0.00/8.59G [00:00<?, ?B/s]

[1;36m(EngineCore_DP0 pid=919)[0;0m [1;36m(Worker_TP1 pid=927)[0;0m Download complete. Moving file to /tmp/hf_cache/hub/models--deepseek-ai--DeepSeek-Coder-V2-Lite-Instruct/blobs/7bf22dfa271527f7a0b8dbd56592722cd8fdcfeb6aad32ebb1110d21882eb1d8
[1;36m(EngineCore_DP0 pid=919)[0;0m [1;36m(Worker_TP1 pid=927)[0;0m Downloading 'model-00003-of-000004.safetensors' to '/tmp/hf_cache/hub/models--deepseek-ai--DeepSeek-Coder-V2-Lite-Instruct/blobs/18f5a20f4d737b496e03ff8761834dfa9754ceedd56f54a336d0eab5e0e20968.incomplete'


model-00003-of-000004.safetensors:   0%|          | 0.00/8.59G [00:00<?, ?B/s]

[1;36m(EngineCore_DP0 pid=919)[0;0m [1;36m(Worker_TP1 pid=927)[0;0m Download complete. Moving file to /tmp/hf_cache/hub/models--deepseek-ai--DeepSeek-Coder-V2-Lite-Instruct/blobs/18f5a20f4d737b496e03ff8761834dfa9754ceedd56f54a336d0eab5e0e20968
[1;36m(EngineCore_DP0 pid=919)[0;0m [1;36m(Worker_TP1 pid=927)[0;0m Downloading 'model-00004-of-000004.safetensors' to '/tmp/hf_cache/hub/models--deepseek-ai--DeepSeek-Coder-V2-Lite-Instruct/blobs/1365ca25494e6592b6cb11f62f4a63cbdcdd9853e01d67f274d0b282732cc5cd.incomplete'


model-00004-of-000004.safetensors:   0%|          | 0.00/5.64G [00:00<?, ?B/s]

[1;36m(EngineCore_DP0 pid=919)[0;0m [1;36m(Worker_TP1 pid=927)[0;0m Download complete. Moving file to /tmp/hf_cache/hub/models--deepseek-ai--DeepSeek-Coder-V2-Lite-Instruct/blobs/1365ca25494e6592b6cb11f62f4a63cbdcdd9853e01d67f274d0b282732cc5cd
[1;36m(EngineCore_DP0 pid=919)[0;0m [1;36m(Worker_TP1 pid=927)[0;0m Downloading 'model.safetensors.index.json' to '/tmp/hf_cache/hub/models--deepseek-ai--DeepSeek-Coder-V2-Lite-Instruct/blobs/5a821356160292c668d01f8e7fdf9abba4a7b72d.incomplete'


model.safetensors.index.json: 0.00B [00:00, ?B/s]

[1;36m(EngineCore_DP0 pid=919)[0;0m [1;36m(Worker_TP1 pid=927)[0;0m Download complete. Moving file to /tmp/hf_cache/hub/models--deepseek-ai--DeepSeek-Coder-V2-Lite-Instruct/blobs/5a821356160292c668d01f8e7fdf9abba4a7b72d
[1;36m(EngineCore_DP0 pid=919)[0;0m [1;36m(EngineCore_DP0 pid=919)[0;0m [1;36m(Worker_TP1 pid=927)[0;0m [1;36m(Worker_TP0 pid=925)[0;0m 2025-11-10 15:46:58,349 - INFO - autotuner.py:256 - flashinfer.jit: [Autotuner]: Autotuning process starts ...
2025-11-10 15:46:58,349 - INFO - autotuner.py:256 - flashinfer.jit: [Autotuner]: Autotuning process starts ...
[1;36m(EngineCore_DP0 pid=919)[0;0m [1;36m(EngineCore_DP0 pid=919)[0;0m [1;36m(Worker_TP0 pid=925)[0;0m [1;36m(Worker_TP1 pid=927)[0;0m 2025-11-10 15:46:58,680 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process ends
2025-11-10 15:46:58,680 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process ends


Model loaded successfully!

Sampling parameters:
  Temperature: 0.0
  Max tokens: 40
  Seed: 42
  Top logprobs: 10

Running warmup...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (124366 > 16384). Running this sequence through the model will result in indexing errors


Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Warmup complete - generated 40 tokens

RUNNING EXPERIMENT

Repetition 1/20...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Generated 40 tokens
Repetition 2/20...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Generated 40 tokens
Repetition 3/20...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Generated 40 tokens
Repetition 4/20...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Generated 40 tokens
Repetition 5/20...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Generated 40 tokens
Repetition 6/20...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Generated 40 tokens
Repetition 7/20...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Generated 40 tokens
Repetition 8/20...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Generated 40 tokens
Repetition 9/20...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Generated 40 tokens
Repetition 10/20...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Generated 40 tokens
Repetition 11/20...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Generated 40 tokens
Repetition 12/20...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Generated 40 tokens
Repetition 13/20...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Generated 40 tokens
Repetition 14/20...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Generated 40 tokens
Repetition 15/20...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Generated 40 tokens
Repetition 16/20...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Generated 40 tokens
Repetition 17/20...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]