In [None]:
#!/usr/bin/env python3
"""
H100 FP8 Quantization Determinism Test
Tests bit-exact reproducibility with FP8 quantization (native H100 tensor core support)
Includes detailed timing measurements for performance comparison
"""

# ============================================================================
# SUPPRESS VERBOSE LOGGING
# ============================================================================
import os
import warnings
import logging

# Cache locations and library switches, set here ahead of the
# vllm / transformers imports further down in the script.
os.environ.update({
    'HF_HOME': '/tmp/hf_cache',
    'TRANSFORMERS_CACHE': '/tmp/hf_cache',
    'VLLM_LOGGING_LEVEL': 'WARNING',
    'VLLM_CONFIGURE_LOGGING': '0',
    'TOKENIZERS_PARALLELISM': 'false',
})

# Silence every Python warning for the rest of the process.
warnings.filterwarnings('ignore')

# Logger name -> level. Inference libraries only report errors, while the
# Hugging Face Hub loggers stay at INFO.
_LOGGER_LEVELS = {
    'vllm': logging.ERROR,
    'vllm.engine': logging.ERROR,
    'vllm.worker': logging.ERROR,
    'vllm.executor': logging.ERROR,
    'transformers': logging.ERROR,
    'torch': logging.ERROR,
    'huggingface_hub': logging.INFO,
    'huggingface_hub.file_download': logging.INFO,
}
for _logger_name, _logger_level in _LOGGER_LEVELS.items():
    logging.getLogger(_logger_name).setLevel(_logger_level)

# ============================================================================
# IMPORTS
# ============================================================================

from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
import numpy as np
from datetime import datetime
import json
import torch
import time
import glob

# ============================================================================
# CONFIGURATION
# ============================================================================

# Model configuration - FP8 quantized version (native H100 tensor core support)
# These values are passed straight to the vLLM `LLM(...)` constructor below.
MODEL_NAME = "Qwen/Qwen3-32B-FP8"
QUANTIZATION = "fp8"  # Must match model config (not compressed-tensors)
TENSOR_PARALLEL_SIZE = 1  # Single GPU for clean timing
# Context window requested from the engine. The model is loaded with YaRN
# rope scaling (factor 4.0 over 32768 original positions = 131072), so this
# value sits just under that limit.
MAX_MODEL_LEN = 130000
GPU_MEMORY_UTILIZATION = 0.9

# Generation configuration
MAX_TOKENS = 100  # Upper bound on generated tokens per run
NUM_REPETITIONS = 10  # Timed runs that are compared against each other
TEMPERATURE = 0.0  # Greedy decoding
SEED = 42  # Used for both the engine and the sampling parameters
TOP_LOGPROBS = 10  # Number of top logprobs requested (and compared) per position

# Timing configuration
NUM_WARMUP_RUNS = 3  # Extra warmup for stable timing

# Prompt source
# When True, the first .txt (else .pdf) file found in the current working
# directory is used as the document; otherwise HARDCODED_CONTENT is used.
AUTO_FIND_FILE = True

# User task
# Instruction placed ahead of the document text in the single user message.
USER_TASK = "Please provide a detailed summary of the following text."

# Hardcoded content
# Fallback document used when no file is found or AUTO_FIND_FILE is False.
HARDCODED_CONTENT = """The development of large language models has fundamentally transformed natural language processing and artificial intelligence more broadly. These models, trained on vast corpora of text data, have demonstrated remarkable capabilities across a wide range of tasks, from translation and summarization to question answering and creative writing. The scaling laws observed in these systems suggest that performance continues to improve with model size, data scale, and compute budget, though with diminishing returns. Recent advances in architecture, training techniques, and inference optimization have made these powerful models increasingly accessible for practical applications."""

# ============================================================================
# FILE LOADING UTILITIES
# ============================================================================

def find_prompt_file():
    """Pick the prompt document from the current working directory.

    ``.txt`` files take precedence over ``.pdf`` files. Within each kind the
    lexicographically smallest path is returned, so repeated runs select the
    same file (``glob.glob`` on its own returns matches in arbitrary order).

    Returns:
        The path of the selected file, or ``None`` when the directory
        contains neither a ``.txt`` nor a ``.pdf`` file.
    """
    cwd = os.getcwd()
    for pattern in ("*.txt", "*.pdf"):
        candidates = sorted(glob.glob(os.path.join(cwd, pattern)))
        if candidates:
            return candidates[0]
    return None

def load_text_from_file(filepath):
    """Read the full text of a ``.txt`` or ``.pdf`` file.

    The extension check is case-insensitive, so ``NOTES.TXT`` and
    ``Paper.PDF`` are accepted as well. A short summary line is printed for
    each file that is loaded.

    Args:
        filepath: Path of the file to read.

    Returns:
        The file content as a single string. For PDFs, the extracted text
        of all pages joined with newlines.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        ImportError: If a PDF is requested but PyPDF2 is not installed.
        ValueError: If the extension is neither ``.txt`` nor ``.pdf``.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: {filepath}")

    lowered = filepath.lower()

    if lowered.endswith('.txt'):
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
        print(f"Loaded {len(text)} characters from txt file")
        return text

    if lowered.endswith('.pdf'):
        # Imported lazily so plain-text runs do not need PyPDF2 installed.
        try:
            import PyPDF2
        except ImportError as exc:
            raise ImportError(
                "PyPDF2 required for PDF loading. Install with: pip install PyPDF2"
            ) from exc

        with open(filepath, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            num_pages = len(pdf_reader.pages)
            print(f"Loading {num_pages} pages from PDF...")
            # Fall back to '' so a page that yields no text (falsy result)
            # cannot break the join below.
            page_texts = [page.extract_text() or '' for page in pdf_reader.pages]

        full_text = '\n'.join(page_texts)
        print(f"Loaded {len(full_text)} characters from PDF ({num_pages} pages)")
        return full_text

    raise ValueError(f"Unsupported file type: {filepath}")

# ============================================================================
# PROMPT LOADING
# ============================================================================

_BANNER_RULE = "=" * 80
print(_BANNER_RULE)
print("H100 FP8 QUANTIZATION DETERMINISM TEST")
print("Native Tensor Core Support")
print(_BANNER_RULE)
print()

# Resolve the document: a discovered file when auto-discovery is enabled and
# finds one, otherwise the built-in fallback text.
prompt_file = find_prompt_file() if AUTO_FIND_FILE else None

if prompt_file:
    print(f"Found file: {os.path.basename(prompt_file)}")
    DOCUMENT_CONTENT = load_text_from_file(prompt_file)
elif AUTO_FIND_FILE:
    print("No txt/pdf files found - using hardcoded content")
    DOCUMENT_CONTENT = HARDCODED_CONTENT
else:
    DOCUMENT_CONTENT = HARDCODED_CONTENT
    print("Using hardcoded content")
print()

# Single-turn chat: the task instruction followed by the document text.
_user_content = f"{USER_TASK}\n\n{DOCUMENT_CONTENT}"
messages = [{"role": "user", "content": _user_content}]

print(f"Message content length: {len(messages[0]['content'])} characters")
print()

# ============================================================================
# TOKENIZER PRECHECK
# ============================================================================

print("Loading tokenizer to validate prompt length...")
# Load the tokenizer that ships with the model under test, so the chat
# template (including the `enable_thinking` switch) and the token count
# match what the engine is actually given.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    cache_dir='/tmp/hf_cache',
    trust_remote_code=True
)

# Render the conversation to the exact prompt string that is later passed
# to llm.generate(); thinking mode is enabled in the template.
prompt_text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True  # Qwen3 thinking mode
)

# Token count of the rendered prompt, used for the context-length check.
prompt_tokens = tokenizer.encode(prompt_text)
prompt_length = len(prompt_tokens)

print(f"Prompt statistics:")
print(f"  Characters: {len(prompt_text):,}")
print(f"  Tokens: {prompt_length:,}")
print(f"  Max model length: {MAX_MODEL_LEN:,}")
print(f"  Generation tokens: {MAX_TOKENS}")
print(f"  Total required: {prompt_length + MAX_TOKENS:,}")
print()

# Advisory only: the run continues even when the budget looks too small.
if prompt_length + MAX_TOKENS > MAX_MODEL_LEN:
    print(f"[WARNING] May exceed context length")
    print()

# ============================================================================
# MODEL LOADING
# ============================================================================

# Human-readable summary of the run configuration.
# NOTE(review): the precision/compute lines below are descriptive text only;
# confirm they match the kernels the engine actually selects.
print("Configuration:")
print(f"  Model: {MODEL_NAME}")
print(f"  Quantization: {QUANTIZATION} (from model config)")
print(f"  Precision: FP8 weights, FP16 activations")
print(f"  Compute: Native FP8×FP16 tensor core operations")
print(f"  KV cache: FP16")
print(f"  Tensor parallel: {TENSOR_PARALLEL_SIZE}")
print(f"  Max model len: {MAX_MODEL_LEN:,} (with YaRN scaling)")
print(f"  YaRN factor: 4.0x (32K -> 128K)")
print(f"  Max tokens: {MAX_TOKENS}")
print(f"  Temperature: {TEMPERATURE}")
print(f"  Seed: {SEED}")
print(f"  Repetitions: {NUM_REPETITIONS}")
print(f"  Warmup runs: {NUM_WARMUP_RUNS}")
print()
print("IMPORTANT: Using FP16 to match INT8 experiment")
print("           This isolates dequantization as the only variable")
print()

print("Loading FP8 quantized model (native H100 tensor core, FP16 activations)...")
print("Note: FP8 weights × FP16 activations using native tensor cores")
print("Note: Using FP16 to match INT8 experiment (controlled comparison)")
print("Activations and KV cache: FP16")
print("Enabling YaRN rope scaling for 130K context...")

# perf_counter() is monotonic, so the measured load duration cannot be
# distorted by wall-clock adjustments during the (long) model load.
load_start = time.perf_counter()

llm = LLM(
    model=MODEL_NAME,
    quantization=QUANTIZATION,
    dtype="float16",  # FP16 to match INT8 experiment (removes FP16 vs BF16 confound)
    tensor_parallel_size=TENSOR_PARALLEL_SIZE,
    max_model_len=MAX_MODEL_LEN,
    gpu_memory_utilization=GPU_MEMORY_UTILIZATION,
    trust_remote_code=True,
    seed=SEED,
    enforce_eager=True,
    enable_prefix_caching=False,
    # YaRN rope scaling for long context (32K -> 130K)
    rope_scaling={
        "rope_type": "yarn",
        "factor": 4.0,
        "original_max_position_embeddings": 32768
    }
)

load_time = time.perf_counter() - load_start
print(f"Model loaded in {load_time:.2f}s")
print()

# ============================================================================
# SAMPLING CONFIGURATION
# ============================================================================

# Greedy decoding (TEMPERATURE) with a fixed seed; the top TOP_LOGPROBS
# logprobs are requested per generated position for the later comparison.
sampling_params = SamplingParams(
    temperature=TEMPERATURE,
    max_tokens=MAX_TOKENS,
    seed=SEED,
    logprobs=TOP_LOGPROBS,
    skip_special_tokens=False
)

# ============================================================================
# WARMUP
# ============================================================================

# Untimed-for-analysis runs of the same prompt; their durations are kept
# only for reporting. perf_counter() is used because it is monotonic and
# intended for measuring intervals.
print(f"Running {NUM_WARMUP_RUNS} warmup iterations...")
warmup_times = []
for i in range(NUM_WARMUP_RUNS):
    warmup_start = time.perf_counter()
    warmup_output = llm.generate(prompt_text, sampling_params=sampling_params)
    warmup_time = time.perf_counter() - warmup_start
    warmup_times.append(warmup_time)
    num_tokens = len(warmup_output[0].outputs[0].token_ids)
    print(f"  Warmup {i+1}: {warmup_time:.3f}s ({num_tokens} tokens, {num_tokens/warmup_time:.1f} tok/s)")

print(f"Warmup complete - avg time: {np.mean(warmup_times):.3f}s")
print()

# ============================================================================
# MAIN EXPERIMENT
# ============================================================================

print("=" * 80)
print("RUNNING EXPERIMENT")
print("=" * 80)
print()

# One entry per repetition, in run order.
results_tokens = []         # generated token ids (plain lists)
results_logprobs = []       # np.ndarray of the logprob of each chosen token
results_texts = []          # decoded output text
results_distributions = []  # per position: [(token_id, logprob), ...] top-k
timing_data = []            # per-repetition timing record (dict)

for rep in range(NUM_REPETITIONS):
    print(f"Repetition {rep + 1}/{NUM_REPETITIONS}...")

    # Time the generation with a monotonic, high-resolution clock so the
    # measurement is unaffected by wall-clock adjustments.
    start_time = time.perf_counter()
    outputs = llm.generate(prompt_text, sampling_params=sampling_params)
    end_time = time.perf_counter()

    elapsed = end_time - start_time
    output = outputs[0]
    completion = output.outputs[0]

    # Copy the token ids into a plain list so the stored result is
    # independent of the engine's own sequence object and JSON-serializable.
    token_ids = list(completion.token_ids)
    num_tokens = len(token_ids)
    tokens_per_sec = num_tokens / elapsed

    results_tokens.append(token_ids)

    # Store timing
    timing_data.append({
        'repetition': rep + 1,
        'elapsed_time': elapsed,
        'num_tokens': num_tokens,
        'tokens_per_sec': tokens_per_sec,
        'time_per_token': elapsed / num_tokens
    })

    # Extract generated text
    text = completion.text
    results_texts.append(text)

    # Logprob of the token that was actually chosen at each position.
    logprobs_data = completion.logprobs
    selected_logprobs = [
        position_logprobs[token_id].logprob
        for token_id, position_logprobs in zip(token_ids, logprobs_data)
    ]
    results_logprobs.append(np.array(selected_logprobs))

    # Top-k candidates per position, ordered by descending logprob.
    rep_distributions = []
    for position_logprobs in logprobs_data:
        sorted_items = sorted(position_logprobs.items(),
                              key=lambda x: x[1].logprob,
                              reverse=True)[:TOP_LOGPROBS]
        rep_distributions.append([(tok, lp.logprob) for tok, lp in sorted_items])
    results_distributions.append(rep_distributions)

    print(f"  {num_tokens} tokens in {elapsed:.3f}s ({tokens_per_sec:.1f} tok/s)")

print()
print("All repetitions complete!")
print()

# ============================================================================
# TIMING ANALYSIS
# ============================================================================

print("=" * 80)
print("TIMING ANALYSIS")
print("=" * 80)
print()

# Split the per-repetition records into one list per metric.
times, tps, tpt = [], [], []
for _record in timing_data:
    times.append(_record['elapsed_time'])
    tps.append(_record['tokens_per_sec'])
    tpt.append(_record['time_per_token'])

# Summary statistics (np.std is the population standard deviation).
_mean_time, _std_time = np.mean(times), np.std(times)
_fastest, _slowest = np.min(times), np.max(times)
_mean_tps, _std_tps = np.mean(tps), np.std(tps)
_mean_tpt_ms, _std_tpt_ms = np.mean(tpt)*1000, np.std(tpt)*1000

print("Timing statistics:")
print(f"  Mean time: {_mean_time:.3f}s (σ={_std_time:.4f}s)")
print(f"  Min/Max: {_fastest:.3f}s / {_slowest:.3f}s")
print(f"  Tokens/sec: {_mean_tps:.1f} (σ={_std_tps:.2f})")
print(f"  Time/token: {_mean_tpt_ms:.2f}ms (σ={_std_tpt_ms:.3f}ms)")
print()

# ============================================================================
# REPRODUCIBILITY ANALYSIS
# ============================================================================

print("=" * 80)
print("REPRODUCIBILITY ANALYSIS")
print("=" * 80)
print()

# Every repetition is compared against repetition 0. All comparisons are
# exact (no tolerance) because the reported property is bit-exactness, and
# all of them tolerate repetitions of different lengths.

# Check token sequences
print("Checking token sequences...")
reference_tokens = list(results_tokens[0])
tokens_identical = all(
    list(other_tokens) == reference_tokens
    for other_tokens in results_tokens[1:]
)
print(f"Token sequences identical: {tokens_identical}")

if not tokens_identical:
    print("\n[WARNING] Token sequences differ!")
    for i, other_tokens in enumerate(results_tokens[1:], start=1):
        other_tokens = list(other_tokens)
        if other_tokens == reference_tokens:
            continue
        # Positions that differ within the common prefix length.
        diff_positions = [
            j for j, (tok_a, tok_b) in enumerate(zip(reference_tokens, other_tokens))
            if tok_a != tok_b
        ]
        print(f"  Rep 0 vs Rep {i}: {len(diff_positions)} positions differ")
        if len(other_tokens) != len(reference_tokens):
            # A pure length difference would otherwise show as "0 positions".
            print(f"    Lengths differ: Rep 0 has {len(reference_tokens)} tokens, "
                  f"Rep {i} has {len(other_tokens)} tokens")
        if diff_positions:
            print(f"    First difference at position {diff_positions[0]}")
            print(f"      Rep 0: token {reference_tokens[diff_positions[0]]}")
            print(f"      Rep {i}: token {other_tokens[diff_positions[0]]}")

# Check logprobs
print("\nChecking selected token logprobs...")
first_logprobs = results_logprobs[0]
# np.array_equal is an exact element-wise comparison and simply returns
# False when the shapes differ.
logprobs_exact = all(
    bool(np.array_equal(first_logprobs, other_logprobs))
    for other_logprobs in results_logprobs[1:]
)
print(f"Selected token logprobs bit-exact: {logprobs_exact}")

if not logprobs_exact:
    print("\nL2 distances:")
    l2_distances = []
    for i, other_logprobs in enumerate(results_logprobs[1:], start=1):
        # Compare over the common prefix so unequal lengths cannot raise.
        common = min(len(first_logprobs), len(other_logprobs))
        l2 = np.linalg.norm(first_logprobs[:common] - other_logprobs[:common])
        l2_distances.append(l2)
        suffix = ""
        if len(first_logprobs) != len(other_logprobs):
            suffix = f" (first {common} positions only)"
        print(f"  Rep 0 vs Rep {i}: L2 = {l2:.6e}{suffix}")

    print(f"\nMax L2: {max(l2_distances):.6e}")
    print(f"Mean L2: {np.mean(l2_distances):.6e}")

# Check distributions
print("\nChecking full top-k distributions...")
distributions_exact = True
distribution_mismatches = []  # (repetition index, position index) pairs

first_dist = results_distributions[0]
for rep_idx, other_dist in enumerate(results_distributions[1:], start=1):
    common = min(len(first_dist), len(other_dist))
    for pos_idx in range(common):
        # Each entry is a list of (token_id, logprob) tuples; list equality
        # checks both the token order and the exact logprob values.
        if first_dist[pos_idx] != other_dist[pos_idx]:
            distributions_exact = False
            distribution_mismatches.append((rep_idx, pos_idx))
    # Positions present in only one of the two repetitions are mismatches.
    for pos_idx in range(common, max(len(first_dist), len(other_dist))):
        distributions_exact = False
        distribution_mismatches.append((rep_idx, pos_idx))

print(f"Top-k distributions bit-exact: {distributions_exact}")

if not distributions_exact:
    print(f"\nFound {len(distribution_mismatches)} mismatches")

print()

# ============================================================================
# VERDICT
# ============================================================================

print("=" * 80)
print("VERDICT - FP8 QUANTIZATION (NATIVE H100)")
print("=" * 80)
print()

# Pick the report for the first level at which reproducibility breaks:
# tokens, then selected-token logprobs, then full top-k distributions.
if not tokens_identical:
    _verdict_lines = [
        "[FAIL] TOKEN SEQUENCES DIFFER - DETERMINISM BROKEN",
        "  - FP8 quantization breaks greedy decoding determinism",
        "  => Native tensor core FP8×FP16 is non-deterministic",
        "  => Same issue as INT4/INT8",
    ]
elif not logprobs_exact:
    _verdict_lines = [
        "[WARNING] TOKENS IDENTICAL, LOGPROBS VARY",
        "  - Token sequences: bit-exact",
        "  - Logprobs: numerical variation",
        "  => FP8×FP16 introduces floating-point noise",
    ]
elif not distributions_exact:
    _verdict_lines = [
        "[WARNING] SELECTED TOKENS EXACT, DISTRIBUTIONS VARY",
        "  - Token sequences: bit-exact",
        "  - Selected token logprobs: bit-exact",
        "  - Top-k distributions: numerical variation",
        "  => FP8 tensor cores have distribution-level noise",
    ]
else:
    _verdict_lines = [
        "[PASS] PERFECT REPRODUCIBILITY WITH FP8",
        "  - Token sequences: bit-exact",
        "  - Selected token logprobs: bit-exact",
        "  - Full top-k distributions: bit-exact",
        "  => FP8 native tensor core operations maintain determinism",
        "  => H100 tensor cores produce reproducible FP8×FP16 results",
    ]

for _line in _verdict_lines:
    print(_line)

print()

# Static background notes, printed regardless of the outcome.
_implementation_notes = (
    "FP8 Implementation Notes:",
    "  - FP8 uses native H100 tensor core operations",
    "  - E4M3 format: 4-bit exponent, 3-bit mantissa",
    "  - FP8×FP16→FP32 is a single tensor core instruction",
    "  - Same FP16 precision as INT8 for controlled comparison",
    "  - If non-deterministic: likely hardware rounding modes",
)
for _line in _implementation_notes:
    print(_line)
print()

# ============================================================================
# SAVE RESULTS
# ============================================================================

# Take the timestamp once so the value stored in the JSON and the one in
# the output filename always refer to the same instant.
run_timestamp = datetime.now()

output_data = {
    "experiment": "h100_fp8_determinism_test",
    "timestamp": run_timestamp.isoformat(),
    "hardware": "H100",
    "hardware_note": "FP8 uses native tensor core operations",
    "prompt_source": "file" if prompt_file else "hardcoded",
    "prompt_file": os.path.basename(prompt_file) if prompt_file else None,
    "prompt_text": prompt_text,
    "prompt_length_chars": len(prompt_text),
    "prompt_length_tokens": prompt_length,
    "config": {
        "model": MODEL_NAME,
        "quantization": QUANTIZATION,
        "precision": "FP8 (E4M3)",
        "tensor_parallel": TENSOR_PARALLEL_SIZE,
        "max_model_len": MAX_MODEL_LEN,
        "max_tokens": MAX_TOKENS,
        "repetitions": NUM_REPETITIONS,
        "warmup_runs": NUM_WARMUP_RUNS,
        "temperature": TEMPERATURE,
        "seed": SEED,
        "top_logprobs": TOP_LOGPROBS
    },
    "timing": {
        "model_load_time": load_time,
        "warmup_times": warmup_times,
        "per_repetition": timing_data,
        "statistics": {
            # float(...) converts NumPy scalars to JSON-serializable floats.
            "mean_time": float(np.mean(times)),
            "std_time": float(np.std(times)),
            "mean_tokens_per_sec": float(np.mean(tps)),
            "std_tokens_per_sec": float(np.std(tps)),
            "mean_time_per_token_ms": float(np.mean(tpt) * 1000),
            "std_time_per_token_ms": float(np.std(tpt) * 1000)
        }
    },
    "results": {
        # bool(...) guarantees plain JSON booleans whatever the flag type.
        "tokens_identical": bool(tokens_identical),
        "logprobs_exact": bool(logprobs_exact),
        "distributions_exact": bool(distributions_exact),
        "perfect_reproducibility": bool(
            tokens_identical and logprobs_exact and distributions_exact
        )
    },
    # Plain lists so the sequences serialize regardless of their container.
    "token_sequences": [list(tokens) for tokens in results_tokens],
    "logprobs_vectors": [lp.tolist() for lp in results_logprobs],
    "generated_texts": results_texts
}

output_file = f"h100_fp8_determinism_{run_timestamp.strftime('%Y%m%d_%H%M%S')}.json"
# Explicit encoding keeps the file independent of the platform default.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(output_data, f, indent=2)

print(f"Results saved to: {output_file}")
print()
print("=" * 80)
print("TEST COMPLETE")
print("=" * 80)

H100 FP8 QUANTIZATION DETERMINISM TEST
Native Tensor Core Support

Found file: Verification-for-International-AI-Governance.pdf
Loading 172 pages from PDF...
Loaded 535619 characters from PDF (172 pages)

Message content length: 535677 characters

Loading tokenizer to validate prompt length...


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Prompt statistics:
  Characters: 535,825
  Tokens: 120,431
  Max model length: 130,000
  Generation tokens: 100
  Total required: 120,531

Configuration:
  Model: Qwen/Qwen3-32B-FP8
  Quantization: fp8 (from model config)
  Precision: FP8 weights, FP16 activations
  Compute: Native FP8×FP16 tensor core operations
  KV cache: FP16
  Tensor parallel: 1
  Max model len: 130,000 (with YaRN scaling)
  YaRN factor: 4.0x (32K -> 128K)
  Max tokens: 100
  Temperature: 0.0
  Seed: 42
  Repetitions: 10
  Warmup runs: 3

IMPORTANT: Using FP16 to match INT8 experiment
           This isolates dequantization as the only variable

Loading FP8 quantized model (native H100 tensor core, FP16 activations)...
Note: FP8 weights × FP16 activations using native tensor cores
Note: Using FP16 to match INT8 experiment (controlled comparison)
Activations and KV cache: FP16
Enabling YaRN rope scaling for 130K context...


`torch_dtype` is deprecated! Use `dtype` instead!


[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0


[1;36m(EngineCore_DP0 pid=870)[0;0m 2025-11-11 15:17:19,659 - INFO - autotuner.py:256 - flashinfer.jit: [Autotuner]: Autotuning process starts ...
[1;36m(EngineCore_DP0 pid=870)[0;0m 2025-11-11 15:17:21,939 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process ends


Model loaded in 18.33s

Running 3 warmup iterations...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Warmup 1: 76.353s (100 tokens, 1.3 tok/s)


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Warmup 2: 72.708s (100 tokens, 1.4 tok/s)


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  Warmup 3: 74.296s (100 tokens, 1.3 tok/s)
Warmup complete - avg time: 74.452s

RUNNING EXPERIMENT

Repetition 1/10...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  100 tokens in 74.143s (1.3 tok/s)
Repetition 2/10...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  100 tokens in 73.604s (1.4 tok/s)
Repetition 3/10...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  100 tokens in 74.915s (1.3 tok/s)
Repetition 4/10...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  100 tokens in 72.853s (1.4 tok/s)
Repetition 5/10...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  100 tokens in 74.602s (1.3 tok/s)
Repetition 6/10...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  100 tokens in 74.026s (1.4 tok/s)
Repetition 7/10...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  100 tokens in 73.068s (1.4 tok/s)
Repetition 8/10...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

  100 tokens in 73.481s (1.4 tok/s)
Repetition 9/10...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]