In [None]:
# Mount Google Drive at /content/drive. Every path configured in this notebook
# (LiveBench checkout, model weights, vLLM server logs) lives under this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import logging

# --- Global Configuration ---
# Base path on your Google Drive (requires the Drive mount at /content/drive).
GDRIVE_BASE_PATH = "/content/drive/MyDrive/Colab_Notebooks/LiveBenchRun"

# Path where the LiveBench repository lives/will be cloned.
# LIVEBENCH_SUBDIR_PATH is the inner `livebench` directory: the runner uses it as
# the working directory for run_livebench.py / gen_ground_truth_judgment.py and
# looks for answer files under its `data/` folder.
LIVEBENCH_REPO_PATH = os.path.join(GDRIVE_BASE_PATH, "LiveBench")
LIVEBENCH_SUBDIR_PATH = os.path.join(LIVEBENCH_REPO_PATH, "livebench")

# Parent directory for model weights; each model gets `<MODELS_PARENT_PATH>/<local_name>`.
MODELS_PARENT_PATH = os.path.join(GDRIVE_BASE_PATH, "models")

# Port for the vLLM server (consistent across models; models are served one at a time).
VLLM_PORT = 8000

# Value passed to vLLM's --gpu-memory-utilization flag.
GPU_UTILIZATION = 0.9

# LiveBench Run Parameters (consistent across models)
BENCH_NAME = "live_bench"                           # passed as --bench-name
API_BASE_URL = f"http://localhost:{VLLM_PORT}/v1"  # passed as --api-base
API_KEY = "dummy-key"                               # placeholder passed as --api-key
LIVEBENCH_RELEASE = "2024-11-25"                    # passed as --livebench-release-option
MAX_TOKENS = 8192                                   # passed as --max-tokens
PARALLEL_REQUESTS = 32                              # passed as --parallel-requests

# --- Model Specific Configurations ---
# List of models to download and evaluate.
# Each dictionary needs:
#   'hf_id'      - Hugging Face repo id that download() fetches.
#   'local_name' - name of the weights directory under MODELS_PARENT_PATH; also used
#                  as the vLLM served model name, in the per-model server log file
#                  name, and as the answer file name (<local_name>.jsonl).
# CAUTION: every entry below uses the same 'hf_id'. download() only fetches when
# `<MODELS_PARENT_PATH>/<local_name>` is missing or empty, so the modified variants
# must already be present in their directories; otherwise the base checkpoint is
# downloaded and evaluated under the variant's name.
MODEL_CONFIGS = [
    {
        "hf_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "local_name": "deepseek-1.5b-local"
    },
    {
        "hf_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "local_name": "deepseek-r1-distill-qwen-1.5b-val-modified"
    },
    {
        "hf_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "local_name": "deepseek-r1-distill-qwen-1.5b-scrambled"
    },
    {
        "hf_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "local_name": "deepseek-r1-distill-qwen-1.5b-length-val-modified"
    },
    {
        "hf_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "local_name": "deepseek-r1-distill-qwen-1.5b-gradient-ascent"
    },
    {
        "hf_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "local_name": "deepseek-r1-distill-qwen-1.5b-ft-control"
    },
    {
        "hf_id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        "local_name": "deepseek-r1-distill-qwen-1.5b-reduced-eos-gradient-ascent"
    },
    # Add more model dictionaries here if needed
]

# --- Setup Logging ---
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger. Without it basicConfig() silently does nothing when the root logger
# already has a handler, so the INFO level and the timestamped format below
# would not take effect and the runner's INFO progress messages would be hidden.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

# --- Create Base Directories ---
# The base directory and the parent 'models' directory must exist before any
# runner is created.
for _required_dir in (GDRIVE_BASE_PATH, MODELS_PARENT_PATH):
    os.makedirs(_required_dir, exist_ok=True)

# Echo the effective configuration once so it is visible in the cell output.
_startup_summary = (
    ("Google Drive Base Path", GDRIVE_BASE_PATH),
    ("LiveBench Repo Path", LIVEBENCH_REPO_PATH),
    ("Models Parent Path", MODELS_PARENT_PATH),
    ("vLLM Server Port", VLLM_PORT),
    ("Models to process", len(MODEL_CONFIGS)),
)
for _label, _value in _startup_summary:
    logging.info(f"{_label}: {_value}")

In [None]:
import os

# Navigate to the base Google Drive path so that `git clone` below creates the
# checkout at GDRIVE_BASE_PATH/LiveBench, i.e. exactly LIVEBENCH_REPO_PATH.
%cd {GDRIVE_BASE_PATH}

# Clone LiveBench repository IF it doesn't exist.
# Only the directory's existence is checked: an existing checkout is reused
# as-is (not updated), and the clone takes whatever the default branch
# currently points at (no tag/commit is pinned).
if not os.path.exists(LIVEBENCH_REPO_PATH):
  print("Cloning LiveBench repository...")
  !git clone https://github.com/LiveBench/LiveBench.git
else:
  print("LiveBench repository already exists.")

# Navigate into the LiveBench repository directory. The install cell that
# follows runs `pip install -e .` and relies on this working directory.
%cd {LIVEBENCH_REPO_PATH}
!pwd # Verify we are in the correct directory on Drive

/content/drive/MyDrive/Colab_Notebooks/LiveBenchRun
LiveBench repository already exists.
/content/drive/MyDrive/Colab_Notebooks/LiveBenchRun/LiveBench
/content/drive/MyDrive/Colab_Notebooks/LiveBenchRun/LiveBench


In [None]:
# --- Force Uninstall any potentially conflicting old versions ---
# !pip uninstall -y vllm pyzmq zmq huggingface-hub # Can uncomment if needed

# --- Install Dependencies ---
# NOTE: `pip install -e .` installs whatever project is in the current working
# directory, so this cell must run after the clone cell has changed into
# LIVEBENCH_REPO_PATH. vllm and huggingface-hub are installed without version
# pins, so the resolved versions can differ between sessions.
print("\n--- Installing Dependencies ---")
# !pip install --force-reinstall --no-cache-dir pyzmq # Ensure clean pyzmq
!pip install -e . # Install LiveBench editable
!pip install vllm huggingface-hub # Install vLLM and HF Hub

# --- Verify Installation ---
# Print the installed versions so they are recorded in the cell output.
print("\n--- Checking Package Versions ---")
!pip show pyzmq
!pip show vllm
!pip show huggingface-hub
# !pip check # Optional: Check for broader dependency conflicts

print("\n--- Installation complete ---")


--- Installing Dependencies ---
Obtaining file:///content/drive/MyDrive/Colab_Notebooks/LiveBenchRun/LiveBench
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Collecting fschat@ git+https://github.com/lm-sys/FastChat#egg=c5223e34babd24c3f9b08205e6751ea6e42c9684 (from livebench==0.0.4)
  Cloning https://github.com/lm-sys/FastChat to /tmp/pip-install-clr2cmqy/fschat_1833baf2cfae40eea7a678db7deaf679
  Running command git clone --filter=blob:none --quiet https://github.com/lm-sys/FastChat /tmp/pip-install-clr2cmqy/fschat_1833baf2cfae40eea7a678db7deaf679
  Resolved https://github.com/lm-sys/FastChat to commit 0e6d3e4beaab66f4d3f93db72541a4abab8af28d
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pypr

In [None]:
# Optional: Login to Hugging Face Hub
# from huggingface_hub import login
# login() # Paste your HF token when prompted

# --- Start Multi-Model Processing Loop ---
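The following cell defines `LiveBenchModelRunner` and `stop_vllm_server`, then iterates through each model defined in `MODEL_CONFIGS`, running download → vLLM server start → evaluation → answer cleaning → judgment for each one.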

In [None]:
import os
import time
import glob
import json
import shutil
import logging
import subprocess
import requests # Added for checking server status
from huggingface_hub import snapshot_download
try:
    from IPython import get_ipython
except ImportError:
    get_ipython = None

class LiveBenchModelRunner:
    """Encapsulates the logic to process one model for LiveBench."""

    def __init__(self, model_config, global_paths, global_params):
        self.hf_id = model_config['hf_id']
        self.local_name = model_config['local_name']
        self.paths = global_paths
        self.params = global_params

        # Derive model-specific paths
        self.weights_path = os.path.join(self.paths['models_parent'], self.local_name)
        self.vllm_log_file = os.path.join(self.paths['gdrive_base'], f"vllm_server_{self.local_name}.log")
        self.base_data_path = os.path.join(self.paths['livebench_subdir'], "data")

        os.makedirs(self.weights_path, exist_ok=True)
        logging.info(f"Initialized runner for {self.local_name}")
        logging.info(f"  Weights Path: {self.weights_path}")
        logging.info(f"  vLLM Log: {self.vllm_log_file}")

    def _run_command(self, command_str, cwd=None, check=False):
        """Helper to run shell commands, preferring IPython.system if available."""
        logging.info(f"Running command: {command_str}")
        if cwd:
            logging.info(f"  In directory: {cwd}")

        ipython = get_ipython()
        if ipython:
            original_dir = os.getcwd()
            try:
                if cwd:
                    os.chdir(cwd)
                ipython.system(command_str)
                return True
            except Exception as e:
                 logging.error(f"IPython.system command failed: {e}")
                 return False
            finally:
                 if cwd:
                     os.chdir(original_dir)
        else:
            logging.warning("IPython environment not detected, falling back to subprocess.run")
            try:
                process = subprocess.run(command_str, cwd=cwd, shell=True, check=check, capture_output=True, text=True)
                logging.info(f"  stdout: {process.stdout}")
                if process.stderr:
                    logging.warning(f"  stderr: {process.stderr}")
                return process.returncode == 0
            except subprocess.CalledProcessError as e:
                logging.error(f"Command failed: {e}")
                logging.error(f"  stdout: {e.stdout}")
                logging.error(f"  stderr: {e.stderr}")
                return False
            except Exception as e:
                logging.error(f"An unexpected error occurred running command: {e}")
                return False

    def download(self):
        """Downloads model weights if they don't exist."""
        logging.info(f"Checking model weights directory: {self.weights_path}")
        if not os.path.exists(self.weights_path) or not os.listdir(self.weights_path):
            logging.info(f"Model weights directory is empty or does not exist. Downloading {self.hf_id}...")
            try:
                snapshot_download(
                    repo_id=self.hf_id,
                    local_dir=self.weights_path,
                    local_dir_use_symlinks=False,
                )
                logging.info("Model download complete.")
                return True
            except Exception as e:
                logging.error(f"Error during model download for {self.local_name}: {e}")
                return False
        else:
            logging.info("Model weights directory already contains files. Skipping download.")
            return True

    def start_vllm(self):
        """Starts the vLLM server using IPython.system(nohup...) and waits."""
        logging.info(f"Starting vLLM server for model: {self.local_name} from path: {self.weights_path}")
        logging.info(f"Saving server log to: {self.vllm_log_file}")

        ipython = get_ipython()
        if not ipython:
            logging.error("IPython environment not detected. Cannot reliably start background server.")
            return False

        os.makedirs(os.path.dirname(self.vllm_log_file), exist_ok=True)
        if os.path.exists(self.vllm_log_file):
            os.remove(self.vllm_log_file)

        vllm_command_str = (
            f"nohup python -m vllm.entrypoints.openai.api_server "
            f"--model {self.weights_path} "
            f"--served-model-name {self.local_name} "
            f"--host 0.0.0.0 "
            f"--port {self.params['vllm_port']} "
            f"--tensor-parallel-size 1 "
            f"--gpu-memory-utilization {self.params['gpu_utilization']} "
            f"--trust-remote-code "
            f"> {self.vllm_log_file} 2>&1 &"
        )

        logging.info(f"Executing server launch command via IPython.system: {vllm_command_str}")
        original_dir = os.getcwd()
        try:
            os.chdir(self.paths['gdrive_base'])
            ipython.system(vllm_command_str)
            logging.info(f"vLLM server launch command submitted via IPython.system.")
        except Exception as e:
            logging.error(f"Failed to launch vLLM via IPython.system: {e}")
            os.chdir(original_dir)
            return False
        finally:
            os.chdir(original_dir)

        max_wait_time = 500
        start_time = time.time()
        server_ready = False
        check_url = f"http://localhost:{self.params['vllm_port']}/v1/models"
        expected_model_id = self.local_name
        check_interval = 10

        logging.info(f"Waiting up to {max_wait_time}s for server readiness...")
        while time.time() - start_time < max_wait_time:
            if os.path.exists(self.vllm_log_file):
                try:
                    with open(self.vllm_log_file, 'r') as f_log_check:
                        log_content = f_log_check.read()
                    if "Traceback" in log_content or "CUDA out of memory" in log_content:
                         logging.error("Detected potential fatal error in vLLM log during startup wait.")
                except Exception:
                    pass

            try:
                response = requests.get(check_url, timeout=10)
                if response.status_code == 200:
                    try:
                        response_data = response.json()
                        loaded_models = [m.get('id') for m in response_data.get('data', [])]
                        if expected_model_id in loaded_models:
                            logging.info(f"vLLM server is ready and model '{expected_model_id}' is served.")
                            server_ready = True
                            break
                        else:
                            uvicorn_running = False
                            if os.path.exists(self.vllm_log_file):
                                try:
                                    with open(self.vllm_log_file, 'r') as f_log_check:
                                        if "Uvicorn running" in f_log_check.read(): uvicorn_running = True
                                except Exception: pass
                            if uvicorn_running:
                                logging.warning(f"vLLM server API is up, but model '{expected_model_id}' not confirmed in response yet: {loaded_models}. Retrying...")
                            else:
                                logging.info(f"vLLM server API reachable but Uvicorn not confirmed running yet. Waiting...")
                    except json.JSONDecodeError:
                        logging.warning(f"Server responded to {check_url} with status 200, but response was not valid JSON. Waiting...")
                else:
                     logging.warning(f"Server not ready yet (API status code {response.status_code}). Waiting...")
            except requests.exceptions.RequestException as req_e:
                logging.info(f"Server not reachable yet ({req_e}). Waiting...")
            except Exception as e:
                logging.error(f"Unexpected error checking server status: {e}. Waiting...")

            time.sleep(check_interval)

        logging.info(f"--- Last lines of vLLM Server Log ({self.vllm_log_file}) after wait period --- ")
        self._run_command(f"tail -n 30 {self.vllm_log_file}")

        if not server_ready:
            logging.error(f"vLLM server did not become ready with model '{expected_model_id}' within {max_wait_time} seconds.")
            stop_vllm_server(self.params['vllm_port'])
            return False

        return True

    def evaluate(self):
        """Runs the LiveBench evaluation script."""
        logging.info(f"Running LiveBench evaluation for model: {self.local_name}")
        eval_command_list = [
            "python", "run_livebench.py",
            "--model", self.weights_path,
            "--model-display-name", self.local_name,
            "--api-base", self.params['api_base_url'],
            "--api-key", self.params['api_key'],
            "--bench-name", self.params['bench_name'],
            "--livebench-release-option", self.params['livebench_release'],
            "--max-tokens", str(self.params['max_tokens']),
            "--parallel-requests", str(self.params['parallel_requests']),
            "--force-temperature", "0.6",
            "--resume",
        ]
        eval_command_str = ' '.join(eval_command_list)
        success = self._run_command(eval_command_str, cwd=self.paths['livebench_subdir'], check=False)
        logging.info(f"LiveBench evaluation run finished for {self.local_name}.")

        logging.info("--- Checking for output files ---")
        bench_subpath_glob = self.params['bench_name'].replace('live_bench/', 'live_bench/*/') if self.params['bench_name'].startswith('live_bench/') else self.params['bench_name']
        answer_path_pattern = os.path.join(self.base_data_path, f"{bench_subpath_glob}/model_answer/{self.local_name}.jsonl")
        logging.info(f"Checking for answer files like: {answer_path_pattern}")
        found_files = glob.glob(answer_path_pattern.replace('/*/', '/**/'), recursive=True)
        if found_files:
             logging.info(f"Answer files found: {found_files}")
             return True
        else:
             logging.warning("Answer files not found.")
             return False

    def clean(self):
        """Cleans the <think> tags from answer files."""
        logging.info(f"--- Starting Answer File Cleaning for {self.local_name} ---")
        time.sleep(5)

        pattern_glob = None
        bench_name = self.params['bench_name']
        if bench_name == "live_bench":
            pattern_glob = os.path.join(self.base_data_path, f"live_bench/*/*/model_answer/{self.local_name}.jsonl")
        elif bench_name.startswith("live_bench/"):
            category = bench_name.split('/')[1]
            pattern_glob = os.path.join(self.base_data_path, f"live_bench/{category}/*/model_answer/{self.local_name}.jsonl")
        else:
            pattern_glob = os.path.join(self.base_data_path, f"{bench_name}/model_answer/{self.local_name}.jsonl")

        logging.info(f"Searching for files with pattern: {pattern_glob}")
        use_recursive_glob = ("*" in pattern_glob)
        answer_files = glob.glob(pattern_glob, recursive=use_recursive_glob)

        if not answer_files:
            logging.warning(f"No answer files found for {self.local_name} matching pattern. Skipping cleaning.")
            return True

        logging.info(f"Found {len(answer_files)} answer file(s). Proceeding with cleaning.")
        all_cleaned = True
        for original_filepath in answer_files:
            if not os.path.exists(original_filepath):
                logging.warning(f"Skipping non-existent file listed by glob: {original_filepath}")
                continue

            logging.info(f"Processing: {original_filepath}")
            backup_filepath = original_filepath + ".bak"
            cleaned_filepath = original_filepath + ".cleaned_temp"

            if os.path.exists(backup_filepath):
                logging.info(f"Backup file already exists ({backup_filepath}). Assuming already cleaned or skipping.")
                continue

            try:
                shutil.copy2(original_filepath, backup_filepath)
                lines_processed = 0
                lines_cleaned = 0
                with open(backup_filepath, 'r', encoding='utf-8') as f_in, \
                     open(cleaned_filepath, 'w', encoding='utf-8') as f_out:
                    for i, line in enumerate(f_in):
                        try:
                            data = json.loads(line)
                            if 'choices' in data and len(data['choices']) > 0 and \
                               'turns' in data['choices'][0] and len(data['choices'][0]['turns']) > 0:
                                original_text = data['choices'][0]['turns'][0]
                                cleaned_text = original_text
                                end_tag_pos = original_text.find('</think>')
                                if end_tag_pos != -1:
                                    cleaned_text = original_text[end_tag_pos + len('</think>'):].lstrip()
                                    if cleaned_text != original_text:
                                        lines_cleaned += 1
                                    data['choices'][0]['turns'][0] = cleaned_text
                                f_out.write(json.dumps(data) + '\n')
                                lines_processed += 1
                            else:
                                f_out.write(line)
                                logging.warning(f"Line {i+1} in {original_filepath} has unexpected structure, writing as is.")
                                lines_processed += 1
                        except json.JSONDecodeError as json_err:
                            logging.error(f"Skipping corrupted JSON line {i+1} in {original_filepath}: {json_err}")
                            continue
                        except Exception as e:
                            logging.error(f"Unexpected error processing line {i+1} in {original_filepath}: {e}")
                            continue
                logging.info(f"Processed {lines_processed} lines, cleaned {lines_cleaned} lines in {original_filepath}.")
                shutil.move(cleaned_filepath, original_filepath)
            except Exception as e:
                logging.error(f"ERROR cleaning file {original_filepath}: {e}")
                all_cleaned = False
                if os.path.exists(backup_filepath) and not os.path.exists(original_filepath):
                    logging.info(f"Attempting to restore backup for {original_filepath}")
                    try:
                        shutil.move(backup_filepath, original_filepath)
                    except Exception as restore_err:
                        logging.error(f"Failed to restore backup: {restore_err}")
                if os.path.exists(cleaned_filepath):
                    try:
                        os.remove(cleaned_filepath)
                    except Exception as rm_err:
                        logging.error(f"Failed to remove temp file {cleaned_filepath}: {rm_err}")

        logging.info(f"--- Answer File Cleaning Process Finished for {self.local_name} ---")
        return all_cleaned

    def judge(self):
        """Runs the judgment generation script."""
        logging.info(f"--- Starting Judgment Generation for {self.local_name} ---")
        judge_command_list = [
            "python", "-u", "gen_ground_truth_judgment.py",
            "--model", self.local_name,
            "--question-source", "huggingface",
            "--bench-name", self.params['bench_name'],
            "--livebench-release-option", self.params['livebench_release']
        ]
        judge_command_str = ' '.join(judge_command_list)
        success = self._run_command(judge_command_str, cwd=self.paths['livebench_subdir'])
        logging.info(f"Judgment script finished for {self.local_name}.")

        return success

    def process(self):
        """Runs the full pipeline for this model."""
        logging.info(f"--- Starting Full Process for Model: {self.local_name} ---")
        if not self.download():
            logging.error(f"Download failed for {self.local_name}. Aborting process for this model.")
            return False
        if not self.start_vllm():
            logging.error(f"vLLM start failed for {self.local_name}. Aborting process for this model.")
            return False
        if not self.evaluate():
            logging.error(f"Evaluation failed or produced no output for {self.local_name}. Aborting process for this model.")
            return False
        if not self.clean():
            logging.warning(f"Cleaning step encountered errors for {self.local_name}. Continuing to judgment.")
        if not self.judge():
            logging.warning(f"Judgment generation command failed for {self.local_name}.")

        logging.info(f"--- Finished Full Process for Model: {self.local_name} ---")
        return True

def stop_vllm_server(port):
    """Stop any vLLM server that was launched with ``--port <port>`` using pkill.

    Args:
        port: Port number the server was started on; used to build the
            ``pkill -f`` pattern.

    Returns:
        None. Problems are logged, never raised.
    """
    logging.info(f"Attempting to stop vLLM server on port {port} using pkill...")

    # "( |$)" anchors the port so that e.g. 8000 does not also match 80001.
    pattern = f"vllm.*--port {port}( |$)"
    try:
        # An argument list (no shell) is used on purpose: with shell=True the
        # wrapper shell's own command line contains the pattern text and would
        # itself be matched by `pkill -f`.
        result = subprocess.run(["pkill", "-f", pattern], check=False, timeout=10, capture_output=True)
    except subprocess.TimeoutExpired:
        logging.warning("pkill command timed out.")
        return
    except Exception as e:
        logging.error(f"Error running pkill command: {e}")
        return

    if result.returncode == 1:
        # pkill exits with 1 when no process matched: nothing to wait for.
        logging.info("No matching vLLM process found; nothing to stop.")
        return
    if result.returncode != 0:
        logging.warning(f"pkill exited with status {result.returncode}.")
        return

    logging.info("Sent pkill signal. Waiting briefly...")
    # Give the server time to shut down before the port is reused.
    time.sleep(10)

# --- Main Execution Loop ---
# Paths and parameters shared by every runner (values come from the
# configuration cell).
global_paths = {
    'gdrive_base': GDRIVE_BASE_PATH,
    'livebench_repo': LIVEBENCH_REPO_PATH,
    'livebench_subdir': LIVEBENCH_SUBDIR_PATH,
    'models_parent': MODELS_PARENT_PATH
}

global_params = {
    'vllm_port': VLLM_PORT,
    'gpu_utilization': GPU_UTILIZATION,
    'bench_name': BENCH_NAME,
    'api_base_url': API_BASE_URL,
    'api_key': API_KEY,
    'livebench_release': LIVEBENCH_RELEASE,
    'max_tokens': MAX_TOKENS,
    'parallel_requests': PARALLEL_REQUESTS
}

# local_name -> result of runner.process(); None when the run raised.
run_results = {}
all_models_attempted = False
try:
    for config in MODEL_CONFIGS:
        logging.info(f"===== Processing Model Configuration: {config['local_name']} =====")
        # All models share one port, so make sure the previous server is gone.
        stop_vllm_server(global_params['vllm_port'])

        run_results[config['local_name']] = None
        try:
            # Constructed inside the try so one model's setup error (e.g. its
            # weights directory cannot be created) does not end the whole loop.
            runner = LiveBenchModelRunner(config, global_paths, global_params)
            success = runner.process()
            run_results[config['local_name']] = success
            if success:
                logging.info(f"Successfully completed processing for {config['local_name']}.")
            else:
                logging.error(f"Processing failed for {config['local_name']}. Check logs above.")
        except Exception as e:
            logging.error(f"An unexpected error occurred during the main process for {config['local_name']}: {e}", exc_info=True)
        finally:
            logging.info(f"===== Finished Model Configuration: {config['local_name']} =====\n")
    all_models_attempted = True
finally:
    # --- Final Cleanup ---
    # Runs even when the loop is interrupted (e.g. KeyboardInterrupt from
    # stopping the cell), so the background vLLM server is not left running.
    if all_models_attempted:
        logging.info("--- All models processed. Performing final cleanup. --- ")
    else:
        logging.warning("--- Run interrupted before all models were processed. Performing final cleanup. --- ")
    stop_vllm_server(global_params['vllm_port'])
    for _model_name, _outcome in run_results.items():
        logging.info(f"  {_model_name}: {'ok' if _outcome else 'FAILED'}")
    logging.info("--- Script Finished --- ")

INFO 05-04 05:07:34 [serving_completion.py:61] Using default completion sampling params from model: {'temperature': 0.6, 'top_p': 0.95}
INFO 05-04 05:07:34 [api_server.py:1090] Starting vLLM API server on http://0.0.0.0:8000
INFO 05-04 05:07:34 [launcher.py:28] Available routes are:
INFO 05-04 05:07:34 [launcher.py:36] Route: /openapi.json, Methods: GET, HEAD
INFO 05-04 05:07:34 [launcher.py:36] Route: /docs, Methods: GET, HEAD
INFO 05-04 05:07:34 [launcher.py:36] Route: /docs/oauth2-redirect, Methods: GET, HEAD
INFO 05-04 05:07:34 [launcher.py:36] Route: /redoc, Methods: GET, HEAD
INFO 05-04 05:07:34 [launcher.py:36] Route: /health, Methods: GET
INFO 05-04 05:07:34 [launcher.py:36] Route: /load, Methods: GET
INFO 05-04 05:07:34 [launcher.py:36] Route: /ping, Methods: GET, POST
INFO 05-04 05:07:34 [launcher.py:36] Route: /tokenize, Methods: POST
INFO 05-04 05:07:34 [launcher.py:36] Route: /detokenize, Methods: POST
INFO 05-04 05:07:34 [launcher.py:36] Route: /v1/models, Methods: GET
IN

ERROR:root:Evaluation failed or produced no output for deepseek-1.5b-local. Aborting process for this model.
ERROR:root:Processing failed for deepseek-1.5b-local. Check logs above.


INFO 05-04 05:10:39 [serving_completion.py:61] Using default completion sampling params from model: {'temperature': 0.6, 'top_p': 0.95}
INFO 05-04 05:10:39 [api_server.py:1090] Starting vLLM API server on http://0.0.0.0:8000
INFO 05-04 05:10:39 [launcher.py:28] Available routes are:
INFO 05-04 05:10:39 [launcher.py:36] Route: /openapi.json, Methods: HEAD, GET
INFO 05-04 05:10:39 [launcher.py:36] Route: /docs, Methods: HEAD, GET
INFO 05-04 05:10:39 [launcher.py:36] Route: /docs/oauth2-redirect, Methods: HEAD, GET
INFO 05-04 05:10:39 [launcher.py:36] Route: /redoc, Methods: HEAD, GET
INFO 05-04 05:10:39 [launcher.py:36] Route: /health, Methods: GET
INFO 05-04 05:10:39 [launcher.py:36] Route: /load, Methods: GET
INFO 05-04 05:10:39 [launcher.py:36] Route: /ping, Methods: POST, GET
INFO 05-04 05:10:39 [launcher.py:36] Route: /tokenize, Methods: POST
INFO 05-04 05:10:39 [launcher.py:36] Route: /detokenize, Methods: POST
INFO 05-04 05:10:39 [launcher.py:36] Route: /v1/models, Methods: GET
IN

ERROR:root:Evaluation failed or produced no output for deepseek-r1-distill-qwen-1.5b-val-modified. Aborting process for this model.
ERROR:root:Processing failed for deepseek-r1-distill-qwen-1.5b-val-modified. Check logs above.


INFO 05-04 05:13:18 [serving_completion.py:61] Using default completion sampling params from model: {'temperature': 0.6, 'top_p': 0.95}
INFO 05-04 05:13:18 [api_server.py:1090] Starting vLLM API server on http://0.0.0.0:8000
INFO 05-04 05:13:18 [launcher.py:28] Available routes are:
INFO 05-04 05:13:18 [launcher.py:36] Route: /openapi.json, Methods: HEAD, GET
INFO 05-04 05:13:18 [launcher.py:36] Route: /docs, Methods: HEAD, GET
INFO 05-04 05:13:18 [launcher.py:36] Route: /docs/oauth2-redirect, Methods: HEAD, GET
INFO 05-04 05:13:18 [launcher.py:36] Route: /redoc, Methods: HEAD, GET
INFO 05-04 05:13:18 [launcher.py:36] Route: /health, Methods: GET
INFO 05-04 05:13:18 [launcher.py:36] Route: /load, Methods: GET
INFO 05-04 05:13:18 [launcher.py:36] Route: /ping, Methods: GET, POST
INFO 05-04 05:13:18 [launcher.py:36] Route: /tokenize, Methods: POST
INFO 05-04 05:13:18 [launcher.py:36] Route: /detokenize, Methods: POST
INFO 05-04 05:13:18 [launcher.py:36] Route: /v1/models, Methods: GET
IN

ERROR:root:Evaluation failed or produced no output for deepseek-r1-distill-qwen-1.5b-scrambled. Aborting process for this model.
ERROR:root:Processing failed for deepseek-r1-distill-qwen-1.5b-scrambled. Check logs above.


INFO 05-04 05:18:26 [serving_completion.py:61] Using default completion sampling params from model: {'temperature': 0.6, 'top_p': 0.95}
INFO 05-04 05:18:26 [api_server.py:1090] Starting vLLM API server on http://0.0.0.0:8000
INFO 05-04 05:18:26 [launcher.py:28] Available routes are:
INFO 05-04 05:18:26 [launcher.py:36] Route: /openapi.json, Methods: HEAD, GET
INFO 05-04 05:18:26 [launcher.py:36] Route: /docs, Methods: HEAD, GET
INFO 05-04 05:18:26 [launcher.py:36] Route: /docs/oauth2-redirect, Methods: HEAD, GET
INFO 05-04 05:18:26 [launcher.py:36] Route: /redoc, Methods: HEAD, GET
INFO 05-04 05:18:26 [launcher.py:36] Route: /health, Methods: GET
INFO 05-04 05:18:26 [launcher.py:36] Route: /load, Methods: GET
INFO 05-04 05:18:26 [launcher.py:36] Route: /ping, Methods: POST, GET
INFO 05-04 05:18:26 [launcher.py:36] Route: /tokenize, Methods: POST
INFO 05-04 05:18:26 [launcher.py:36] Route: /detokenize, Methods: POST
INFO 05-04 05:18:26 [launcher.py:36] Route: /v1/models, Methods: GET
IN

ERROR:root:Evaluation failed or produced no output for deepseek-r1-distill-qwen-1.5b-length-val-modified. Aborting process for this model.
ERROR:root:Processing failed for deepseek-r1-distill-qwen-1.5b-length-val-modified. Check logs above.


KeyboardInterrupt: 