In [1]:
import asyncio
import gc
import logging
import os
import signal
import subprocess
import sys
import time
from pathlib import Path

import requests
import torch
from sentence_transformers import SentenceTransformer

sys.path.append("src")
from run_bench import run_wiki_benchmark

  from .autonotebook import tqdm as notebook_tqdm


[2026-03-01 10:06:51,247] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


In [2]:
# Root directory that holds one sub-directory per local model checkpoint.
# It is created on first run so that the existence checks further down operate
# on a real directory tree.
BASE_MODELS_DIR = Path("models")
BASE_MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Address and credentials of the locally spawned vLLM OpenAI-compatible server.
VLLM_HOST = "127.0.0.1"
VLLM_PORT = 8000
VLLM_API = f"http://{VLLM_HOST}:{VLLM_PORT}/v1"
# Sent as the Bearer token by the readiness probe and passed to `vllm serve --api-key`.
VLLM_KEY = "local-bench-key"


# THE MODEL MUST BE LOCATED IN THE FOLDER models/<model alias>
# Each entry is a dict with an "alias" and a "repo_id". In the cells visible
# here only "alias" is read (it names the served model and the sub-directory
# under BASE_MODELS_DIR).
# NOTE(review): every entry below is commented out, so MODEL_SPECS is an empty
# list as saved and the main loop has nothing to iterate over. Uncomment the
# models to benchmark before running.
MODEL_SPECS = [
    ###### {"alias": "YandexGPT-5-Lite-8B-instruct", "repo_id": "models/YandexGPT-5-Lite-8B-instruct"},
    
    # --- EVERYTHING ABOVE THIS LINE HAS ALREADY BEEN BENCHMARKED ---
    
    #{"alias": "Qwen3-4B-Instruct", "repo_id": "Qwen/Qwen3-4B-Instruct-2507"},
    #{"alias": "Qwen3-4B", "repo_id": "Qwen/Qwen3-4B"},
    #{"alias": "RuadaptQwen3-4B-Instruct", "repo_id": "RefalMachine/RuadaptQwen3-4B-Instruct"},
    
    ###{"alias": "Qwen3-8B", "repo_id": "Qwen/Qwen3-8B"},
    ###{"alias": "RuadaptQwen3-8B-Hybrid", "repo_id": "RefalMachine/RuadaptQwen3-8B-Hybrid"},
    ###{"alias": "avibe", "repo_id": "AvitoTech/avibe"},
    ###{"alias": "GigaChat3-10B-A1.8B-bf16", "repo_id": "ai-sage/GigaChat3-10B-A1.8B-bf16"},
    ###{"alias": "T-lite-it-1.0", "repo_id": "t-tech/T-lite-it-1.0"},
    ###{"alias": "Vikhr-Nemo-12B-Instruct-R-21-09-24", "repo_id": "Vikhrmodels/Vikhr-Nemo-12B-Instruct-R-21-09-24"},
    ###{"alias": "RuadaptQwen2.5-7B-Lite-Beta", "repo_id": "RefalMachine/RuadaptQwen2.5-7B-Lite-Beta"},
    
    #{"alias": "T-pro-it-2.1", "repo_id": "t-tech/T-pro-it-2.1"},
    #{"alias": "Qwen3-32B", "repo_id": "Qwen/Qwen3-32B"},
    #{"alias": "RuadaptQwen3-32B-Instruct", "repo_id": "RefalMachine/RuadaptQwen3-32B-Instruct"},
    #{"alias": "Qwen3-30B-A3B-Instruct-2507", "repo_id": "Qwen/Qwen3-30B-A3B-Instruct-2507"},
    #{"alias": "Qwen3-14B", "repo_id": "Qwen/Qwen3-14B"},
    #{"alias": "T-lite-it-2.1", "repo_id": "t-tech/T-lite-it-2.1"},
    #{"alias": "T-pro-it-2.0", "repo_id": "t-tech/T-pro-it-2.0"},
]

# Passed as `concurrency` to run_wiki_benchmark.
# NOTE(review): this is twice VLLM_MAX_NUM_SEQS, so presumably about half of the
# in-flight requests wait in the server queue -- confirm this is intended.
BENCH_CONCURRENCY = 40

# Forwarded to `vllm serve` as --gpu-memory-utilization, --max-num-seqs and
# --max-model-len respectively.
VLLM_GPU_MEMORY_UTIL = 0.90
VLLM_MAX_NUM_SEQS = 20
# NOTE(review): 16386 is not a power of two; possibly a typo for 16384 -- confirm.
VLLM_MAX_MODEL_LEN = 16386

In [3]:
def wait_vllm_ready(
    timeout_sec: int = 1200,
    proc: "subprocess.Popen | None" = None,
    poll_interval_sec: float = 2.0,
):
    """Block until the vLLM server answers ``GET {VLLM_API}/models`` successfully.

    Args:
        timeout_sec: Maximum number of seconds to keep polling.
        proc: Optional handle of the server process. When given, the wait is
            aborted as soon as that process has exited, instead of polling a
            dead server until the timeout expires.
        poll_interval_sec: Pause between two consecutive probes.

    Raises:
        RuntimeError: If ``proc`` exited before the endpoint became ready, or
            if the endpoint did not return a successful response in time. The
            message carries the last observed HTTP status or request error.
    """
    url = f"{VLLM_API}/models"
    headers = {"Authorization": f"Bearer {VLLM_KEY}"}
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout_sec
    last_error = None

    while time.monotonic() < deadline:
        # Fail fast: a server that already died will never become ready.
        if proc is not None and proc.poll() is not None:
            raise RuntimeError(
                f"Процесс vLLM завершился с кодом {proc.returncode} до готовности сервера. "
                f"Последняя ошибка: {last_error}"
            )

        try:
            r = requests.get(url, headers=headers, timeout=5)
            if r.ok:
                return
            last_error = f"{r.status_code}: {r.text[:200]}"
        except requests.RequestException as e:
            # Connection refused / timeouts are expected while the model loads.
            last_error = repr(e)

        time.sleep(poll_interval_sec)

    raise RuntimeError(f"vLLM не поднялся. Последняя ошибка: {last_error}")


def start_vllm_server(model_name: str, model_path: "str | os.PathLike") -> subprocess.Popen:
    """Launch ``vllm serve`` for one model and wait until its API is reachable.

    The server is started in its own session (``start_new_session=True``) with
    stdout and stderr discarded. If the readiness wait fails for any reason,
    including a keyboard interrupt, the freshly started server is stopped
    before the error is re-raised. Otherwise it would keep running without any
    handle being returned to the caller, still holding the GPU and the port.

    Args:
        model_name: Name under which the model is served (``--served-model-name``).
        model_path: Path of the model directory handed to ``vllm serve``.

    Returns:
        The ``subprocess.Popen`` handle of the running server.

    Raises:
        RuntimeError: Propagated from ``wait_vllm_ready`` when the server exits
            early or does not become ready in time.
    """
    cmd = [
        "vllm", "serve", str(model_path),
        "--host", VLLM_HOST,
        "--port", str(VLLM_PORT),
        "--api-key", VLLM_KEY,
        "--served-model-name", model_name,
        "--tensor-parallel-size", "1",
        "--dtype", "auto",
        "--gpu-memory-utilization", str(VLLM_GPU_MEMORY_UTIL),
        "--max-model-len", str(VLLM_MAX_MODEL_LEN),
        "--max-num-seqs", str(VLLM_MAX_NUM_SEQS),
        "--swap-space", "16",
        "--generation-config", "vllm",
        "--disable-log-stats",
        "--disable-uvicorn-access-log",
        "--disable-log-requests"
    ]

    # cmd.append("--trust-remote-code")

    print(f"\n=== START vLLM: {model_name} ===")
    env = os.environ.copy()
    # Presumably tells vLLM not to install its own logging configuration -- confirm
    # against the vLLM environment-variable documentation.
    env["VLLM_CONFIGURE_LOGGING"] = "0"

    proc = subprocess.Popen(
        cmd,
        start_new_session=True,
        env=env,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
    )

    try:
        wait_vllm_ready(proc=proc)
    except BaseException:
        # BaseException on purpose: interrupting the cell during the (long)
        # wait must not leave an orphaned server behind either.
        stop_vllm_server(proc)
        raise

    print(f"[READY] {model_name}")
    return proc


def stop_vllm_server(proc: subprocess.Popen):
    """Stop the vLLM server process group and release cached CUDA memory.

    The process group led by ``proc`` is sent SIGTERM and given 30 seconds to
    exit; if it is still alive it is sent SIGKILL and given another 10 seconds.
    A process that survives even that is reported with a warning instead of
    raising, so that the cleanup below still runs and a caller invoking this
    from a ``finally`` block is not aborted.

    Afterwards the garbage collector is run and, when CUDA is available, the
    CUDA caching allocator of the *current* process is emptied.

    Args:
        proc: Handle of the server process; ``None`` or an already exited
            process skips the signalling part.
    """
    print("[STOP] vLLM")

    if proc is not None and proc.poll() is None:
        # Escalate from a polite SIGTERM to SIGKILL, waiting after each signal.
        for sig, grace_sec in ((signal.SIGTERM, 30), (signal.SIGKILL, 10)):
            try:
                os.killpg(proc.pid, sig)
            except ProcessLookupError:
                # The group is already gone; wait() below just reaps the child.
                pass

            try:
                proc.wait(timeout=grace_sec)
                break
            except subprocess.TimeoutExpired:
                continue
        else:
            print(f"[WARN] vLLM pid={proc.pid} did not exit after SIGKILL")

    gc.collect()

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        try:
            torch.cuda.ipc_collect()
        except Exception:
            # Best effort: failing to collect IPC handles must not break shutdown.
            pass

    print("[CLEARED] CUDA cache")

In [4]:
async def run_wiki_for_model(model_name: str, encoder, encoder_device):
    """Run the wiki benchmark for one served model and return its metrics.

    Args:
        model_name: Name of the model as served by the vLLM endpoint.
        encoder: Encoder instance forwarded as ``shared_encoder``.
        encoder_device: Device of that encoder, forwarded as ``shared_device``.

    Returns:
        Whatever ``run_wiki_benchmark`` returns; the value is also printed.
    """
    benchmark_kwargs = dict(
        # Endpoint and model under test.
        api=VLLM_API,
        key=VLLM_KEY,
        model_name=model_name,
        concurrency=BENCH_CONCURRENCY,
        # Benchmark setup.
        output_dir="collective_results",
        number_of_articles=20,
        encoder_name="sergeyzh/BERTA",
        device="cuda",
        prepare_env=False,
        neighbor_count=0,
        description_mode=True,
        clusterization_with_hint=True,
        # Encoder shared across models by the caller.
        shared_encoder=encoder,
        shared_device=encoder_device,
    )
    result = await run_wiki_benchmark(**benchmark_kwargs)

    print(f"[DONE] {model_name}")
    print(result)
    return result


In [5]:
# =========================
# MAIN LOOP
# =========================

async def main():
    """Benchmark every model in ``MODEL_SPECS``, one vLLM server at a time.

    A single sentence encoder is loaded once and shared by all runs. For each
    spec the model directory ``BASE_MODELS_DIR/<alias>`` (with ``/`` and spaces
    in the alias replaced by ``_``) is served with vLLM, the wiki benchmark is
    run against it, and the server is stopped again even when the run fails.
    Failures are printed and the loop continues with the next model. Collected
    metrics are printed at the end.
    """
    logging.getLogger("sentence_transformers.SentenceTransformer").setLevel(logging.ERROR)

    encoder_device = torch.device("cuda")
    encoder = SentenceTransformer("sergeyzh/BERTA").to(encoder_device)

    all_metrics = {}

    for spec in MODEL_SPECS:
        model_name = spec["alias"]
        # Derive the directory from this spec's alias. The previous code read an
        # undefined name `alias`, which raises NameError on a fresh kernel.
        model_path = BASE_MODELS_DIR / model_name.replace("/", "_").replace(" ", "_")
        proc = None

        try:
            if not model_path.exists():
                raise FileNotFoundError(f"Не найдена папка модели: {model_path}")

            proc = start_vllm_server(model_name, str(model_path))
            metrics = await run_wiki_for_model(model_name, encoder, encoder_device)
            all_metrics[model_name] = metrics

        except Exception as e:
            # Deliberately keep going: one broken model must not stop the sweep.
            print(f"[ERROR] {model_name}: {e}")

        finally:
            if proc is not None:
                stop_vllm_server(proc)

    print("\n=== ALL METRICS ===")
    for name, metrics in all_metrics.items():
        print(name, metrics)


# Top-level `await` is handled by the notebook (IPython) kernel; in a plain
# Python script this line would have to become `asyncio.run(main())`.
await main()


=== START vLLM: YandexGPT-5-Lite-8B-instruct ===


2026-03-01 10:08:10 | INFO | wikibench.YandexGPT-5-Lite-8B-instruct | WikiBench initialized: model=YandexGPT-5-Lite-8B-instruct, articles=20
2026-03-01 10:08:10 | INFO | wikibench.YandexGPT-5-Lite-8B-instruct | Loading enviroment...


[READY] YandexGPT-5-Lite-8B-instruct


2026-03-01 10:08:10 | INFO | wikibench.YandexGPT-5-Lite-8B-instruct | Enviroment loaded!
2026-03-01 10:08:10 | INFO | wikibench.YandexGPT-5-Lite-8B-instruct | Stage: rank_query started
rank_query:  85%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                            | 17/20 [07:21<01:21, 27.23s/article]2026-03-01 10:15:33 | ERROR | wikibench.YandexGPT-5-Lite-8B-instruct | rank_query failed for article=Uncharted 4: A Thief’s End
Traceback (most recent call last):
  File "/workdir/WikiBench/src/wiki_bench.py", line 151, in rank_query
    ranked_docs = await self.wiki_agent.create_ranking(article_name=article_name)
  File "/workdir/WikiBench/src/wiki_agent.py", line 71, in create_ranking
    top_k = self.utils.get_number_of_snippets()[article_name] * 3 # proportion is 1:2 (1 - relevant, 2 - irrelevant)
KeyError: 'Uncharted 4: A Thief’s End'
rank_query: 100%|████████

[DONE] YandexGPT-5-Lite-8B-instruct
{'model_name': 'YandexGPT-5-Lite-8B-instruct', 'number_of_articles': 20, 'ranking': {'ndcg_mean': 0.6349161892582756, 'r_precision_mean': 0.08697770832263775}, 'ranking_time': 472.97353360801935, 'outline': {'precision': {'mean': 0.5951797366142273, 'ci_low': 0.5744386240839958, 'ci_high': 0.6200312227010727}, 'recall': {'mean': 0.6198519468307495, 'ci_low': 0.5946791410446167, 'ci_high': 0.6476522892713547}, 'f1': {'mean': 0.6063086272420538, 'ci_low': 0.5857036395015154, 'ci_high': 0.6318296041174258}}, 'outline_time': 191.77827513962984, 'sections': {'precision': {'mean': 0.5302366018295288, 'ci_low': 0.5157355085015297, 'ci_high': 0.5448180615901946}, 'recall': {'mean': 0.5728142559528351, 'ci_low': 0.5565543726086617, 'ci_high': 0.590296696126461}, 'f1': {'mean': 0.5403300451022222, 'ci_low': 0.5295421265284487, 'ci_high': 0.5513359149299054}, 'rouge_l': {'mean': 0.1510034144871254, 'ci_low': 0.13747583569379748, 'ci_high': 0.16484155835566705},