In [1]:
import pyterrier as pt
import pyterrier_rag.readers
from pyterrier_rag.backend import HuggingFaceBackend
import pandas as pd
import torch, time, os
import agent_framework
from agent_framework import R1Searcher
from agent_framework import SearchR1

# Evaluation data: the 'rag:nq' dataset (swap in 'rag:hotpotqa' for the alternative).
ds = pt.get_dataset('rag:nq') # or rag:hotpotqa
# Only the first 100 dev topics are evaluated; answers are loaded for the whole dev split.
TOPICS = ds.get_topics('dev').head(100)
QRELS = ds.get_answers('dev')

# Sparse retriever: query tokenisation >> BM25 over the prebuilt index >> pt.rewrite.reset().
sparse_index = pt.Artifact.from_hf('pyterrier/ragwiki-terrier')
bm25_ret = (
    pt.rewrite.tokenise()
    >> sparse_index.bm25(
        include_fields=['docno', 'text', 'title'],
        threads=5,
        verbose=True,
    )
    >> pt.rewrite.reset()
)

# Answer-quality measures passed to pt.Experiment.
MEAS = [pyterrier_rag.measures.F1, pyterrier_rag.measures.EM]

INFO 08-28 23:06:45 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 08-28 23:06:45 [__init__.py:239] Automatically detected platform cuda.


Java started (triggered by _pt_tokeniser) and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
  warn(
  warn(


In [2]:
def run_f1_em(name, system, batch_size=8):
    """Evaluate ``system`` with ``pt.Experiment`` and save the result table as CSV.

    The experiment is run for this single system over the module-level
    ``TOPICS`` and ``QRELS`` with the measures listed in ``MEAS``.

    Args:
        name: Label for the system; used as the experiment name and embedded
            in the output file name.
        system: The transformer/pipeline to evaluate.
        batch_size: Forwarded to ``pt.Experiment``.

    Returns:
        The path of the CSV file written (``f1em_<name>.csv``, relative to the
        current working directory).
    """
    csv_path = f"f1em_{name}.csv"
    results = pt.Experiment(
        [system],
        TOPICS,
        QRELS,
        MEAS,
        names=[name],
        batch_size=batch_size,
        verbose=True,
    )
    results.to_csv(csv_path, index=False)
    print("Saved:", csv_path)
    return csv_path
    
def get_gpu_mem(device=0):
    """Return the memory currently used on one GPU, as reported by ``nvidia-smi``.

    The value is read from the driver rather than from a framework allocator,
    so it works the same for HF and vLLM backends.

    Args:
        device: Index into the list of GPUs printed by ``nvidia-smi``.

    Returns:
        ``memory.used`` for that GPU divided by 1024 (nvidia-smi prints MiB
        when ``nounits`` is requested), or ``0.0`` if the value could not be
        read for any reason (tool missing, non-zero exit, timeout, unparsable
        output, device index out of range). Failures are printed, not raised.
    """
    import subprocess
    try:
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"],
            encoding="utf-8",
            stdout=subprocess.PIPE,
            check=True,   # surface a failing nvidia-smi as a clear error, not a float() parse error
            timeout=30,   # never let a hung driver query block the notebook
        )
        # One line per GPU; skip blank lines so stray newlines do not break parsing.
        mems = [float(line) for line in result.stdout.splitlines() if line.strip()]
        return mems[device] / 1024
    except Exception as e:
        # Deliberately best-effort: efficiency reporting must not abort a run.
        print("GPU usage read error:", e)
        return 0.0
        
def run_efficiency(name, system):
    """Time one end-to-end pass of ``system`` over ``TOPICS`` and save efficiency stats.

    Records three numbers in a one-row CSV:

    * ``MRT``    - wall-clock time of ``system.transform`` divided by the number
      of topics (``0.0`` when ``TOPICS`` is empty).
    * ``Turns``  - mean of the ``search_iterations`` output column; a missing
      column, or missing/non-numeric entries, count as 0 turns.
    * ``GPU_GB`` - value returned by ``get_gpu_mem()`` *after* the run. This is
      a snapshot of current usage, not a peak.

    Args:
        name: Label for the system; stored in the row and embedded in the
            output file name.
        system: The transformer/pipeline to time.

    Returns:
        The path of the CSV file written (``eff_<name>.csv``, relative to the
        current working directory).
    """
    # NOTE(review): torch's peak counters are reset here but never read below;
    # the reported GPU figure comes from nvidia-smi via get_gpu_mem().
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()

    n_topics = len(TOPICS)
    t0 = time.time()
    out = system.transform(TOPICS.copy())
    total = time.time() - t0
    # Guard against an empty topic set instead of raising ZeroDivisionError.
    mrt = total / n_topics if n_topics else 0.0

    if 'search_iterations' in out.columns and len(out):
        # Coerce first so NaN/None/non-numeric entries do not make astype(int) raise.
        iterations = pd.to_numeric(out['search_iterations'], errors='coerce')
        turns = iterations.fillna(0).astype(int).mean()
    else:
        turns = 0.0
    gpu_gb = get_gpu_mem()

    rec = {'name': name, 'MRT': mrt, 'Turns': turns, 'GPU_GB': gpu_gb}
    out_path = f"eff_{name}.csv"
    pd.DataFrame([rec]).to_csv(out_path, index=False)
    print("Saved:", out_path)
    return out_path

In [3]:
# NOTE(review): disabled draft of a pt.Experiment run that reports F1, EM and "mrt"
# in a single call. It defines an `Iterations` measure but does not add it to the
# measure list, and it references `safe_vllm_7b`, which is only created in a later
# cell - so it cannot simply be uncommented where it stands.
# dataset =  pt.get_dataset('rag:nq')
# from ir_measures import define_byquery
# Iterations = define_byquery(lambda qrels, run: run.iloc[0].iteration, name="Iterations")
# pt.Experiment(
#     [safe_vllm_7b],
#     dataset.get_topics('dev').head(100), # NB: remove .head(100) to run on all dev topics
#     dataset.get_answers('dev'),
#     [pyterrier_rag.measures.F1, pyterrier_rag.measures.EM, "mrt"],
#     batch_size=8,
#     verbose=True,
#     names=['vLLM-7B']
# )

In [4]:
# 1) vLLM-7B (parallel): only the retriever is supplied, so every other
#    R1Searcher option keeps its default value.
vllm_7b = R1Searcher(retriever=bm25_ret)
# # 2) HF-7B (sequential) - TODO: needs changes (the disabled call below still sets use_vllm=True)
# vllm_7b = R1Searcher(retriever=bm25_ret,
#     model_id="XXsongLALA/Qwen-2.5-7B-base-RAG-RL",
#     use_vllm=True, max_turn=6, top_k=8, max_tokens=512, verbose=True, prompt_type='v1'
# )
# run_f1_em("vLLM-7B (parallel)", vllm_7b, batch_size=8)
# run_efficiency("vLLM-7B (parallel)", vllm_7b)
# del vllm_7b; torch.cuda.empty_cache()

# # 3) vLLM-7B (small-GPU)
# vllm_7b_small = R1Searcher(retriever=bm25_ret,
#     model_id="XXsongLALA/Qwen-2.5-7B-base-RAG-RL",
#     use_vllm=True, model_kw_args=dict(gpu_memory_utilization=0.60, max_model_len=768, tensor_parallel_size=1),
#     max_turn=6, top_k=8, max_tokens=384, verbose=True, prompt_type='v1'
# )
# run_f1_em("vLLM-7B (small-GPU)", vllm_7b_small, batch_size=8)
# run_efficiency("vLLM-7B (small-GPU)", vllm_7b_small)
# del vllm_7b_small; torch.cuda.empty_cache()

# # 4) Tiny model (optional)
# tiny = R1Searcher(retriever=bm25_ret,
#     model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
#     use_vllm=True, max_turn=6, top_k=8, max_tokens=384, verbose=True, prompt_type='v1'
# )
# run_f1_em("TinyLlama-1.1B (parallel)", tiny, batch_size=8)
# run_efficiency("TinyLlama-1.1B (parallel)", tiny)
# del tiny; torch.cuda.empty_cache()


INFO 08-28 23:07:17 [config.py:717] This model supports multiple tasks: {'generate', 'embed', 'reward', 'classify', 'score'}. Defaulting to 'generate'.
INFO 08-28 23:07:17 [config.py:2003] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 08-28 23:07:20 [core.py:58] Initializing a V1 LLM engine (v0.8.5.post1) with config: model='XXsongLALA/Qwen-2.5-7B-base-RAG-RL', speculative_config=None, tokenizer='XXsongLALA/Qwen-2.5-7B-base-RAG-RL', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=5096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 08-28 23:07:24 [loader.py:458] Loading weights took 2.62 seconds
INFO 08-28 23:07:24 [gpu_model_runner.py:1347] Model loading took 14.2717 GiB and 3.457278 seconds
INFO 08-28 23:07:26 [kv_cache_utils.py:634] GPU KV cache size: 120,096 tokens
INFO 08-28 23:07:26 [kv_cache_utils.py:637] Maximum concurrency for 5,096 tokens per request: 23.57x
INFO 08-28 23:07:26 [core.py:159] init engine (profile, create kv cache, warmup model) took 2.36 seconds
INFO 08-28 23:07:26 [core_client.py:439] Core engine process 0 ready.
[R1Searcher] vLLM backend ready: XXsongLALA/Qwen-2.5-7B-base-RAG-RL


In [5]:
def ensure_qanswer_str(df):
    """Return a copy of ``df`` whose ``qanswer`` column exists and holds only strings.

    * If the column is absent it is created and filled with empty strings.
    * If it is present, missing values (None/NaN) become ``""`` and every
      other value is converted with ``str``.

    The input frame is left untouched: a new DataFrame is returned so the
    output of the upstream transformer is not modified as a side effect.

    Args:
        df: DataFrame produced by the preceding pipeline stage.

    Returns:
        A new DataFrame with the same rows and columns plus a string-typed
        ``qanswer`` column.
    """
    if 'qanswer' in df.columns:
        answers = df['qanswer'].fillna("").astype(str)
    else:
        answers = ""
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(qanswer=answers)

# Wrap the clean-up function with pt.apply.generic so it becomes a transformer
# that can be chained after the searcher.
safe_vllm_7b = (
    vllm_7b
    >> pt.apply.generic(ensure_qanswer_str)
)

In [6]:
# Answer quality first: pt.Experiment over TOPICS/QRELS, saved to f1em_<name>.csv.
run_f1_em("vLLM-7B (parallel)", safe_vllm_7b, batch_size=8)
# Then a separate, second pass of the same pipeline over TOPICS to record
# timing, mean search turns and GPU memory in eff_<name>.csv.
run_efficiency("vLLM-7B (parallel)", safe_vllm_7b)

pt.Experiment:   0%|          | 0/13 [00:00<?, ?batches/s]

Processed prompts:   0%|          | 0/8 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


TerrierRetr(BM25):   0%|          | 0/8 [00:00<?, ?q/s][A
TerrierRetr(BM25):  12%|█▎        | 1/8 [00:02<00:16,  2.37s/q][A
TerrierRetr(BM25):  25%|██▌       | 2/8 [00:02<00:06,  1.12s/q][A
TerrierRetr(BM25):  50%|█████     | 4/8 [00:02<00:01,  2.01q/s][A
TerrierRetr(BM25):  75%|███████▌  | 6/8 [00:03<00:00,  3.32q/s][A
TerrierRetr(BM25):  88%|████████▊ | 7/8 [00:03<00:00,  3.19q/s][A
TerrierRetr(BM25): 100%|██████████| 8/8 [00:04<00:00,  1.89q/s][A


Processed prompts:   0%|          | 0/8 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


TerrierRetr(BM25):   0%|          | 0/2 [00:00<?, ?q/s][A
TerrierRetr(BM25):  50%|█████     | 1/2 [00:00<00:00,  5.40q/s][A
TerrierRetr(BM25): 100%|██████████| 2/2 [00:00<00:00,  6.98q/s][A


Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

pt.Experiment:   8%|▊         | 1/13 [00:12<02:30, 12.57s/batches]

Processed prompts:   0%|          | 0/8 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


TerrierRetr(BM25):   0%|          | 0/8 [00:00<?, ?q/s][A
TerrierRetr(BM25):  12%|█▎        | 1/8 [00:00<00:01,  4.81q/s][A
TerrierRetr(BM25):  25%|██▌       | 2/8 [00:00<00:01,  5.38q/s][A
TerrierRetr(BM25):  50%|█████     | 4/8 [00:00<00:00,  7.81q/s][A
TerrierRetr(BM25): 100%|██████████| 8/8 [00:00<00:00, 11.64q/s][A


Processed prompts:   0%|          | 0/8 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


TerrierRetr(BM25):   0%|          | 0/1 [00:00<?, ?q/s][A
TerrierRetr(BM25): 100%|██████████| 1/1 [00:00<00:00,  8.70q/s][A


Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

pt.Experiment:  15%|█▌        | 2/13 [00:21<01:56, 10.56s/batches]

Processed prompts:   0%|          | 0/8 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


TerrierRetr(BM25):   0%|          | 0/8 [00:00<?, ?q/s][A
TerrierRetr(BM25):  12%|█▎        | 1/8 [00:00<00:01,  5.84q/s][A
TerrierRetr(BM25):  25%|██▌       | 2/8 [00:00<00:00,  6.28q/s][A
TerrierRetr(BM25):  75%|███████▌  | 6/8 [00:00<00:00, 13.39q/s][A
TerrierRetr(BM25): 100%|██████████| 8/8 [00:01<00:00,  7.34q/s][A


Processed prompts:   0%|          | 0/8 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


TerrierRetr(BM25):   0%|          | 0/2 [00:00<?, ?q/s][A
TerrierRetr(BM25): 100%|██████████| 2/2 [00:00<00:00,  4.76q/s][A


Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

pt.Experiment:  23%|██▎       | 3/13 [00:31<01:41, 10.13s/batches]

Processed prompts:   0%|          | 0/8 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


TerrierRetr(BM25):   0%|          | 0/8 [00:00<?, ?q/s][A
TerrierRetr(BM25):  12%|█▎        | 1/8 [00:00<00:02,  2.35q/s][A
TerrierRetr(BM25):  25%|██▌       | 2/8 [00:00<00:01,  4.17q/s][A
TerrierRetr(BM25):  38%|███▊      | 3/8 [00:00<00:00,  5.49q/s][A
TerrierRetr(BM25):  62%|██████▎   | 5/8 [00:00<00:00,  7.41q/s][A
TerrierRetr(BM25): 100%|██████████| 8/8 [00:00<00:00,  8.21q/s][A


Processed prompts:   0%|          | 0/8 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


TerrierRetr(BM25):   0%|          | 0/2 [00:00<?, ?q/s][A
TerrierRetr(BM25): 100%|██████████| 2/2 [00:00<00:00,  8.48q/s][A


Processed prompts:   0%|          | 0/2 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

pt.Experiment:  31%|███       | 4/13 [00:42<01:34, 10.51s/batches]

Processed prompts:   0%|          | 0/8 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


TerrierRetr(BM25):   0%|          | 0/8 [00:00<?, ?q/s][A
TerrierRetr(BM25):  12%|█▎        | 1/8 [00:00<00:03,  2.28q/s][A
TerrierRetr(BM25):  25%|██▌       | 2/8 [00:00<00:01,  4.11q/s][A
TerrierRetr(BM25):  75%|███████▌  | 6/8 [00:00<00:00,  9.88q/s][A
TerrierRetr(BM25): 100%|██████████| 8/8 [00:00<00:00,  8.10q/s][A


Processed prompts:   0%|          | 0/8 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

pt.Experiment:  38%|███▊      | 5/13 [00:50<01:17,  9.71s/batches]

Processed prompts:   0%|          | 0/8 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


TerrierRetr(BM25):   0%|          | 0/8 [00:00<?, ?q/s][A
TerrierRetr(BM25):  12%|█▎        | 1/8 [00:00<00:06,  1.09q/s][A
TerrierRetr(BM25):  50%|█████     | 4/8 [00:01<00:00,  4.58q/s][A
TerrierRetr(BM25): 100%|██████████| 8/8 [00:01<00:00,  5.92q/s][A


Processed prompts:   0%|          | 0/8 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


TerrierRetr(BM25):   0%|          | 0/3 [00:00<?, ?q/s][A
TerrierRetr(BM25):  33%|███▎      | 1/3 [00:00<00:00,  4.08q/s][A
TerrierRetr(BM25): 100%|██████████| 3/3 [00:00<00:00,  7.42q/s][A


Processed prompts:   0%|          | 0/3 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

pt.Experiment:  46%|████▌     | 6/13 [01:00<01:07,  9.70s/batches]

Processed prompts:   0%|          | 0/8 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


TerrierRetr(BM25):   0%|          | 0/8 [00:00<?, ?q/s][A
TerrierRetr(BM25):  25%|██▌       | 2/8 [00:00<00:00, 13.56q/s][A
TerrierRetr(BM25):  50%|█████     | 4/8 [00:00<00:00, 12.07q/s][A
TerrierRetr(BM25): 100%|██████████| 8/8 [00:00<00:00, 11.89q/s][A


Processed prompts:   0%|          | 0/7 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


TerrierRetr(BM25):   0%|          | 0/3 [00:00<?, ?q/s][A
TerrierRetr(BM25):  33%|███▎      | 1/3 [00:00<00:00,  5.61q/s][A
TerrierRetr(BM25): 100%|██████████| 3/3 [00:01<00:00,  2.35q/s][A


Processed prompts:   0%|          | 0/3 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

pt.Experiment:  46%|████▌     | 6/13 [01:14<01:26, 12.34s/batches]


KeyboardInterrupt: 