In [1]:
import pyterrier as pt
import pyterrier_rag.readers
from pyterrier_rag.backend import HuggingFaceBackend
import pandas as pd
import torch, time, os
import agent_framework
from agent_framework import R1Searcher
from agent_framework import SearchR1

ds = pt.get_dataset('rag:nq') # or rag:hotpotqa
TOPICS = ds.get_topics('dev').head(100)
QRELS  = ds.get_answers('dev')

sparse_index = pt.Artifact.from_hf('pyterrier/ragwiki-terrier')
bm25_ret = pt.rewrite.tokenise() >> sparse_index.bm25(include_fields=['docno', 'text', 'title'], threads=5, verbose = True) >> pt.rewrite.reset()

MEAS = [
    pyterrier_rag.measures.F1,
    pyterrier_rag.measures.EM,
]

INFO 08-28 23:09:09 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 08-28 23:09:09 [__init__.py:239] Automatically detected platform cuda.


Java started (triggered by _pt_tokeniser) and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
  warn(
  warn(


In [2]:
def run_f1_em(name, system, batch_size=8):
    df = pt.Experiment(
        [system], TOPICS, QRELS, MEAS,
        names=[name], batch_size=batch_size, verbose=True
    )
    out_path = f"f1em_{name}.csv"
    df.to_csv(out_path, index=False)
    print("Saved:", out_path)
    return out_path
    
def get_gpu_mem(device=0):
    """返回 GPU 占用 (GB)，HF 和 vLLM 通用"""
    import subprocess
    try:
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,nounits,noheader"],
            encoding="utf-8", stdout=subprocess.PIPE
        )
        mems = [float(x) for x in result.stdout.strip().split("\n")]
        return mems[device] / 1024
    except Exception as e:
        print("GPU usage read error:", e)
        return 0.0
        
def run_efficiency(name, system):
    # 端到端时间 + 迭代次数 + GPU峰值
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()

    t0 = time.time()
    out = system.transform(TOPICS.copy())
    total = time.time() - t0
    mrt = total / len(TOPICS)

    turns = out.get('search_iterations', pd.Series([0]*len(out))).astype(int).mean()
    gpu_gb = get_gpu_mem()

    rec = {'name': name, 'MRT': mrt, 'Turns': turns, 'GPU_GB': gpu_gb}
    df = pd.DataFrame([rec])
    out_path = f"eff_{name}.csv"
    df.to_csv(out_path, index=False)
    print("Saved:", out_path)
    return out_path

In [3]:
# dataset =  pt.get_dataset('rag:nq')
# from ir_measures import define_byquery
# Iterations = define_byquery(lambda qrels, run: run.iloc[0].iteration, name="Iterations")
# pt.Experiment(
#     [safe_vllm_7b],
#     dataset.get_topics('dev').head(100), # NB: remove .head(100) to run on all dev topics
#     dataset.get_answers('dev'),
#     [pyterrier_rag.measures.F1, pyterrier_rag.measures.EM, "mrt"],
#     batch_size=8,
#     verbose=True,
#     names=['vLLM-7B']
# )

In [4]:
# 1) vLLM-7B (parallel)
# vllm_7b = R1Searcher(
#     retriever=bm25_ret,
# )
# # 2) HF-7B (sequential) need changes
hf_7b = R1Searcher(
    retriever = bm25_ret,
    use_vllm = False
)

# # 3) vLLM-7B (small-GPU)
# vllm_7b_small = R1Searcher(retriever=bm25_ret,
#     model_id="XXsongLALA/Qwen-2.5-7B-base-RAG-RL",
#     use_vllm=True, model_kw_args=dict(gpu_memory_utilization=0.60, max_model_len=768, tensor_parallel_size=1),
#     max_turn=6, top_k=8, max_tokens=384, verbose=True, prompt_type='v1'
# )
# run_f1_em("vLLM-7B (small-GPU)", vllm_7b_small, batch_size=8)
# run_efficiency("vLLM-7B (small-GPU)", vllm_7b_small)
# del vllm_7b_small; torch.cuda.empty_cache()

# # 4) Tiny model (可选)
# tiny = R1Searcher(retriever=bm25_ret,
#     model_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
#     use_vllm=True, max_turn=6, top_k=8, max_tokens=384, verbose=True, prompt_type='v1'
# )
# run_f1_em("TinyLlama-1.1B (parallel)", tiny, batch_size=8)
# run_efficiency("TinyLlama-1.1B (parallel)", tiny)
# del tiny; torch.cuda.empty_cache()


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[R1Searcher] transformers backend ready: XXsongLALA/Qwen-2.5-7B-base-RAG-RL on cuda:0


In [5]:
def ensure_qanswer_str(df):
    # 没有该列，新建；有 None，用空串代替
    if 'qanswer' not in df.columns:
        df['qanswer'] = ""
    else:
        df['qanswer'] = df['qanswer'].fillna("").astype(str)
    return df

# 正确：使用 pt.apply.generic 构造一个 transformer
hf_7b = hf_7b >> pt.apply.generic(ensure_qanswer_str)

In [6]:
run_f1_em("HF-7B (sequential)", hf_7b, batch_size=8)
run_efficiency("HF-7B (sequential)", hf_7b)

pt.Experiment:   0%|          | 0/13 [00:00<?, ?batches/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.

TerrierRetr(BM25):   0%|          | 0/5 [00:00<?, ?q/s][A
TerrierRetr(BM25):  20%|██        | 1/5 [00:00<00:01,  2.03q/s][A
TerrierRetr(BM25):  60%|██████    | 3/5 [00:00<00:00,  4.06q/s][A
TerrierRetr(BM25):  80%|████████  | 4/5 [00:01<00:00,  2.60q/s][A
TerrierRetr(BM25): 100%|██████████| 5/5 [00:01<00:00,  2.93q/s][A
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
pt.Experiment:   8%|▊         | 1/13 [00:13<02:47, 13.99s/batches]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.

TerrierRetr(BM25):   0%|    

Saved: f1em_HF-7B (sequential).csv


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.40 GiB. GPU 0 has a total capacity of 23.69 GiB of which 1.12 GiB is free. Process 1228150 has 22.56 GiB memory in use. Of the allocated memory 19.61 GiB is allocated by PyTorch, and 2.65 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)