In [1]:
from pathlib import Path
import json, re, time
import pandas as pd

REPORT_DIR = Path("/home/dataset-assist-0/annual-reports_output")
OUT_DIR = Path("/home/dataset-assist-0/eval_outputs_geval")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# 断点续跑：已存在就跳过
OUT_JSONL = OUT_DIR / "geval_llama2_13b.jsonl"
OUT_CSV   = OUT_DIR / "geval_llama2_13b.csv"

report_files = sorted(REPORT_DIR.glob("*_short_report*.txt"))
print("reports:", len(report_files), "example:", report_files[0] if report_files else None)

def load_text(p: Path, max_chars=20000):
    t = p.read_text(encoding="utf-8", errors="ignore")
    t = re.sub(r"\n{3,}", "\n\n", t).strip()
    return t[:max_chars]

# 你可以按需改评分维度（G-Eval）
RUBRIC = """You are an expert buy-side analyst and research editor.
Evaluate the given equity research summary (the "Report") for professional readers.

Score each dimension from 1 to 5:
- Professionalism: finance terminology, tone, concision, no fluff.
- Structure: clear sections, logical flow, scannable bullets where helpful.
- Clarity: unambiguous statements, avoids repetition.
- Actionability: highlights key drivers, risks, and implications.
- Overall: holistic quality.

Return ONLY valid JSON with keys:
professionalism, structure, clarity, actionability, overall, rationale (1 short paragraph).
"""

def build_prompt(report_text: str) -> str:
    return f"""{RUBRIC}

Report:
\"\"\"\n{report_text}\n\"\"\"
"""

print(build_prompt("Example report...")[:400])


reports: 8 example: /home/dataset-assist-0/annual-reports_output/NVIDIA-2018-Annual-Report_short_report.txt
You are an expert buy-side analyst and research editor.
Evaluate the given equity research summary (the "Report") for professional readers.

Score each dimension from 1 to 5:
- Professionalism: finance terminology, tone, concision, no fluff.
- Structure: clear sections, logical flow, scannable bullets where helpful.
- Clarity: unambiguous statements, avoids repetition.
- Actionability: highlights 


In [2]:
import os, sys, subprocess, time, requests, textwrap, signal

MODEL_DIR = "/home/dataset-assist-0/data/models/llama2-13b"
VLLM_BASE = "http://127.0.0.1:8000/v1"

env = os.environ.copy()
env["TRANSFORMERS_OFFLINE"] = "1"
env["HF_HUB_OFFLINE"] = "1"

cmd = [
    sys.executable, "-m", "vllm.entrypoints.openai.api_server",
    "--model", MODEL_DIR,
    "--host", "127.0.0.1", "--port", "8000",
    "--dtype", "auto",
    "--tensor-parallel-size", "2",
    "--max-model-len", "4096",
    "--gpu-memory-utilization", "0.90",
]

proc = subprocess.Popen(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)

# 等待服务起来
ok = False
for _ in range(120):
    try:
        r = requests.get(f"{VLLM_BASE}/models", timeout=1.5)
        if r.status_code == 200:
            ok = True
            break
    except Exception:
        pass
    time.sleep(1)

if not ok:
    # 打印最近日志，方便定位
    lines = []
    try:
        for _ in range(80):
            lines.append(proc.stdout.readline())
    except Exception:
        pass
    raise RuntimeError("vLLM failed to start.\nLast logs:\n" + "".join(lines))

print("vLLM is up:", requests.get(f"{VLLM_BASE}/models", timeout=10).json())

RuntimeError: vLLM failed to start.
Last logs:
[0;36m(APIServer pid=241583)[0;0m INFO 12-29 20:02:24 [api_server.py:1351] vLLM API server version 0.13.0
[0;36m(APIServer pid=241583)[0;0m INFO 12-29 20:02:24 [utils.py:253] non-default args: {'host': '127.0.0.1', 'model': '/home/dataset-assist-0/data/models/llama2-13b', 'max_model_len': 4096, 'tensor_parallel_size': 2}
[0;36m(APIServer pid=241583)[0;0m INFO 12-29 20:02:24 [model.py:514] Resolved architecture: LlamaForCausalLM
[0;36m(APIServer pid=241583)[0;0m INFO 12-29 20:02:24 [model.py:1661] Using max model len 4096
[0;36m(APIServer pid=241583)[0;0m INFO 12-29 20:02:25 [scheduler.py:230] Chunked prefill is enabled with max_num_batched_tokens=2048.
[0;36m(EngineCore_DP0 pid=241726)[0;0m INFO 12-29 20:02:31 [core.py:93] Initializing a V1 LLM engine (v0.13.0) with config: model='/home/dataset-assist-0/data/models/llama2-13b', speculative_config=None, tokenizer='/home/dataset-assist-0/data/models/llama2-13b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False), seed=0, served_model_name=/home/dataset-assist-0/data/models/llama2-13b, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'level': None, 'mode': <CompilationMode.VLLM_COMPILE: 3>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::kda_attention', 'vllm::sparse_attn_indexer'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [2048], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)>, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'eliminate_noops': True, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False}, 'local_cache_dir': None}
[0;36m(EngineCore_DP0 pid=241726)[0;0m WARNING 12-29 20:02:31 [multiproc_executor.py:882] Reducing Torch parallelism from 56 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed.
INFO 12-29 20:02:36 [parallel_state.py:1203] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:35703 backend=nccl
ERROR 12-29 20:02:37 [multiproc_executor.py:751] WorkerProc failed to start.
ERROR 12-29 20:02:37 [multiproc_executor.py:751] Traceback (most recent call last):
ERROR 12-29 20:02:37 [multiproc_executor.py:751]   File "/opt/conda/lib/python3.11/site-packages/vllm/v1/executor/multiproc_executor.py", line 722, in worker_main
ERROR 12-29 20:02:37 [multiproc_executor.py:751]     worker = WorkerProc(*args, **kwargs)
ERROR 12-29 20:02:37 [multiproc_executor.py:751]              ^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 12-29 20:02:37 [multiproc_executor.py:751]   File "/opt/conda/lib/python3.11/site-packages/vllm/v1/executor/multiproc_executor.py", line 553, in __init__
ERROR 12-29 20:02:37 [multiproc_executor.py:751]     self.worker.init_device()
ERROR 12-29 20:02:37 [multiproc_executor.py:751]   File "/opt/conda/lib/python3.11/site-packages/vllm/v1/worker/worker_base.py", line 326, in init_device
ERROR 12-29 20:02:37 [multiproc_executor.py:751]     self.worker.init_device()  # type: ignore
ERROR 12-29 20:02:37 [multiproc_executor.py:751]     ^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 12-29 20:02:37 [multiproc_executor.py:751]   File "/opt/conda/lib/python3.11/site-packages/vllm/v1/worker/gpu_worker.py", line 216, in init_device
ERROR 12-29 20:02:37 [multiproc_executor.py:751]     current_platform.set_device(self.device)
ERROR 12-29 20:02:37 [multiproc_executor.py:751]   File "/opt/conda/lib/python3.11/site-packages/vllm/platforms/cuda.py", line 123, in set_device
ERROR 12-29 20:02:37 [multiproc_executor.py:751]     torch.cuda.set_device(device)
ERROR 12-29 20:02:37 [multiproc_executor.py:751]   File "/opt/conda/lib/python3.11/site-packages/torch/cuda/__init__.py", line 567, in set_device
ERROR 12-29 20:02:37 [multiproc_executor.py:751]     torch._C._cuda_setDevice(device)
ERROR 12-29 20:02:37 [multiproc_executor.py:751] torch.AcceleratorError: CUDA error: out of memory
ERROR 12-29 20:02:37 [multiproc_executor.py:751] Search for `cudaErrorMemoryAllocation' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
ERROR 12-29 20:02:37 [multiproc_executor.py:751] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
ERROR 12-29 20:02:37 [multiproc_executor.py:751] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
ERROR 12-29 20:02:37 [multiproc_executor.py:751] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
ERROR 12-29 20:02:37 [multiproc_executor.py:751] 
INFO 12-29 20:02:37 [multiproc_executor.py:709] Parent process exited, terminating worker
INFO 12-29 20:02:37 [multiproc_executor.py:709] Parent process exited, terminating worker
[W1229 20:02:37.462299527 TCPStore.cpp:340] [c10d] TCP client failed to connect/validate to host 127.0.0.1:35703 - retrying (try=0, timeout=600000ms, delay=474ms): Interrupted system call
Exception raised from delay at /pytorch/torch/csrc/distributed/c10d/socket.cpp:115 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x80 (0x7fc90a22bb80 in /opt/conda/lib/python3.11/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x5ffd531 (0x7fc94c633531 in /opt/conda/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #2: <unknown function> + 0x14a173a (0x7fc947ad773a in /opt/conda/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #3: <unknown function> + 0x60791eb (0x7fc94c6af1eb in /opt/conda/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #4: <unknown function> + 0x6079584 (0x7fc94c6af584 in /opt/conda/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #5: <unknown function> + 0x5ff5e03 (0x7fc94c62be03 in /opt/conda/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #6: c10d::TCPStore::TCPStore(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, c10d::TCPStoreOptions const&) + 0x41d (0x7fc94c63282d in /opt/conda/lib/python3.11/site-packages/torch/lib/libtorch_cpu.so)
frame #7: <unknown function> + 0xd71465 (0x7fc95bd4e465 in /opt/conda/lib/python3.11/site-packages/torch/lib/libtorch_python.so)
frame #8: <unknown function> + 0xdadda6 (0x7fc95bd8ada6 in /opt/conda/lib/python3.11/site-packages/torch/lib/libtorch_python.so)
frame #9: <unknown function> + 0x3cc7bd (0x7fc95b3a97bd in /opt/conda/lib/python3.11/site-packages/torch/lib/libtorch_python.so)
frame #10: /opt/conda/bin/python() [0x528527]
frame #11: _PyObject_MakeTpCall + 0x254 (0x504f04 in /opt/conda/bin/python)
frame #12: /opt/conda/bin/python() [0x556d63]
frame #13: _PyObject_Call + 0x93 (0x542503 in /opt/conda/bin/python)
frame #14: /opt/conda/bin/python() [0x53fe52]
frame #15: /opt/conda/bin/python() [0x50542c]
frame #16: <unknown function> + 0x3caeab (0x7fc95b3a7eab in /opt/conda/lib/python3.11/site-packages/torch/lib/libtorch_python.so)
frame #17: _PyObject_MakeTpCall + 0x254 (0x504f04 in /opt/conda/bin/python)
frame #18: _PyEval_EvalFrameDefault + 0x753 (0x5111d3 in /opt/conda/bin/python)
frame #19: /opt/conda/bin/python() [0x59dcd7]
frame #20: /opt/conda/bin/python() [0x52ea0b]
frame #21: PyObject_Vectorcall + 0x31 (0x51e131 in /opt/conda/bin/python)
frame #22: _PyEval_EvalFrameDefault + 0x753 (0x5111d3 in /opt/conda/bin/python)
frame #23: _PyFunction_Vectorcall + 0x173 (0x5380f3 in /opt/conda/bin/python)
frame #24: PyObject_Call + 0xa2 (0x542292 in /opt/conda/bin/python)
frame #25: _PyEval_EvalFrameDefault + 0x44bd (0x514f3d in /opt/conda/bin/python)
frame #26: _PyFunction_Vectorcall + 0x173 (0x5380f3 in /opt/conda/bin/python)
frame #27: PyObject_Call + 0xa2 (0x542292 in /opt/conda/bin/python)
frame #28: _PyEval_EvalFrameDefault + 0x44bd (0x514f3d in /opt/conda/bin/python)
frame #29: _PyFunction_Vectorcall + 0x173 (0x5380f3 in /opt/conda/bin/python)
frame #30: /opt/conda/bin/python() [0x53fba2]
frame #31: /opt/conda/bin/python() [0x50542c]
frame #32: PyObject_Call + 0x20b (0x5423fb in /opt/conda/bin/python)
frame #33: _PyEval_EvalFrameDefault + 0x44bd (0x514f3d in /opt/conda/bin/python)
frame #34: _PyFunction_Vectorcall + 0x173 (0x5380f3 in /opt/conda/bin/python)
frame #35: PyObject_Call + 0xa2 (0x542292 in /opt/conda/bin/python)
frame #36: _PyEval_EvalFrameDefault + 0x44bd (0x514f3d in /opt/conda/bin/python)
frame #37: /opt/conda/bin/python() [0x5caeae]
frame #38: PyEval_EvalCode + 0x9f (0x5ca4ef in /opt/conda/bin/python)
frame #39: /opt/conda/bin/python() [0x5ec747]
frame #40: /opt/conda/bin/python() [0x5e8af0]
frame #41: PyRun_StringFlags + 0x5f (0x5db1bf in /opt/conda/bin/python)
frame #42: PyRun_SimpleStringFlags + 0x3b (0x5db06b in /opt/conda/bin/python)
frame #43: Py_RunMain + 0x3e8 (0x5f6cd8 in /opt/conda/bin/python)
frame #44: Py_BytesMain + 0x39 (0x5b9a79 in /opt/conda/bin/python)
frame #45: __libc_start_main + 0xf3 (0x7fc96a033083 in /usr/lib/x86_64-linux-gnu/libc.so.6)


In [None]:
import requests, json
from tqdm.auto import tqdm

VLLM_BASE = "http://127.0.0.1:8000/v1"
MODEL_ID = "llama2-13b-judge"

def chat_once(prompt: str, temperature=0.0, max_tokens=400):
    payload = {
        "model": MODEL_ID,
        "messages": [
            {"role": "system", "content": "You are a strict evaluator. Output JSON only."},
            {"role": "user", "content": prompt},
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    r = requests.post(f"{VLLM_BASE}/chat/completions", json=payload, timeout=120)
    r.raise_for_status()
    return r.json()["choices"][0]["message"]["content"]

def extract_json(s: str):
    m = re.search(r"\{.*\}", s, flags=re.S)
    if not m:
        raise ValueError("No JSON found")
    return json.loads(m.group(0))

# 断点续跑：读已有 jsonl
done = set()
rows = []
if OUT_JSONL.exists():
    for line in OUT_JSONL.read_text(encoding="utf-8").splitlines():
        if not line.strip(): 
            continue
        obj = json.loads(line)
        done.add(obj["file"])
        rows.append(obj)

print("already done:", len(done))

with OUT_JSONL.open("a", encoding="utf-8") as f:
    for rp in tqdm(report_files):
        if rp.name in done:
            continue

        report_text = load_text(rp, max_chars=20000)
        prompt = build_prompt(report_text)

        t0 = time.time()
        try:
            raw = chat_once(prompt, temperature=0.0, max_tokens=400)
            j = extract_json(raw)
            rec = {
                "file": rp.name,
                "professionalism": j.get("professionalism"),
                "structure": j.get("structure"),
                "clarity": j.get("clarity"),
                "actionability": j.get("actionability"),
                "overall": j.get("overall"),
                "rationale": j.get("rationale"),
                "latency_s": round(time.time() - t0, 3),
            }
        except Exception as e:
            rec = {
                "file": rp.name,
                "error": str(e),
                "latency_s": round(time.time() - t0, 3),
            }

        f.write(json.dumps(rec, ensure_ascii=False) + "\n")
        f.flush()
        rows.append(rec)
        done.add(rp.name)

df = pd.DataFrame(rows)
df.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
df.head(), df.shape
