In [2]:
import json
import os

import hashlib
from typing import List, Dict, Any
from tqdm import tqdm
import sys
import concurrent.futures
import random

from get_text_embedding import get_text_embedding

from dotenv import load_dotenv
from openai import OpenAI
import os, requests

In [3]:
# 1) Install/Imports (run once; skip install if already available)
# %pip install -q sentence-transformers rank_bm25 jieba
#from sentence_transformers import CrossEncoder
from rank_bm25 import BM25Okapi
import jieba, re, numpy as np


In [49]:
# Build windows from existing MinerU chunks: all_pdf_page_chunks.json -> all_pdf_windows.json
import json, re, hashlib
from pathlib import Path

SRC = Path("./all_pdf_page_chunks_mineru.json")          # MinerU output you already have
DST = Path("./all_pdf_windows_mineru.json")              # new, windowized chunks

HDR = re.compile(r"^(#{1,3})\s+.+", re.M)         # #, ##, ### headings
FIG = re.compile(r"!\[[^\]]*\]\([^)]+\)")         # markdown images
TB  = re.compile(r"<table[\s\S]*?</table>", re.I)  # html tables

def _norm(s): return re.sub(r"\s+", " ", (s or "").strip())

def split_by_headers(md: str):
    # keep figure/table blocks as atomic spans
    blocks = []
    # placeholder tokens for figures/tables
    fig_tokens, tab_tokens = [], []
    def _stash(pattern, text, token_prefix):
        out, toks, i = text, [], 0
        for m in pattern.finditer(text):
            tok = f"__{token_prefix}_{i}__"
            toks.append((tok, m.group(0)))
            out = out.replace(m.group(0), tok, 1)
            i += 1
        return out, toks
    text, figs = _stash(FIG, md, "FIG")
    text, tabs = _stash(TB,  text, "TAB")

    # split by headers; if none, single block
    parts = []
    last = 0
    for m in HDR.finditer(text):
        if m.start() > last:
            parts.append(text[last:m.start()])
        parts.append(text[m.start():m.end()])  # header line as its own piece
        last = m.end()
    if last < len(text):
        parts.append(text[last:])

    # restore tokens
    def _restore(s, toks):
        for tok, val in toks:
            s = s.replace(tok, val)
        return s
    parts = [_restore(_restore(p, figs), tabs) for p in parts]
    # drop empties
    parts = [p for p in parts if _norm(p)]
    return parts

def windowize(parts, target=900, overlap=150):
    out, buf = [], ""
    for p in parts:
        p = _norm(p)
        if len(p) <= target:
            if len(buf) + len(p) + 1 <= target:
                buf = (buf + " " + p).strip()
            else:
                if buf: out.append(buf)
                tail = buf[-overlap:] if buf and overlap else ""
                buf = (tail + " " + p).strip()
        else:
            # long piece -> sliding windows
            stride = max(50, target - overlap)
            i = 0
            while i < len(p):
                out.append(_norm(p[i:i+target]))
                i += stride
    if buf: out.append(buf)
    return out

print("Loading MinerU page chunks…")
chunks = json.loads(Path(SRC).read_text(encoding="utf-8"))
print(f"Loaded {len(chunks)} page-chunks")

seen, windows = set(), []
for c in chunks:
    file = c["metadata"]["file_name"]
    page = int(c["metadata"]["page"])+1   # shift 1 , quick fix for the fucked up in mineru process
    parts = split_by_headers(c["content"])
    wins  = windowize(parts, target=900, overlap=150)
    for widx, w in enumerate(wins):
        h = hashlib.md5(_norm(w).encode()).hexdigest()
        if h in seen: 
            continue
        seen.add(h)
        windows.append({
            "id": f"{c['id']}_w{widx}",
            "content": w,
            "metadata": {
                "file_name": file,
                "page": page,         
                "widx": widx
            }
        })

print(f"Built {len(windows)} windows")
DST.write_text(json.dumps(windows, ensure_ascii=False, indent=2), encoding="utf-8")
print(f"Saved -> {DST}")

Loading MinerU page chunks…
Loaded 4272 page-chunks
Built 10053 windows
Saved -> all_pdf_windows_mineru.json


In [52]:
# Add these helpers near your RAG class

from collections import defaultdict
import numpy as np

def page_vote(ranked_chunks, top_m_pages=1, agg="max"):
    """
    ranked_chunks: list already reranked by your reranker (desc)
    Aggregate scores per (file_name, page) and return chunks from the winning page(s).
    """
    buckets = defaultdict(list)
    for c in ranked_chunks:
        key = (c["metadata"]["file_name"], int(c["metadata"]["page"]))
        s = c.get("rerank_score", c.get("ret_score", 0.0))
        buckets[key].append(float(s))

    def agg_fn(v):
        v = np.asarray(v, float)
        if agg == "sum": return v.sum()
        if agg == "mean": return v.mean()
        return v.max()

    scored_pages = sorted([(k, agg_fn(v)) for k, v in buckets.items()], key=lambda x: x[1], reverse=True)
    keep_keys = set(k for k,_ in scored_pages[:top_m_pages])
    return [c for c in ranked_chunks if (c["metadata"]["file_name"], int(c["metadata"]["page"])) in keep_keys]

def expand_neighbors_on_page(page_chunks, radius=1, max_total=8):
    # Assumes all chunks are from the same (file,page); use their widx to add neighbors
    page_chunks = sorted(page_chunks, key=lambda x: x["metadata"].get("widx", 0))
    out = []
    picked = set()
    for c in page_chunks[:max_total]:
        w = c["metadata"].get("widx", 0)
        for j in range(max(0, w - radius), min(len(page_chunks), w + radius + 1)):
            cj = page_chunks[j]
            key = cj["metadata"].get("widx", j)
            if key in picked: 
                continue
            picked.add(key); out.append(cj)
            if len(out) >= max_total: 
                return out
    return out or page_chunks[:max_total]


In [4]:
class SiliconFlowReranker:
    def __init__(self, api_key: str = None,
                 model: str = "BAAI/bge-reranker-v2-m3",
                 endpoint: str = "https://api.siliconflow.cn/v1/rerank",
                 timeout: int = 30):
        self.api_key = os.getenv('LOCAL_API_KEY')
        if not self.api_key:
            raise ValueError("Missing SILICONFLOW_API_KEY")
        self.model = model
        self.endpoint = endpoint
        self.timeout = timeout

    def rerank(self, question: str, candidates: list):
        # candidates: list of dicts with keys: 'content', 'metadata': {'file_name','page'}
        docs = [c["content"] for c in candidates]
        payload = {"model": self.model, "query": question, "documents": docs}
        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}
        r = requests.post(self.endpoint, json=payload, headers=headers, timeout=self.timeout)
        r.raise_for_status()
        data = r.json()

        # SiliconFlow returns a list of results with {index, relevance_score}
        # (Long docs are auto-chunked server-side; highest sub-chunk score is used per doc.) :contentReference[oaicite:1]{index=1}
        results = data.get("results") or data.get("data") or []
        # attach scores back to your candidate objects
        scored = []
        for res in results:
            idx = res.get("index")
            if idx is None or idx >= len(candidates): 
                continue
            c = dict(candidates[idx])  # shallow copy
            c["rerank_score"] = float(res.get("relevance_score", 0.0))
            scored.append(c)

        # If API didn’t return for all docs, keep the rest with score 0
        seen = {res.get("index") for res in results if res.get("index") is not None}
        for i, c in enumerate(candidates):
            if i not in seen:
                cc = dict(c)
                cc["rerank_score"] = cc.get("rerank_score", 0.0)
                scored.append(cc)

        return sorted(scored, key=lambda x: x["rerank_score"], reverse=True)


In [5]:
class PageChunkLoader:
    def __init__(self, json_path: str):
        self.json_path = json_path
    def load_chunks(self) -> List[Dict[str, Any]]:
        with open(self.json_path, 'r', encoding='utf-8') as f:
            return json.load(f)

In [6]:
class EmbeddingModel:
    def __init__(self, batch_size: int = 64):
        self.api_key = os.getenv('LOCAL_API_KEY')
        self.base_url = os.getenv('LOCAL_BASE_URL')
        self.embedding_model = os.getenv('LOCAL_EMBEDDING_MODEL')
        self.batch_size = batch_size
        if not self.api_key or not self.base_url:
            raise ValueError('请在.env中配置LOCAL_API_KEY和LOCAL_BASE_URL')

    def embed_texts(self, texts: List[str]) -> List[List[float]]:
        return get_text_embedding(
            texts,
            api_key=self.api_key,
            base_url=self.base_url,
            embedding_model=self.embedding_model,
            batch_size=self.batch_size
        )

    def embed_text(self, text: str) -> List[float]:
        return self.embed_texts([text])[0]

In [7]:
class SimpleVectorStore:
    def __init__(self):
        self.embeddings = []
        self.chunks = []
    def add_chunks(self, chunks: List[Dict[str, Any]], embeddings: List[List[float]]):
        self.chunks.extend(chunks)
        self.embeddings.extend(embeddings)
    def search(self, query_embedding: List[float], top_k: int = 3) -> List[Dict[str, Any]]:
        from numpy import dot
        from numpy.linalg import norm
        import numpy as np
        if not self.embeddings:
            return []
        emb_matrix = np.array(self.embeddings)
        query_emb = np.array(query_embedding)
        sims = emb_matrix @ query_emb / (norm(emb_matrix, axis=1) * norm(query_emb) + 1e-8)
        idxs = sims.argsort()[::-1][:top_k]
        return [self.chunks[i] for i in idxs]

In [None]:
from typing import Dict, Any, List, Optional
import os
from openai import OpenAI

class SimpleRAG:
    def __init__(
        self,
        chunk_json_path: str,
        model_path: str = None,
        batch_size: int = 32,
        use_rerank: bool = False,
        candidate_k: int = 120,
        final_k: int = 5,
        reranker=None,  # expect your HybridReranker instance
    ):
        self.loader = PageChunkLoader(chunk_json_path)
        self.embedding_model = EmbeddingModel(batch_size=batch_size)
        self.vector_store = SimpleVectorStore()

        # Rerank controls
        self.use_rerank = use_rerank
        self.candidate_k = candidate_k
        self.final_k = final_k
        self.reranker = reranker

    def setup(self):
        print("加载所有页chunk...")
        chunks = self.loader.load_chunks()
        print(f"共加载 {len(chunks)} 个chunk")
        print("生成嵌入...")
        embeddings = self.embedding_model.embed_texts([c["content"] for c in chunks])
        print("存储向量...")
        self.vector_store.add_chunks(chunks, embeddings)
        print("RAG向量库构建完成！")

    def query(self, question: str, top_k: int = 3) -> Dict[str, Any]:
        q_emb = self.embedding_model.embed_text(question)
        results = self.vector_store.search(q_emb, top_k)
        return {"question": question, "chunks": results}

    def _build_context(self, items: List[Dict[str, Any]]) -> str:
        return "\n".join(
            [
                f"[文件名]{c['metadata']['file_name']} [页码]{c['metadata']['page']}\n{c['content']}"
                for c in items
            ]
        )

    def generate_answer(self, question: str, top_k: int = 3) -> Dict[str, Any]:
        """
        检索+大模型生成式回答，返回结构化结果
        """
        qwen_api_key = os.getenv("LOCAL_API_KEY")
        qwen_base_url = os.getenv("LOCAL_BASE_URL")
        qwen_model = os.getenv("LOCAL_TEXT_MODEL")
        if not qwen_api_key or not qwen_base_url or not qwen_model:
            raise ValueError("请在.env中配置LOCAL_API_KEY、LOCAL_BASE_URL、LOCAL_TEXT_MODEL")

        # ------ Retrieval (+ optional rerank) ------
        q_emb = self.embedding_model.embed_text(question)

        if self.use_rerank and self.reranker is not None:
            # Stage A: larger pool
            candidates = self.vector_store.search(q_emb, self.candidate_k)
            if candidates:
                # Stage B: rerank
                ranked = self.reranker.rerank(question, candidates)

                #### CHANGE PAGE VOTE START ###
                page_best = page_vote(ranked, top_m_pages=1, agg="max")

                try:
                    page_best = expand_neighbors_on_page(page_best, radius=1, max_total=self.final_k)
                except Exception:
                    # Fallback if widx missing: just take top-K from this page
                    page_best = page_best[: self.final_k]
                    
                chunks = page_best if page_best else ranked[: self.final_k]
                #chunks = ranked[: self.final_k]
            else:
                chunks = []
        else:
            # Baseline
            chunks = self.vector_store.search(q_emb, top_k)

        # Build LLM context
        context = self._build_context(chunks)

        # ------ LLM call (unchanged) ------
        prompt = (
            "你是一名专业的金融分析助手，请根据以下检索到的内容回答用户问题。\n"
            "请严格按照如下JSON格式输出：\n"
            '{"answer": "你的简洁回答", "filename": "来源文件名", "page": "来源页码"}\n'
            f"检索内容：\n{context}\n\n问题：{question}\n"
            "请确保输出内容为合法JSON字符串，不要输出多余内容。"
        )

        client = OpenAI(api_key=qwen_api_key, base_url=qwen_base_url)
        completion = client.chat.completions.create(
            model=qwen_model,
            messages=[
                {"role": "system", "content": "你是一名专业的金融分析助手。"},
                {"role": "user", "content": prompt},
            ],
            temperature=0.2,
            max_tokens=1024,
        )

        import json as pyjson
        from extract_json_array import extract_json_array

        raw = completion.choices[0].message.content.strip()
        json_str = extract_json_array(raw, mode="objects")

        if json_str:
            try:
                arr = pyjson.loads(json_str)
                if isinstance(arr, list) and arr:
                    j = arr[0]
                    answer = j.get("answer", "")
                    filename = j.get("filename", "")
                    page = j.get("page", "")
                else:
                    answer = raw
                    filename = chunks[0]["metadata"]["file_name"] if chunks else ""
                    page = chunks[0]["metadata"]["page"] if chunks else ""
            except Exception:
                answer = raw
                filename = chunks[0]["metadata"]["file_name"] if chunks else ""
                page = chunks[0]["metadata"]["page"] if chunks else ""
        else:
            answer = raw
            filename = chunks[0]["metadata"]["file_name"] if chunks else ""
            page = chunks[0]["metadata"]["page"] if chunks else ""

        # ------ Source fallback/override from best evidence (place it HERE) ------
        if chunks:
            best_file = chunks[0]["metadata"]["file_name"]
            best_page = chunks[0]["metadata"]["page"]
            # Fallback: only fill if model didn't provide values
            if not filename:
                filename = best_file
            if not page:
                page = best_page
            # If you prefer to ALWAYS trust retrieval/rerank for grading:
            # filename, page = best_file, best_page

        # Final return
        return {
            "question": question,
            "answer": answer,
            "filename": filename,
            "page": page,
            "retrieval_chunks": chunks,
        }

In [53]:
# 1) Imports & Paths
from pathlib import Path
import os, json, random
from tqdm.auto import tqdm
import concurrent.futures
import math

# Notebook is in .../notebook; project root is parent
NOTEBOOK_DIR = Path.cwd()
PROJ_ROOT = NOTEBOOK_DIR.parent

# Try common locations for train.json
CANDIDATE_TRAIN = [
    PROJ_ROOT / "datas" / "train.json",
    PROJ_ROOT / "data" / "train.json",
    NOTEBOOK_DIR / "datas" / "train.json",
    NOTEBOOK_DIR / "data" / "train.json",
]
TRAIN_PATH = next((p for p in CANDIDATE_TRAIN if p.exists()), None)
if TRAIN_PATH is None:
    raise FileNotFoundError(f"train.json not found in: {CANDIDATE_TRAIN}")

# Chunk JSON path (your earlier structure)
CHUNK_JSON_PATH = PROJ_ROOT / "notebook" / "all_pdf_windows_mineru.json"

# Outputs
EVAL_RAW_PATH = PROJ_ROOT / "eval_train_raw.json"
EVAL_SUMMARY_PATH = PROJ_ROOT / "eval_train_scored.json"

print("Notebook Dir:", NOTEBOOK_DIR)
print("Project Root :", PROJ_ROOT)
print("Train JSON   :", TRAIN_PATH)
print("Chunks JSON  :", CHUNK_JSON_PATH)


Notebook Dir: d:\Datawhale\Multimodal-RAG-Competitions\notebook
Project Root : d:\Datawhale\Multimodal-RAG-Competitions
Train JSON   : d:\Datawhale\Multimodal-RAG-Competitions\data\train.json
Chunks JSON  : d:\Datawhale\Multimodal-RAG-Competitions\notebook\all_pdf_windows_mineru.json


In [54]:
# Create your reranker beforehand
# reranker = HybridReranker(model_name="BAAI/bge-reranker-base")
reranker = SiliconFlowReranker()
rag = SimpleRAG(
    chunk_json_path=CHUNK_JSON_PATH,  # or your page/block chunk file
    use_rerank=True,
    candidate_k=120,
    final_k=5,
    reranker=reranker,
)
rag.setup()


加载所有页chunk...
共加载 10053 个chunk
生成嵌入...


Embedding: 100%|██████████| 315/315 [05:58<00:00,  1.14s/batch]

存储向量...
RAG向量库构建完成！





In [30]:
# 3) Load train and sample
with open(TRAIN_PATH, "r", encoding="utf-8") as f:
    train_data = json.load(f)

N = len(train_data)
random.seed(42)

sample_size = max(1, math.ceil(N * 0.10))
all_idx = list(range(N))
sample_idx = sorted(random.sample(all_idx, sample_size)) if sample_size < N else all_idx

print(f"Train size = {N} | Sample size = {len(sample_idx)}")
sample_idx[:10]


Train size = 118 | Sample size = 12


[3, 13, 14, 17, 28, 31, 35, 69, 81, 86]

In [31]:
# 4) Jaccard helper
def jaccard_char(a: str, b: str) -> float:
    a = (a or "").strip()
    b = (b or "").strip()
    if not a and not b:
        return 1.0
    set_a, set_b = set(a), set(b)
    union = set_a | set_b
    inter = set_a & set_b
    return len(inter) / len(union) if union else 0.0


In [55]:
# 5) Inference
def run_one(idx):
    item = train_data[idx]
    q = item.get("question", "")
    tqdm.write(f"[{sample_idx.index(idx)+1}/{len(sample_idx)}] {q[:60]}...")
    pred = rag.generate_answer(q, top_k=5)
    return idx, pred

results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as ex:
    for out in tqdm(ex.map(run_one, sample_idx), total=len(sample_idx), desc="Infer on train sample"):
        results.append(out)

# Save raw (idx, pred) for debugging
with open(EVAL_RAW_PATH, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
print(f"Saved raw predictions to: {EVAL_RAW_PATH}")


[1/12] 联邦制药的UBT37034在超重/肥胖适应症方面取得了哪些临床前数据？...
[2/12] 根据华创证券对凌云股份的深度研究报告，请问该公司在2024年的主要产品收入占比是多少？...
[3/12] 关于凌云股份（600480）的热冲压技术应用和发展前景，能否详细解释热冲压成型工艺与冷冲压成型工艺的主要区别？...
[4/12] 关于凌云股份（600480）的德国WAG业务板块及客户情况，请问具体有哪些主要客户？...
[5/12] 广联达的数字施工业务在2020年的资金压力如何？与同行业其他企业相比，其资金压力有何特点？...
[6/12] 如何评估广联达在数字化转型过程中面临的挑战及其应对策略？...
[7/12] 如何评估广联达在数字化转型中的竞争优势？...
[8/12] 如何分析广联达（002410.SZ）在2021年的PS估值水平及其与可比公司的差异？...
[9/12] 千味央厨的餐饮大客户经营数据在2022年第三季度有何变化？...
[10/12] 千味央厨公司在2020年的毛利率受原材料价格波动影响如何？...


Infer on train sample:   8%|▊         | 1/12 [00:33<06:13, 33.99s/it]

[11/12] 关于伊利股份的历史发展和市场竞争，请问在2005年至2013年间，伊利如何通过创新产品和营销策略实现营收突破100亿大关...


Infer on train sample:   8%|▊         | 1/12 [00:37<06:13, 33.99s/it]

[12/12] 广联达（002410）的数字设计业务在2021年下半年将如何推进？...


Infer on train sample: 100%|██████████| 12/12 [05:10<00:00, 25.84s/it]

Saved raw predictions to: d:\Datawhale\Multimodal-RAG-Competitions\eval_train_raw.json





## 6) Scoring vs Ground Truth
Score per item:
- page_match: 1 if exact page equals, else 0 (×0.25)
- filename_match: 1 if exact filename equals, else 0 (×0.25)
- answer_jaccard: char Jaccard (×0.5)


In [56]:
# 6) Score predictions
idx2pred = {idx: pred for idx, pred in results}

scored_rows = []
for idx in sample_idx:
    gt = train_data[idx]
    pred = idx2pred.get(idx, {})

    gt_q = gt.get("question", "")
    gt_a = gt.get("answer", "")
    gt_f = gt.get("filename", "")
    gt_p = gt.get("page", "")

    pr_a = pred.get("answer", "")
    pr_f = pred.get("filename", "")
    pr_p = pred.get("page", "")

    page_match = 1.0 if str(pr_p) == str(gt_p) else 0.0
    filename_match = 1.0 if str(pr_f) == str(gt_f) else 0.0
    answer_sim = jaccard_char(str(pr_a), str(gt_a))

    score = 0.25 * page_match + 0.25 * filename_match + 0.5 * answer_sim

    scored_rows.append({
        "idx": idx,
        "question": gt_q,
        "gt_answer": gt_a,
        "pr_answer": pr_a,
        "gt_filename": gt_f,
        "pr_filename": pr_f,
        "gt_page": gt_p,
        "pr_page": pr_p,
        "page_match": page_match,
        "filename_match": filename_match,
        "answer_jaccard": answer_sim,
        "score": score,
    })

# Sort by score ascending to inspect weak cases first
scored_rows_sorted = sorted(scored_rows, key=lambda x: x["score"])

with open(EVAL_SUMMARY_PATH, "w", encoding="utf-8") as f:
    json.dump(scored_rows_sorted, f, ensure_ascii=False, indent=2)

print(f"Saved scored results to: {EVAL_SUMMARY_PATH}")
print(f"max score: {max(r['score'] for r in scored_rows_sorted)}")
print(f"Mean score: {sum(r['score'] for r in scored_rows_sorted)/len(scored_rows_sorted):.4f}")
print(f"min score: {min(r['score'] for r in scored_rows_sorted) }")
print(f"Mean Jaccard: {sum(r['answer_jaccard'] for r in scored_rows_sorted)/len(scored_rows_sorted):.4f}")
print(f"Filename exact@1: {sum(r['filename_match'] for r in scored_rows_sorted)/len(scored_rows_sorted):.4f}")
print(f"Page exact@1: {sum(r['page_match'] for r in scored_rows_sorted)/len(scored_rows_sorted):.4f}")


Saved scored results to: d:\Datawhale\Multimodal-RAG-Competitions\eval_train_scored.json
max score: 0.803030303030303
Mean score: 0.3274
min score: 0.05188679245283019
Mean Jaccard: 0.2380
Filename exact@1: 0.5000
Page exact@1: 0.3333


Saved scored results to: d:\Datawhale\Multimodal-RAG-Competitions\eval_train_scored.json  
max score: 0.5576923076923077  
Mean score: 0.3201  
min score: 0.08403361344537816  
Mean Jaccard: 0.3068  
Filename exact@1: 0.5833  
Page exact@1: 0.0833  

Mineru + BN25
max score: 0.4962686567164179  
Mean score: 0.3231  
min score: 0.09130434782608696  
Mean Jaccard: 0.3128
Filename exact@1: 0.5833  
Page exact@1: 0.0833  

Note: With BM25 rerank :
max score: 0.5732323232323233  
Mean score: 0.2638   
min score: 0.09668508287292818  
Mean Jaccard: 0.3193  
Filename exact@1: 0.4167  
Page exact@1: 0.0000  

In [57]:
# Show a couple of worst and best cases inline (adjust k as needed)
k = 3
print("— Worst cases —")
for r in scored_rows_sorted[:k]:
    print("\nScore:", r["score"])
    print("Q:", r["question"])
    print("GT:", r["gt_answer"])
    print("PR:", r["pr_answer"])
    print("GT file/page:", r["gt_filename"], r["gt_page"])
    print("PR file/page:", r["pr_filename"], r["pr_page"])

print("\n— Best cases —")
for r in scored_rows_sorted[-k:]:
    print("\nScore:", r["score"])
    print("Q:", r["question"])
    print("GT:", r["gt_answer"])
    print("PR:", r["pr_answer"])
    print("GT file/page:", r["gt_filename"], r["gt_page"])
    print("PR file/page:", r["pr_filename"], r["pr_page"])


— Worst cases —

Score: 0.05188679245283019
Q: 千味央厨的餐饮大客户经营数据在2022年第三季度有何变化？
GT: 根据图片中的图表32和文字内容，千味央厨的餐饮大客户经营数据在2022年第三季度（22Q3）的变化如下：

- 肯德基：同店营收同比持平（+0%）
- 必胜客：同店营收同比增长2%（+2%）

这些数据表明，尽管2022年第二季度（22Q2）相比2022年第一季度（22Q1）出现了显著下滑（16%/-15%），但在第三季度（22Q3）都出现了明显的改善。这反映了
PR: 2022年直营大客户数量稍有减少，主要系外部不利因素影响部分企业持续经营
GT file/page: 千味央厨-千寻百味乘势而上-22122726页.pdf 16
PR file/page: 千味央厨-公司研究报告-深耕餐饮供应链为人间千味-23011258页.pdf 37

Score: 0.08280254777070063
Q: 如何评估广联达在数字化转型过程中面临的挑战及其应对策略？
GT: 广联达在数字化转型过程中面临的挑战主要包括以下几个方面：

1. **劳动力问题**：
   - 工人的体力劳动量较大，施工工作环境较苦，收入较低，导致施工人员流动性大。
   - 老龄化现象严重，30岁及以上比例的工人占比逐年提升，接近8成。
   - 受教育程度低，建筑施工以简单重复性工作为主，难以提供良好的职业上升通道。

2. **项目变化频繁**：
   - 建筑项目的设计、预算和时间表可能随时发生变化，影响工期与成本。
   - 变
PR: 广联达在数字化转型中面临地区云转型未完成、施工市场渗透率偏低及系统化需求挑战，应对策略包括加速云转型、推进SaaS模式及构建平台化解决方案。
GT file/page: 广联达-再谈广联达当前时点下如何看待其三条增长曲线-220217131页.pdf 32
PR file/page: 广联达-造价筑底施工决胜-22041233页.pdf 1

Score: 0.08680555555555555
Q: 广联达的数字施工业务在2020年的资金压力如何？与同行业其他企业相比，其资金压力有何特点？
GT: 根据图片中的图表和文字内容，可以得出以下结论：

1. **资金压力情况**：
   - 图表3

### Notes
- Set a different **sample fraction** by changing the `0.10` in `math.ceil(N * 0.10)`.
- If `filename`/`page` in ground truth differ in minor formatting (e.g., case, spaces), add normalization before comparison.
- You can plug this same scorer later for validation on a dev split.
