In [1]:
import json
import os

import hashlib
from typing import List, Dict, Any
from tqdm import tqdm
import sys
import concurrent.futures
import random

from get_text_embedding import get_text_embedding

from dotenv import load_dotenv
from openai import OpenAI
import os, requests

In [2]:
from rank_bm25 import BM25Okapi
import jieba, re

def _preprocess(doc: str) -> str:
    # 切分 6-8 位数字（日期/代码），弱化连字符/下划线
    doc = re.sub(r'(\d{6,8})', r' \1 ', doc)
    doc = doc.replace('-', ' ').replace('_', ' ')
    return doc

class FileBM25Index:
    """
    文件级 BM25：用 文件名 + 前3页文本 + 标题/目录 行 来增强“版本可分性”
    """
    def __init__(self, pages_per_file: int = 3, per_page_chars: int = 1500,
                 repeat_filename_tokens: int = 2):
        self.pages_per_file = pages_per_file
        self.per_page_chars = per_page_chars
        self.repeat_filename_tokens = repeat_filename_tokens
        self.files = []
        self.tokens = []
        self.fn_to_idx = {}
        self.bm25 = None

    def build(self, chunks):
        # 收集每文件的若干最早页 & 抓取标题行（MinerU markdown 常有 #/##/表/图）
        pages_by_file = {}
        titles_by_file = {}
        for c in chunks:
            fn = c["metadata"]["file_name"]
            pg = int(c["metadata"]["page"])
            txt = c["content"]
            # 标题/目录/图表行
            titles = "\n".join(
                ln for ln in txt.splitlines()
                if ln.lstrip().startswith(('#','##','表','图'))
            )
            if titles:
                titles_by_file.setdefault(fn, []).append(titles[:500])

            pages_by_file.setdefault(fn, {})
            # 只收每页一次
            if pg not in pages_by_file[fn]:
                pages_by_file[fn][pg] = txt

        self.files, self.tokens, self.fn_to_idx = [], [], {}
        for fn, pgmap in pages_by_file.items():
            first_pages = [pgmap[p] for p in sorted(pgmap)[: self.pages_per_file]]
            titles = "\n".join(titles_by_file.get(fn, []))[:1500]
            # 文档：文件名 + 标题/目录摘要 + 前3页各1500字
            doc = f"{fn}\n{titles}\n" + "\n".join(fp[: self.per_page_chars] for fp in first_pages)
            doc = _preprocess(doc)
            toks = list(jieba.cut(doc))
            # 额外重复文件名分词，放大“版本词/日期码”权重
            toks += list(jieba.cut(_preprocess(fn))) * self.repeat_filename_tokens

            self.fn_to_idx[fn] = len(self.files)
            self.files.append(fn)
            self.tokens.append(toks)

        self.bm25 = BM25Okapi(self.tokens)

    def top_files(self, query, n=25):
        if not self.bm25:
            return []
        q = list(jieba.cut(_preprocess(query)))
        scores = self.bm25.get_scores(q)
        order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
        return [(self.files[i], float(scores[i])) for i in order[:n]]


In [3]:
import re
def rrf(rank, k=90):
    """
    Reciprocal Rank Fusion 基本项。
    约定 rank 为 0-based，这里内部转为 1-based 以避免极端放大。
    k 越大越“温和”，推荐 90。
    """
    r = int(rank) + 1
    return 1.0 / (k + r)


THEME_TOKENS = ["二次创业","三条增长曲线","深度复盘","稳定成长","公司深度报告","再谈","走在修复的道路上"]
def extract_years(q): return set(re.findall(r"20\d{2}", q))

def soft_bias(ranked, question, year_boost=0.15, theme_boost=0.05):
    years = extract_years(question)
    themed = any(t in question for t in THEME_TOKENS)
    for c in ranked:
        fn = c["metadata"]["file_name"]
        s  = c.get("rerank_score", 0.0)
        if years and any(y in fn for y in years): s += year_boost
        if themed: s += theme_boost if any(t in fn for t in THEME_TOKENS) else 0.0
        c["rerank_score"] = s
    ranked.sort(key=lambda x: x["rerank_score"], reverse=True)
    return ranked



In [4]:
from collections import defaultdict
import numpy as np

def file_vote(ranked_chunks, top_n_files=1, agg="sum"):
    buckets = defaultdict(list)
    for c in ranked_chunks:
        fn = c["metadata"]["file_name"]
        s  = c.get("rerank_score", c.get("ret_score", 0.0))
        buckets[fn].append(float(s))

    def agg_fn(v):
        v = np.asarray(v, float)
        return v.sum() if agg=="sum" else (v.mean() if agg=="mean" else v.max())

    scored = sorted([(fn, agg_fn(v)) for fn, v in buckets.items()], key=lambda x: x[1], reverse=True)
    keep = set(fn for fn,_ in scored[:top_n_files])
    kept = [c for c in ranked_chunks if c["metadata"]["file_name"] in keep]
    return kept, (scored[0][0] if scored else None, dict(scored))


In [5]:
# # Build windows from existing MinerU chunks: all_pdf_page_chunks.json -> all_pdf_windows.json RUN ONCE FIX page
# import json, re, hashlib
# from pathlib import Path

# SRC = Path("./all_pdf_page_chunks_mineru.json")          # MinerU output you already have
# DST = Path("./all_pdf_windows_mineru.json")              # new, windowized chunks

# HDR = re.compile(r"^(#{1,3})\s+.+", re.M)         # #, ##, ### headings
# FIG = re.compile(r"!\[[^\]]*\]\([^)]+\)")         # markdown images
# TB  = re.compile(r"<table[\s\S]*?</table>", re.I)  # html tables

# def _norm(s): return re.sub(r"\s+", " ", (s or "").strip())

# def split_by_headers(md: str):
#     # keep figure/table blocks as atomic spans
#     blocks = []
#     # placeholder tokens for figures/tables
#     fig_tokens, tab_tokens = [], []
#     def _stash(pattern, text, token_prefix):
#         out, toks, i = text, [], 0
#         for m in pattern.finditer(text):
#             tok = f"__{token_prefix}_{i}__"
#             toks.append((tok, m.group(0)))
#             out = out.replace(m.group(0), tok, 1)
#             i += 1
#         return out, toks
#     text, figs = _stash(FIG, md, "FIG")
#     text, tabs = _stash(TB,  text, "TAB")

#     # split by headers; if none, single block
#     parts = []
#     last = 0
#     for m in HDR.finditer(text):
#         if m.start() > last:
#             parts.append(text[last:m.start()])
#         parts.append(text[m.start():m.end()])  # header line as its own piece
#         last = m.end()
#     if last < len(text):
#         parts.append(text[last:])

#     # restore tokens
#     def _restore(s, toks):
#         for tok, val in toks:
#             s = s.replace(tok, val)
#         return s
#     parts = [_restore(_restore(p, figs), tabs) for p in parts]
#     # drop empties
#     parts = [p for p in parts if _norm(p)]
#     return parts

# def windowize(parts, target=900, overlap=150):
#     out, buf = [], ""
#     for p in parts:
#         p = _norm(p)
#         if len(p) <= target:
#             if len(buf) + len(p) + 1 <= target:
#                 buf = (buf + " " + p).strip()
#             else:
#                 if buf: out.append(buf)
#                 tail = buf[-overlap:] if buf and overlap else ""
#                 buf = (tail + " " + p).strip()
#         else:
#             # long piece -> sliding windows
#             stride = max(50, target - overlap)
#             i = 0
#             while i < len(p):
#                 out.append(_norm(p[i:i+target]))
#                 i += stride
#     if buf: out.append(buf)
#     return out

# print("Loading MinerU page chunks…")
# chunks = json.loads(Path(SRC).read_text(encoding="utf-8"))
# print(f"Loaded {len(chunks)} page-chunks")

# seen, windows = set(), []
# for c in chunks:
#     file = c["metadata"]["file_name"]
#     page = int(c["metadata"]["page"])+1   # shift 1 , quick fix for the fucked up in mineru process
#     parts = split_by_headers(c["content"])
#     wins  = windowize(parts, target=900, overlap=150)
#     for widx, w in enumerate(wins):
#         h = hashlib.md5(_norm(w).encode()).hexdigest()
#         if h in seen: 
#             continue
#         seen.add(h)
#         windows.append({
#             "id": f"{c['id']}_w{widx}",
#             "content": w,
#             "metadata": {
#                 "file_name": file,
#                 "page": page,         
#                 "widx": widx
#             }
#         })

# print(f"Built {len(windows)} windows")
# DST.write_text(json.dumps(windows, ensure_ascii=False, indent=2), encoding="utf-8")
# print(f"Saved -> {DST}")

In [6]:
from collections import defaultdict
import numpy as np
#   Score drop IDK
# def file_vote(ranked_chunks, top_n_files=1, agg="sum"):
#     """
#     Aggregate scores per filename and keep chunks from the winning file(s).
#     ranked_chunks: reranked list (desc), each with ["metadata"]["file_name"] and "rerank_score".
#     """
#     buckets = defaultdict(list)
#     for c in ranked_chunks:
#         fn = c["metadata"]["file_name"]
#         s = c.get("rerank_score", c.get("ret_score", 0.0))
#         buckets[fn].append(float(s))

#     def agg_fn(v):
#         v = np.asarray(v, float)
#         if agg == "mean": return v.mean()
#         if agg == "max":  return v.max()
#         return v.sum()  # default

#     scored = sorted([(fn, agg_fn(v)) for fn, v in buckets.items()],
#                     key=lambda x: x[1], reverse=True)
#     keep_files = set(fn for fn, _ in scored[:top_n_files])
#     kept = [c for c in ranked_chunks if c["metadata"]["file_name"] in keep_files]
#     # Keep a tiny meta for telemetry
#     return kept, (scored[0][0] if scored else None, dict(scored))

def page_vote(ranked_chunks, top_m_pages=1, agg="max"):
    """
    ranked_chunks: list already reranked by your reranker (desc)
    Aggregate scores per (file_name, page) and return chunks from the winning page(s).
    """
    buckets = defaultdict(list)
    for c in ranked_chunks:
        key = (c["metadata"]["file_name"], int(c["metadata"]["page"]))
        s = c.get("rerank_score", c.get("ret_score", 0.0))
        buckets[key].append(float(s))

    def agg_fn(v):
        v = np.asarray(v, float)
        if agg == "sum": return v.sum()
        if agg == "mean": return v.mean()
        return v.max()

    scored_pages = sorted([(k, agg_fn(v)) for k, v in buckets.items()], key=lambda x: x[1], reverse=True)
    keep_keys = set(k for k,_ in scored_pages[:top_m_pages])
    return [c for c in ranked_chunks if (c["metadata"]["file_name"], int(c["metadata"]["page"])) in keep_keys]

def expand_neighbors_on_page(page_chunks, radius=1, max_total=8):
    # Assumes all chunks are from the same (file,page); use their widx to add neighbors
    page_chunks = sorted(page_chunks, key=lambda x: x["metadata"].get("widx", 0))
    out = []
    picked = set()
    for c in page_chunks[:max_total]:
        w = c["metadata"].get("widx", 0)
        for j in range(max(0, w - radius), min(len(page_chunks), w + radius + 1)):
            cj = page_chunks[j]
            key = cj["metadata"].get("widx", j)
            if key in picked: 
                continue
            picked.add(key); out.append(cj)
            if len(out) >= max_total: 
                return out
    return out or page_chunks[:max_total]


In [7]:
import os, re, time, requests

class SiliconFlowReranker:
    def __init__(self, api_key: str = None,
                 model: str = "BAAI/bge-reranker-v2-m3",
                 endpoint: str = "https://api.siliconflow.cn/v1/rerank",
                 timeout: int = 30,
                 max_retries: int = 4,
                 add_header: bool = True,
                 header_snippet_chars: int = 300):
        self.api_key = api_key or os.getenv('LOCAL_API_KEY')
        if not self.api_key:
            raise ValueError("Missing SILICONFLOW_API_KEY")
        self.model = model
        self.endpoint = endpoint
        self.timeout = timeout
        self.max_retries = max_retries

        # NEW
        self.add_header = add_header
        self.header_snippet_chars = header_snippet_chars

    def _extract_date_from_filename(self, fn: str) -> str:
        """
        Try to parse YYYY[-_.]?MM[-_.]?DD in filename and normalize to YYYY-MM-DD.
        Returns '' if not found.
        """
        m = re.search(r'(20\d{2})[-_.]?(0[1-9]|1[0-2])[-_.]?(0[1-9]|[12]\d|3[01])', fn)
        if not m: return ''
        y, mo, d = m.groups()
        return f"{y}-{mo}-{d}"

    def _make_header(self, c: dict) -> str:
        meta = c.get("metadata", {})
        fn = str(meta.get("file_name", ""))
        pg = str(meta.get("page", ""))
        dt = self._extract_date_from_filename(fn)
        snippet = c.get("content", "")[: self.header_snippet_chars]
        parts = [f"【文件】{fn}", f"【页】{pg}"]
        if dt: parts.append(f"【日期】{dt}")
        parts.append(f"【摘要】{snippet}")
        return "\n".join(parts) + "\n"

    def rerank(self, question: str, candidates: list, return_meta: bool = False):
        """
        Returns:
          ranked_chunks, meta where meta = {"status": http_status or None, "attempts": N}
        Retries on 429/5xx with exponential backoff.
        """
        # Build documents (with optional header)
        if self.add_header:
            documents = [self._make_header(c) + c["content"] for c in candidates]
        else:
            documents = [c["content"] for c in candidates]

        payload = {"model": self.model, "query": question, "documents": documents}
        headers = {"Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json"}

        status = None
        attempts = 0
        data = None

        for attempt in range(self.max_retries):
            attempts = attempt + 1
            try:
                r = requests.post(self.endpoint, json=payload, headers=headers, timeout=self.timeout)
                status = r.status_code
                if status in (429, 500, 502, 503, 504):
                    time.sleep(2 ** attempt)  # backoff
                    continue
                r.raise_for_status()
                data = r.json()
                break
            except requests.RequestException:
                time.sleep(2 ** attempt)

        ranked = []
        if data:
            results = data.get("results") or data.get("data") or []
            seen = set()
            for res in results:
                idx = res.get("index")
                if idx is None or idx >= len(candidates):
                    continue
                seen.add(idx)
                c = dict(candidates[idx])
                c["rerank_score"] = float(res.get("relevance_score", 0.0))
                ranked.append(c)
            # fill any missing
            for i, c in enumerate(candidates):
                if i not in seen:
                    cc = dict(c); cc["rerank_score"] = 0.0
                    ranked.append(cc)
            ranked.sort(key=lambda x: x["rerank_score"], reverse=True)

        meta = {"status": status, "attempts": attempts}
        return (ranked, meta) if return_meta else ranked


In [8]:
class PageChunkLoader:
    def __init__(self, json_path: str):
        self.json_path = json_path
    def load_chunks(self) -> List[Dict[str, Any]]:
        with open(self.json_path, 'r', encoding='utf-8') as f:
            return json.load(f)

In [9]:
class EmbeddingModel:
    def __init__(self, batch_size: int = 64):
        self.api_key = os.getenv('LOCAL_API_KEY')
        self.base_url = os.getenv('LOCAL_BASE_URL')
        self.embedding_model = os.getenv('LOCAL_EMBEDDING_MODEL')
        self.batch_size = batch_size
        if not self.api_key or not self.base_url:
            raise ValueError('请在.env中配置LOCAL_API_KEY和LOCAL_BASE_URL')

    def embed_texts(self, texts: List[str]) -> List[List[float]]:
        return get_text_embedding(
            texts,
            api_key=self.api_key,
            base_url=self.base_url,
            embedding_model=self.embedding_model,
            batch_size=self.batch_size
        )

    def embed_text(self, text: str) -> List[float]:
        return self.embed_texts([text])[0]

In [10]:
class SimpleVectorStore:
    def __init__(self):
        self.embeddings = []
        self.chunks = []
    def add_chunks(self, chunks: List[Dict[str, Any]], embeddings: List[List[float]]):
        self.chunks.extend(chunks)
        self.embeddings.extend(embeddings)
    def search(self, query_embedding: List[float], top_k: int = 3) -> List[Dict[str, Any]]:
        from numpy import dot
        from numpy.linalg import norm
        import numpy as np
        if not self.embeddings:
            return []
        emb_matrix = np.array(self.embeddings)
        query_emb = np.array(query_embedding)
        sims = emb_matrix @ query_emb / (norm(emb_matrix, axis=1) * norm(query_emb) + 1e-8)
        idxs = sims.argsort()[::-1][:top_k]
        return [self.chunks[i] for i in idxs]

In [11]:
from collections import defaultdict

def pick_best_page_and_expand(file_scoped, final_k=5, radius=1):
    # group by page
    by_page = defaultdict(list)
    for c in file_scoped:
        by_page[int(c["metadata"]["page"])].append(c)
    # score each page by max rerank_score
    page_scores = [(pg, max(float(x.get("rerank_score", 0.0)) for x in cs))
                   for pg, cs in by_page.items()]
    best_pg, _ = max(page_scores, key=lambda t: t[1])

    page_chunks = sorted(by_page[best_pg], key=lambda x: x["metadata"].get("widx", 0))
    # expand around the highest-scored chunk on that page
    top_idx = max(range(len(page_chunks)),
                  key=lambda i: float(page_chunks[i].get("rerank_score", 0.0)))
    w0 = page_chunks[top_idx]["metadata"].get("widx", top_idx)

    # collect neighbors by widx
    wanted = {w0}
    for r in range(1, radius+1):
        wanted.add(w0 - r); wanted.add(w0 + r)

    chosen = [c for c in page_chunks if c["metadata"].get("widx", -10**9) in wanted]
    # backfill with other chunks on same page if needed
    if len(chosen) < final_k:
        for c in page_chunks:
            if c not in chosen:
                chosen.append(c)
            if len(chosen) >= final_k: break
    return chosen[:final_k]


In [12]:
from typing import Dict, Any, List, Optional
import os
from openai import OpenAI

class SimpleRAG:
    def __init__(
        self,
        chunk_json_path: str,
        model_path: str = None,
        batch_size: int = 32,
        use_rerank: bool = False,
        candidate_k: int = 120,
        final_k: int = 5,
        reranker=None,
        
    ):
        self.loader = PageChunkLoader(chunk_json_path)
        self.embedding_model = EmbeddingModel(batch_size=batch_size)
        self.vector_store = SimpleVectorStore()
        

        # Rerank controls
        self.use_rerank = use_rerank
        self.candidate_k = candidate_k
        self.final_k = final_k
        self.reranker = reranker
        

        # Behavior flags
        self.lock_source     = bool(int(os.getenv("LOCK_SOURCE",  "0")))
        self.strict_rerank   = bool(int(os.getenv("STRICT_RERANK","0")))
        self.debug_telemetry = bool(int(os.getenv("DEBUG_RAG",    "1")))

        self.limit_per_file = int(os.getenv("LIMIT_PER_FILE", "12"))


        # # Retrieval helpers
        # self.file_bm25 = FileBM25Index(pages_per_file=3, per_page_chars=1500, repeat_filename_tokens=2)
        # self.file_bm25.build(self.chunks)


        # Hold corpus if needed elsewhere
        self.chunks = None                 # <-- set in setup()

    def setup(self):
        print("加载所有页chunk...")
        self.chunks = self.loader.load_chunks()
        print(f"共加载 {len(self.chunks)} 个chunk")

        print("生成嵌入...")
        embeddings = self.embedding_model.embed_texts([c["content"] for c in self.chunks])

        print("存储向量...")
        self.vector_store.add_chunks(self.chunks, embeddings)

        # Build file-level BM25 now that chunks exist
        print("构建文件级BM25索引...")
        self.file_bm25 = FileBM25Index(pages_per_file=3, per_page_chars=1500, repeat_filename_tokens=2)
        self.file_bm25.build(self.chunks)

        print("RAG向量库构建完成！")

    def query(self, question: str, top_k: int = 3) -> Dict[str, Any]:
        q_emb = self.embedding_model.embed_text(question)
        results = self.vector_store.search(q_emb, top_k)
        return {"question": question, "chunks": results}

    def _build_context(self, items: List[Dict[str, Any]]) -> str:
        return "\n".join(
            f"[文件名]{c['metadata']['file_name']} [页码]{c['metadata']['page']}\n{c['content']}"
            for c in items
        )

    def generate_answer(self, question: str, top_k: int = 3) -> Dict[str, Any]:
        qwen_api_key = os.getenv("LOCAL_API_KEY")
        qwen_base_url = os.getenv("LOCAL_BASE_URL")
        qwen_model    = os.getenv("LOCAL_TEXT_MODEL")
        if not qwen_api_key or not qwen_base_url or not qwen_model:
            raise ValueError("请在.env中配置LOCAL_API_KEY、LOCAL_BASE_URL、LOCAL_TEXT_MODEL")

        tele = {"path":"", "rerank_ok":False, "rerank_attempts":0, "rerank_http":None,
                "cand_n":0, "ranked_n":0, "file_vote_best":None, "file_vote_n":0, "final_n":0}

        # Stage-A dense
        q_emb = self.embedding_model.embed_text(question)
        candidates = self.vector_store.search(q_emb, self.candidate_k)
        tele["cand_n"] = len(candidates)

        # Optional per-file diversity cap
        if candidates and self.limit_per_file > 0:
            by_file, diverse = {}, []
            for c in candidates:
                fn = c["metadata"]["file_name"]
                by_file.setdefault(fn, 0)
                if by_file[fn] < self.limit_per_file:
                    diverse.append(c); by_file[fn] += 1
            candidates = diverse

        # RRF fuse with file-level BM25 (only if built)
        if self.file_bm25 and candidates:
            # 记录 BEFORE (先存，再融合)
            pre_rrf_files = list(dict.fromkeys([c["metadata"]["file_name"] for c in candidates]))
            tele["first3_files_before_rrf"] = pre_rrf_files[:3]

            # 参数
            BM25_TOPN = int(os.getenv("BM25_TOPN", "25"))
            RRF_K     = int(os.getenv("RRF_K", "90"))

            # BM25 文件排名
            top_files = self.file_bm25.top_files(question, n=BM25_TOPN)
            tele["bm25_top_files"] = [fn for fn, _ in top_files[:5]]

            # 融合（统一用 RRF_K；rank 约定 0-based -> rrf 内部会 +1）
            file_to_rank = {fn: r for r, (fn, _) in enumerate(top_files)}
            BIG = 10_000

            fused = []
            for i, c in enumerate(candidates):
                fn = c["metadata"]["file_name"]
                dense_rank = i
                bm25_rank  = file_to_rank.get(fn, BIG)
                score = rrf(dense_rank, k=RRF_K) + rrf(bm25_rank, k=RRF_K)
                cc = dict(c); cc["fused_score"] = score
                fused.append(cc)

            candidates = sorted(fused, key=lambda x: x["fused_score"], reverse=True)

        # 记录 AFTER（放在 if 外也行，candidates 一定存在）
        tele["first3_files_after_rrf"] = list(dict.fromkeys([c["metadata"]["file_name"] for c in candidates]))[:3]


        # Rerank → soft bias → file_vote → neighbor expansion
        if self.use_rerank and self.reranker is not None and candidates:
            tele["path"] = "rerank"
            ranked, rr_meta = self.reranker.rerank(question, candidates, return_meta=True)
            tele["rerank_http"] = rr_meta.get("status"); tele["rerank_attempts"] = rr_meta.get("attempts")
            tele["ranked_n"] = len(ranked); tele["rerank_ok"] = bool(ranked)

            if not ranked:
                if self.strict_rerank:
                    raise RuntimeError(f"Rerank failed (status={tele['rerank_http']}) and STRICT_RERANK=1")
                chunks = candidates[: self.final_k]
            else:
                ranked = soft_bias(ranked, question)  # cheap nudge on year/theme
                file_scoped, (best_file, _) = file_vote(ranked, top_n_files=1, agg="max") # Change sum to MAX
                chunks = pick_best_page_and_expand(file_scoped, final_k=self.final_k, radius=1)

                # no page_vote; just take neighbors within the chosen file
                try:
                    chunks = expand_neighbors_on_page(file_scoped, radius=1, max_total=self.final_k)
                except Exception:
                    chunks = file_scoped[: self.final_k]
        else:
            tele["path"] = "baseline"
            chunks = candidates[: top_k]

        tele["final_n"] = len(chunks)

        # Early exit
        if not chunks:
            out = {"question": question, "answer": "", "filename": "", "page": "", "retrieval_chunks": []}
            if self.debug_telemetry: out["debug"] = tele
            return out

        # Evidence for LOCK_SOURCE
        top_file = chunks[0]['metadata']['file_name']
        top_page = chunks[0]['metadata']['page']

        # Build context
        context = self._build_context(chunks)

        # LLM call
        client = OpenAI(api_key=qwen_api_key, base_url=qwen_base_url)
        prompt = (
            "你是一名专业的金融分析助手，请根据以下检索到的内容回答用户问题。\n"
            "请严格按照如下JSON格式输出：\n"
            '{"answer": "你的简洁回答", "filename": "来源文件名", "page": "来源页码"}\n'
            f"检索内容：\n{context}\n\n问题：{question}\n"
            "请确保输出内容为合法JSON字符串，不要输出多余内容。"
        )
        completion = client.chat.completions.create(
            model=qwen_model,
            messages=[{"role": "system", "content": "你是一名专业的金融分析助手。"},
                      {"role": "user", "content": prompt}],
            temperature=0.2, max_tokens=1024,
        )

        import json as pyjson
        from extract_json_array import extract_json_array
        raw = completion.choices[0].message.content.strip()
        json_str = extract_json_array(raw, mode="objects")

        if json_str:
            try:
                arr = pyjson.loads(json_str)
                if isinstance(arr, list) and arr:
                    j = arr[0]
                    answer = j.get("answer", ""); filename = j.get("filename", ""); page = j.get("page", "")
                else:
                    answer, filename, page = raw, top_file, top_page
            except Exception:
                answer, filename, page = raw, top_file, top_page
        else:
            answer, filename, page = raw, top_file, top_page

        # normalize page & apply offset
        PAGE_OFFSET = int(os.getenv("PAGE_OFFSET", "0"))
        try:
            if page not in ("", None): page = int(page) + PAGE_OFFSET
        except Exception:
            try: page = int(top_page) + PAGE_OFFSET
            except Exception: page = top_page

        # LOCK_SOURCE
        model_file, model_page = filename, page
        if self.lock_source and chunks:
            filename, page = top_file, (int(top_page) + PAGE_OFFSET if str(top_page).isdigit() else top_page)

        out = {"question": question, "answer": answer, "filename": filename, "page": page, "retrieval_chunks": chunks}
        if self.debug_telemetry:
            tele.update({"top_file": top_file, "top_page": top_page, "model_file": model_file, "model_page": model_page})
            out["debug"] = tele
        return out


In [13]:
# 1) Imports & Paths
from pathlib import Path
import os, json, random
from tqdm.auto import tqdm
import concurrent.futures
import math

# Notebook is in .../notebook; project root is parent
NOTEBOOK_DIR = Path.cwd()
PROJ_ROOT = NOTEBOOK_DIR.parent

# Try common locations for train.json
CANDIDATE_TRAIN = [
    PROJ_ROOT / "datas" / "train.json",
    PROJ_ROOT / "data" / "train.json",
    NOTEBOOK_DIR / "datas" / "train.json",
    NOTEBOOK_DIR / "data" / "train.json",
]
TRAIN_PATH = next((p for p in CANDIDATE_TRAIN if p.exists()), None)
if TRAIN_PATH is None:
    raise FileNotFoundError(f"train.json not found in: {CANDIDATE_TRAIN}")

# Chunk JSON path (your earlier structure)
CHUNK_JSON_PATH = PROJ_ROOT / "notebook" / "all_pdf_windows_mineru.json"

# Outputs
EVAL_RAW_PATH = PROJ_ROOT / "eval_train_raw.json"
EVAL_SUMMARY_PATH = PROJ_ROOT / "eval_train_scored.json"

print("Notebook Dir:", NOTEBOOK_DIR)
print("Project Root :", PROJ_ROOT)
print("Train JSON   :", TRAIN_PATH)
print("Chunks JSON  :", CHUNK_JSON_PATH)

# --- RAG behavior flags (your SimpleRAG reads these) ---
env = os.environ.copy()
env.update({
    "LOCK_SOURCE":  "1",   # force filename/page to top evidence
    "STRICT_RERANK":"0",   # set "1" to fail fast if rerank API hiccups
    "DEBUG_RAG":    "1",   # include telemetry in outputs
    "LIMIT_PER_FILE": "12",
    "PAGE_OFFSET":  "0",   # MinerU 0-based -> leaderboard 1-based
    "BM25_TOPN":"25",
    "RRF_K":"90",
})
os.environ.update(env)


Notebook Dir: d:\Datawhale\Multimodal-RAG-Competitions\notebook
Project Root : d:\Datawhale\Multimodal-RAG-Competitions
Train JSON   : d:\Datawhale\Multimodal-RAG-Competitions\data\train.json
Chunks JSON  : d:\Datawhale\Multimodal-RAG-Competitions\notebook\all_pdf_windows_mineru.json


  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Create your reranker beforehand
# reranker = HybridReranker(model_name="BAAI/bge-reranker-base")
reranker = SiliconFlowReranker()
rag = SimpleRAG(
    chunk_json_path=CHUNK_JSON_PATH,  # or your page/block chunk file
    use_rerank=True,
    candidate_k=120,
    final_k=5,
    reranker=reranker,
)
rag.setup()


加载所有页chunk...
共加载 10053 个chunk
生成嵌入...


Embedding: 100%|██████████| 315/315 [06:06<00:00,  1.16s/batch]
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\liuch\AppData\Local\Temp\jieba.cache


存储向量...
构建文件级BM25索引...


Loading model cost 0.690 seconds.
Prefix dict has been built successfully.


RAG向量库构建完成！


In [15]:
# 3) Load train and sample
with open(TRAIN_PATH, "r", encoding="utf-8") as f:
    train_data = json.load(f)

N = len(train_data)
random.seed(42)

sample_size = max(1, math.ceil(N * 0.50))
all_idx = list(range(N))
sample_idx = sorted(random.sample(all_idx, sample_size)) if sample_size < N else all_idx

print(f"Train size = {N} | Sample size = {len(sample_idx)}")
sample_idx[:10]


Train size = 118 | Sample size = 59


[0, 3, 4, 5, 10, 11, 12, 13, 14, 15]

In [16]:
# 4) Jaccard helper
def jaccard_char(a: str, b: str) -> float:
    a = (a or "").strip()
    b = (b or "").strip()
    if not a and not b:
        return 1.0
    set_a, set_b = set(a), set(b)
    union = set_a | set_b
    inter = set_a & set_b
    return len(inter) / len(union) if union else 0.0


In [17]:
# 5) Inference
def run_one(idx):
    item = train_data[idx]
    q = item.get("question", "")
    tqdm.write(f"[{sample_idx.index(idx)+1}/{len(sample_idx)}] {q[:60]}...")
    pred = rag.generate_answer(q, top_k=5)
    return idx, pred

results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as ex:
    for out in tqdm(ex.map(run_one, sample_idx), total=len(sample_idx), desc="Infer on train sample"):
        results.append(out)

# Save raw (idx, pred) for debugging
with open(EVAL_RAW_PATH, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
print(f"Saved raw predictions to: {EVAL_RAW_PATH}")


[1/59] 根据联邦制药（03933.HK）的公司发展历程，请简述其在2023年的重大产品临床进展。...
[2/59] 联邦制药的UBT37034在超重/肥胖适应症方面取得了哪些临床前数据？...
[3/59] 联邦制药的TUL01101片剂和软膏剂在特应性皮炎治疗中的疗效如何？...
[4/59] 根据《联邦制药-港股公司研究报告-创新突破三靶点战略联姻诺和诺德-250712》文件内容，请问联邦制药在2024年的营业...
[5/59] 极兔速递-W，根据图表和文字信息，请分析东南亚电商市场的发展潜力及其对极兔速递的影响。...
[6/59] 极兔速递-W：2024年单位运输成本和分拣成本分别下降了多少百分比？...
[7/59] 极兔速递-W的跨境物流业务未来发展方向如何？...
[8/59] 根据华创证券对凌云股份的深度研究报告，请问该公司在2024年的主要产品收入占比是多少？...
[9/59] 关于凌云股份（600480）的热冲压技术应用和发展前景，能否详细解释热冲压成型工艺与冷冲压成型工艺的主要区别？...
[10/59] 根据华创证券对凌云股份（600480）的深度研究报告，请问该公司在2024年的热成型客户拓展情况如何？...


Infer on train sample:   0%|          | 0/59 [00:23<?, ?it/s]

[11/59] 关于凌云股份（600480）的德国WAG业务板块及客户情况，请问具体有哪些主要客户？...


Infer on train sample:   0%|          | 0/59 [00:24<?, ?it/s]

[12/59] 关于凌云股份（600480）的热成型电池盒双轮驱动传感器加速布局，请问其在新能源汽车领域的具体布局情况如何？...


Infer on train sample:   2%|▏         | 1/59 [00:28<27:15, 28.20s/it]

[13/59] 根据凌云股份（600480）的深度研究报告，请问亚大集团在乘用车和商用车管路产品应用方面分别有哪些具体的产品？...
[14/59] 关于凌云股份（600480）的力传感器业务，具体应用场景有哪些？...


Infer on train sample:   2%|▏         | 1/59 [00:34<27:15, 28.20s/it]

[15/59] 如何分析广联达在“两新一重”建设政策下的业绩表现？...


Infer on train sample:   2%|▏         | 1/59 [00:35<27:15, 28.20s/it]

[16/59] 如何分析广联达在数字化转型中的增长潜力？...


Infer on train sample:   3%|▎         | 2/59 [00:38<16:45, 17.63s/it]

[17/59] 广联达的数字施工业务在2020年的资金压力如何？与同行业其他企业相比，其资金压力有何特点？...


Infer on train sample:   5%|▌         | 3/59 [00:41<10:02, 10.75s/it]

[18/59] 广联达的施工总承包资质分为几个类别？每个类别分别对应哪些工程类型？...


Infer on train sample:   5%|▌         | 3/59 [00:52<10:02, 10.75s/it]

[19/59] 如何评估广联达在数字化转型过程中面临的挑战及其应对策略？...


Infer on train sample:   5%|▌         | 3/59 [00:54<10:02, 10.75s/it]

[20/59] 如何评估广联达在数字化转型过程中面临的安全事故和质量问题？...


Infer on train sample:   5%|▌         | 3/59 [00:55<10:02, 10.75s/it]

[21/59] 如何评估广联达在数字化转型中的竞争优势？...


Infer on train sample:   5%|▌         | 3/59 [00:57<10:02, 10.75s/it]

[22/59] 广联达：如何看待其数字造价业务的增长潜力？...


Infer on train sample:   5%|▌         | 3/59 [00:58<10:02, 10.75s/it]

[23/59] 广联达的数字造价业务未来成长空间是多少？...


Infer on train sample:   5%|▌         | 3/59 [01:01<10:02, 10.75s/it]

[24/59] 如何分析广联达在数字施工业务上的增长潜力？...


Infer on train sample:   5%|▌         | 3/59 [01:13<10:02, 10.75s/it]

[25/59] 如何分析广联达云造价业务的营收增长趋势及其与房屋新开工面积增速的关系？...


Infer on train sample:   5%|▌         | 3/59 [01:14<10:02, 10.75s/it]

[26/59] 广联达：根据图表130和131，预测2025年我国电子招标采购交易规模将达到多少亿元？...


Infer on train sample:   5%|▌         | 3/59 [01:20<10:02, 10.75s/it]

[27/59] 关于广联达的CAD快速看图工具及其七大功能价值，请问这些功能如何具体提升用户的工作效率？...


Infer on train sample:   5%|▌         | 3/59 [01:24<10:02, 10.75s/it]

[28/59] 如何评价广联达在搅拌站材料核算系统上的核心功能及其工作流程？...


Infer on train sample:   5%|▌         | 3/59 [01:28<10:02, 10.75s/it]

[29/59] 如何评估广联达在2019年至2021年上半年期间，施工业务的客户覆盖率和渗透率的变化趋势？...


Infer on train sample:   5%|▌         | 3/59 [01:28<10:02, 10.75s/it]

[30/59] 如何评估广联达在数字化转型中的竞争优势及其未来增长潜力？...


Infer on train sample:   5%|▌         | 3/59 [01:29<10:02, 10.75s/it]

[31/59] 广联达：根据图表217和218，您能否分析一下国家在支持工业软件和建筑业软件系统自主可控方面的政策趋势？...


Infer on train sample:  14%|█▎        | 8/59 [01:32<08:46, 10.33s/it]

[32/59] 如何分析广联达（002410.SZ）在2021年的PS估值水平及其与可比公司的差异？...


Infer on train sample:  36%|███▌      | 21/59 [01:33<01:45,  2.77s/it]

[33/59] 千味央厨的股价走势如何？...


Infer on train sample:  37%|███▋      | 22/59 [01:34<01:36,  2.61s/it]

[34/59] 千味央厨的速冻米面制品产品结构与日本相比有何差异？...


Infer on train sample:  37%|███▋      | 22/59 [01:34<01:36,  2.61s/it]

[35/59] 千味央厨 (001215 CH) 的速冻米面业务在餐饮端的增长潜力如何？...


Infer on train sample:  37%|███▋      | 22/59 [01:43<01:36,  2.61s/it]

[36/59] 根据华泰证券发布的《千味央厨-千寻百味乘势而上》报告，结合图表23和24的数据，请问2025年国内餐饮端速冻米面市场的潜...


Infer on train sample:  42%|████▏     | 25/59 [01:45<01:38,  2.89s/it]

[37/59] 千味央厨的供应链管理体系如何保障产品质量和供应稳定性？...


Infer on train sample:  42%|████▏     | 25/59 [01:58<01:38,  2.89s/it]

[38/59] 千味央厨的模拟后厨实验室是如何具体运作的？...


Infer on train sample:  47%|████▋     | 28/59 [02:02<01:53,  3.66s/it]

[39/59] 千味央厨的餐饮大客户经营数据在2022年第三季度有何变化？...


Infer on train sample:  47%|████▋     | 28/59 [02:04<01:53,  3.66s/it]

[40/59] 千味央厨的三大新品牌分别针对哪些消费群体？各自的产品特点是什么？...


Infer on train sample:  49%|████▉     | 29/59 [02:07<01:53,  3.78s/it]

[41/59] 千味央厨公司在2020年的毛利率受原材料价格波动影响如何？...


Infer on train sample:  49%|████▉     | 29/59 [02:08<01:53,  3.78s/it]

[42/59] 千味央厨2020年的原材料储备占比存货显著提升的具体情况如何？...


Infer on train sample:  49%|████▉     | 29/59 [02:09<01:53,  3.78s/it]

[43/59] 针对千味央厨的募投项目，其总部基地及研发中心建设的具体投资内容和占比是多少？...


Infer on train sample:  51%|█████     | 30/59 [02:13<01:59,  4.12s/it]

[44/59] 根据千味央厨的招股书，其2021年的收入预测是多少？...
[45/59] 千味央厨：使用OEM代工模式的优势体现在哪些方面？...


Infer on train sample:  53%|█████▎    | 31/59 [02:15<01:44,  3.73s/it]

[46/59] 关于伊利股份的历史发展和市场竞争，请问在2005年至2013年间，伊利如何通过创新产品和营销策略实现营收突破100亿大关...


Infer on train sample:  59%|█████▉    | 35/59 [02:19<00:57,  2.39s/it]

[47/59] 根据伊利股份的公司深度报告《王者荣耀行稳致远》，请问伊利在2005年后通过哪些营销策略实现了后来居上？...


Infer on train sample:  59%|█████▉    | 35/59 [02:28<00:57,  2.39s/it]

[48/59] 根据《伊利股份-公司深度报告》中的内容，请分析影响奶粉行业规模的主要因素，并解释为什么我国奶粉单价高于多个发达国家？...


Infer on train sample:  64%|██████▍   | 38/59 [02:31<01:01,  2.94s/it]

[49/59] 根据《伊利股份-公司深度报告》中的内容，请问伊利在母乳研究方面有哪些重要成就？...


Infer on train sample:  64%|██████▍   | 38/59 [02:36<01:01,  2.94s/it]

[50/59] 伊利股份的奶酪业务发展现状如何？...


Infer on train sample:  69%|██████▉   | 41/59 [02:37<00:44,  2.49s/it]

[51/59] 广联达（002410）的数字建筑平台战略如何实现软硬一体化？...
[52/59] 广联达（002410）的施工数字化转型一站式服务如何具体实现？...


Infer on train sample:  69%|██████▉   | 41/59 [02:37<00:44,  2.49s/it]

[53/59] 广联达（002410）在2015年至2018年间，通过哪些具体措施推动了“数字企业+智慧工地+BIM建造”的发展？...


Infer on train sample:  69%|██████▉   | 41/59 [02:45<00:44,  2.49s/it]

[54/59] 广联达的数字企业业务在2019年至2021年间发生了哪些变化？...


Infer on train sample:  71%|███████   | 42/59 [02:49<01:10,  4.16s/it]

[55/59] 广联达（002410）：BIM5D+智慧工地平台在施工业务中的应用现状如何？...


Infer on train sample:  71%|███████   | 42/59 [03:00<01:10,  4.16s/it]

[56/59] 广联达在数字建筑一体化领域采取了哪些具体的落地打法？...


Infer on train sample:  71%|███████   | 42/59 [03:19<01:10,  4.16s/it]

[57/59] 广联达（002410）在2021年云收入占比达到多少？...


Infer on train sample:  71%|███████   | 42/59 [03:21<01:10,  4.16s/it]

[58/59] 广联达（002410）的数字设计业务在2021年下半年将如何推进？...


Infer on train sample:  78%|███████▊  | 46/59 [03:26<01:27,  6.73s/it]

[59/59] 广联达（002410）在数字建筑一体化方面有哪些具体的产品定位和解决方案？...


Infer on train sample: 100%|██████████| 59/59 [14:50<00:00, 15.10s/it]

Saved raw predictions to: d:\Datawhale\Multimodal-RAG-Competitions\eval_train_raw.json





## 6) Scoring vs Ground Truth
Score per item:
- page_match: 1 if exact page equals, else 0 (×0.25)
- filename_match: 1 if exact filename equals, else 0 (×0.25)
- answer_jaccard: char Jaccard (×0.5)


In [18]:
# 6) Score predictions
idx2pred = {idx: pred for idx, pred in results}

scored_rows = []
for idx in sample_idx:
    gt = train_data[idx]
    pred = idx2pred.get(idx, {})

    gt_q = gt.get("question", "")
    gt_a = gt.get("answer", "")
    gt_f = gt.get("filename", "")
    gt_p = gt.get("page", "")

    pr_a = pred.get("answer", "")
    pr_f = pred.get("filename", "")
    pr_p = pred.get("page", "")

    page_match = 1.0 if str(pr_p) == str(gt_p) else 0.0
    filename_match = 1.0 if str(pr_f) == str(gt_f) else 0.0
    answer_sim = jaccard_char(str(pr_a), str(gt_a))

    score = 0.25 * page_match + 0.25 * filename_match + 0.5 * answer_sim

    scored_rows.append({
        "idx": idx,
        "question": gt_q,
        "gt_answer": gt_a,
        "pr_answer": pr_a,
        "gt_filename": gt_f,
        "pr_filename": pr_f,
        "gt_page": gt_p,
        "pr_page": pr_p,
        "page_match": page_match,
        "filename_match": filename_match,
        "answer_jaccard": answer_sim,
        "score": score,
    })

# Sort by score ascending to inspect weak cases first
scored_rows_sorted = sorted(scored_rows, key=lambda x: x["score"])

with open(EVAL_SUMMARY_PATH, "w", encoding="utf-8") as f:
    json.dump(scored_rows_sorted, f, ensure_ascii=False, indent=2)

print(f"Saved scored results to: {EVAL_SUMMARY_PATH}")
print(f"max score: {max(r['score'] for r in scored_rows_sorted)}")
print(f"Mean score: {sum(r['score'] for r in scored_rows_sorted)/len(scored_rows_sorted):.4f}")
print(f"min score: {min(r['score'] for r in scored_rows_sorted) }")
print(f"Mean Jaccard: {sum(r['answer_jaccard'] for r in scored_rows_sorted)/len(scored_rows_sorted):.4f}")
print(f"Filename exact@1: {sum(r['filename_match'] for r in scored_rows_sorted)/len(scored_rows_sorted):.4f}")
print(f"Page exact@1: {sum(r['page_match'] for r in scored_rows_sorted)/len(scored_rows_sorted):.4f}")


Saved scored results to: d:\Datawhale\Multimodal-RAG-Competitions\eval_train_scored.json
max score: 0.9102564102564102
Mean score: 0.4234
min score: 0.036231884057971016
Mean Jaccard: 0.3213
Filename exact@1: 0.5763
Page exact@1: 0.4746


max score: 0.8116883116883117  
Mean score: 0.3678  
min score: 0.06818181818181818  
Mean Jaccard: 0.3118  
Filename exact@1: 0.4407  
Page exact@1: 0.4068  

Saved scored results to: d:\Datawhale\Multimodal-RAG-Competitions\eval_train_scored.json  
max score: 0.5576923076923077  
Mean score: 0.3201  
min score: 0.08403361344537816  
Mean Jaccard: 0.3068  
Filename exact@1: 0.5833  
Page exact@1: 0.0833  

Mineru + BN25
max score: 0.4962686567164179  
Mean score: 0.3231  
min score: 0.09130434782608696  
Mean Jaccard: 0.3128
Filename exact@1: 0.5833  
Page exact@1: 0.0833  

Note: With BM25 rerank :
max score: 0.5732323232323233  
Mean score: 0.2638   
min score: 0.09668508287292818  
Mean Jaccard: 0.3193  
Filename exact@1: 0.4167  
Page exact@1: 0.0000  

In [80]:
# Show a couple of worst and best cases inline (adjust k as needed)
k = 6
print("— Worst cases —")
for r in scored_rows_sorted[:k]:
    print("\nScore:", r["score"])
    print("Q:", r["question"])
    print("GT:", r["gt_answer"])
    print("PR:", r["pr_answer"])
    print("GT file/page:", r["gt_filename"], r["gt_page"])
    print("PR file/page:", r["pr_filename"], r["pr_page"])

print("\n— Best cases —")
for r in scored_rows_sorted[-k:]:
    print("\nScore:", r["score"])
    print("Q:", r["question"])
    print("GT:", r["gt_answer"])
    print("PR:", r["pr_answer"])
    print("GT file/page:", r["gt_filename"], r["gt_page"])
    print("PR file/page:", r["pr_filename"], r["pr_page"])


— Worst cases —

Score: 0.06470588235294118
Q: 千味央厨的股价走势如何？
GT: 根据图片中的图表6，千味央厨的股价走势如下：

1. **新股上市**：2021年9月6日至2021年11月期间，股价从约22倍PE-TTM上升到约65倍PE-TTM。
2. **叠加大众品提价潮**：2021年11月至2022年4月，股价从约65倍PE-TTM下降到约42倍PE-TTM。
3. **消费力疲软**：2022年4月至2022年6
PR: 检索内容未提供具体股价数据，仅提及最近半年股价相对走势图表（见附件图片）
GT file/page: 千味央厨-千寻百味乘势而上-22122726页.pdf 6
PR file/page: 千味央厨-深度报告深耕餐饮渠道研发助力发展-22033022页.pdf 1

Score: 0.0872093023255814
Q: 如何分析广联达在数字施工业务上的增长潜力？
GT: 广联达在数字施工业务上的增长潜力可以从以下几个方面进行分析：

1. **产品体系完善**：
   - 广联达已经形成了“平台+组件”的数字施工产品体系，覆盖企业级、项目级、岗位级三层和商务、技术、生产各业务线。这表明公司在施工数字化上有完整的产品体系。

2. **快速调整策略**：
   - 在建筑业数字化转型的大背景下，广联达能够快速不断地调整策略，这有助于其在施工数字化上保持竞争力并引领公司的第二增长曲线。

3. **市场潜力**：
   - 施工业务
PR: 广联达数字施工业务增长潜力主要源于客户结构稳定（服务行业龙头施工总承包商）、数字化渗透率偏低的市场空间、经济周期影响弱及规模化效应。预计2024-2025年实现盈亏平衡，未来5年营收增速至少30%，毛利率稳定在65%水平。
GT file/page: 广联达-再谈广联达当前时点下如何看待其三条增长曲线-220217131页.pdf 56
PR file/page: 广联达-造价筑底施工决胜-22041233页.pdf 20

Score: 0.08741258741258741
Q: 广联达（002410）的数字建筑平台战略如何实现软硬一体化？
GT: 根据图片中的内容，广联达（002410）的数字建筑平台战略通过以下方式实现软硬一体化：

1. **业

### Notes
- Set a different **sample fraction** by changing the `0.10` in `math.ceil(N * 0.10)`.
- If `filename`/`page` in ground truth differ in minor formatting (e.g., case, spaces), add normalization before comparison.
- You can plug this same scorer later for validation on a dev split.
