In [6]:
# %%
import json, os, re, math, statistics
from pathlib import Path
from collections import Counter, defaultdict

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

NOTEBOOK_DIR = Path.cwd()

EVAL_PATH = NOTEBOOK_DIR.parent / "eval_train_scored.json"
RAW_PATH = NOTEBOOK_DIR.parent / "eval_train_raw.json"

with open(EVAL_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# If file is a dict with a "results" field, normalize:
if isinstance(data, dict) and "results" in data:
    data = data["results"]

print(f"Loaded {len(data)} rows")
# Peek first row keys to understand schema
print("Keys in first row:", sorted(list(data[0].keys())))



Loaded 59 rows
Keys in first row: ['answer_jaccard', 'filename_match', 'gt_answer', 'gt_filename', 'gt_page', 'idx', 'page_match', 'pr_answer', 'pr_filename', 'pr_page', 'question', 'score']


In [20]:
def normalize_filename(name: str) -> str:
    if not name: return ""
    base = os.path.basename(str(name))
    base = base.replace("（","(").replace("）",")").replace("，",",").replace("：",":")
    base = re.sub(r"\s+", "", base)
    base = re.sub(r"(\d+)页(?=\.pdf$)", "", base, flags=re.I)  # drop trailing "...NN页" before .pdf
    return base.lower()

def tokenize_zh_en(s: str):
    s = (s or "").strip()
    return re.findall(r"[\u4e00-\u9fff]|[A-Za-z0-9]+", s)

def jaccard(a: str, b: str) -> float:
    A, B = set(tokenize_zh_en(a)), set(tokenize_zh_en(b))
    if not A and not B: return 1.0
    if not A or not B: return 0.0
    return len(A & B) / len(A | B)

# ---- Load scored (grounded evaluation) ----
with open(EVAL_PATH, "r", encoding="utf-8") as f:
    eval_rows = json.load(f)
print("Scored rows:", len(eval_rows))
print("Scored keys example:", sorted(eval_rows[0].keys()))

eval_df = pd.DataFrame(eval_rows)

# ---- Load raw (with telemetry/debug, retrieval_chunks) ----
with open(RAW_PATH, "r", encoding="utf-8") as f:
    raw_rows = json.load(f)
print("Raw pairs:", len(raw_rows))
# raw is [[idx, {...}], ...]
raw_df = pd.DataFrame([{"idx": k, **v} for k, v in raw_rows])

# normalize file cols for later joins
for col in ["gt_filename","pr_filename","filename","model_file","top_file"]:
    if col in eval_df.columns:
        eval_df[col+"_norm"] = eval_df[col].map(normalize_filename)
for col in ["filename","debug"]:
    if col in raw_df.columns and col=="filename":
        raw_df[col+"_norm"] = raw_df[col].map(normalize_filename)

Scored rows: 59
Scored keys example: ['answer_jaccard', 'filename_match', 'gt_answer', 'gt_filename', 'gt_page', 'idx', 'page_match', 'pr_answer', 'pr_filename', 'pr_page', 'question', 'score']
Raw pairs: 59


In [21]:
# %%
def acc_exact(pred, gt):
    m = pred.notna() & gt.notna()
    return float((pred[m]==gt[m]).mean())

summary = {
    "N"                 : len(eval_df),
    "Mean score"        : float(eval_df["score"].mean()),
    "Mean Jaccard"      : float(eval_df["answer_jaccard"].mean()),
    "Filename exact@1"  : float(eval_df["filename_match"].mean()),
    "Page exact@1"      : float(eval_df["page_match"].mean()),
}
pd.Series(summary)


N                   59.000000
Mean score           0.396920
Mean Jaccard         0.336214
Filename exact@1     0.508475
Page exact@1         0.406780
dtype: float64

In [22]:
# Filename/page correctness vs answer quality
eval_df["bucket"] = np.select(
    [
        (eval_df["filename_match"]==1) & (eval_df["page_match"]==1),
        (eval_df["filename_match"]==1) & (eval_df["page_match"]==0),
        (eval_df["filename_match"]==0) & (eval_df["page_match"]==1),
    ],
    ["Both-correct","File-OK/Page-Wrong","File-Wrong/Page-OK"],
    default="Both-wrong"
)
eval_df["bucket"].value_counts(), eval_df.groupby("bucket")[["score","answer_jaccard"]].mean()


(bucket
 Both-wrong            29
 Both-correct          24
 File-OK/Page-Wrong     6
 Name: count, dtype: int64,
                        score  answer_jaccard
 bucket                                      
 Both-correct        0.700651        0.401302
 Both-wrong          0.142500        0.285000
 File-OK/Page-Wrong  0.411695        0.323391)

In [46]:
# %% Setup
import json, re
from pathlib import Path
from collections import defaultdict, Counter
from difflib import SequenceMatcher
import pandas as pd
import numpy as np

BASE = Path.cwd().parent  # adjust if needed
SCORED = BASE / "eval_train_scored.json"
RAW    = BASE / "eval_train_raw.json"  # optional, only if you want rerank telemetry

scored = json.loads(SCORED.read_text(encoding="utf-8"))
if isinstance(scored, dict) and "results" in scored:  # normalize if wrapped
    scored = scored["results"]
df = pd.DataFrame(scored)

# Keep only Both-wrong
bw = df[(df["filename_match"]==0) & (df["page_match"]==0)].copy()
print("Both-wrong N:", len(bw))

# Optional: merge rerank telemetry from raw (idx -> debug)
try:
    raw = json.loads(RAW.read_text(encoding="utf-8"))
    raw_map = {idx: payload for idx, payload in raw}  # raw is [[idx, payload], ...]
    def get_dbg(idx, key, default=None):
        d = raw_map.get(idx, {}).get("debug", {})
        return d.get(key, default)
    bw["top_rr"]   = bw["idx"].apply(lambda i: get_dbg(i, "rerank_ok", False) and get_dbg(i, "top_rr", None))
    bw["top_file"] = bw["idx"].apply(lambda i: get_dbg(i, "top_file", ""))
    bw["top_page"] = bw["idx"].apply(lambda i: get_dbg(i, "top_page", ""))
except Exception:
    pass


Both-wrong N: 29


In [47]:
COMPANY_TOKEN_RE = re.compile(r'([一-龥A-Za-z0-9]+)')
YEAR_RE = re.compile(r'20\d{2}')

def norm(s): 
    return (s or "").strip()

def canonicalize_filename(fn: str):
    """去掉日期/页数字样板，只保留‘根标题’当作 doc_id。"""
    x = norm(fn)
    x = re.sub(r'[-_（(]?\d{6,8}[)）]?页?', '', x)   # 页数
    x = re.sub(r'[-_（(]?(19|20)\d{2}[\d\-_.]*[)）]?', '', x)  # 年月日
    x = re.sub(r'\.pdf$', '', x, flags=re.I)
    x = re.sub(r'\s+', '', x)
    return x

def extract_years(s):
    return YEAR_RE.findall(s or "")

def company_tokens_from_filename(fn: str):
    """从文件名切中文/字母数字 token；你也可以换成自有公司词库匹配。"""
    core = canonicalize_filename(fn)
    toks = [t for t in COMPANY_TOKEN_RE.findall(core) if len(t)>=2]
    return set(toks)

def same_company_heur(pr_fn, gt_fn):
    """粗粒度判断是否同一家公司/系列：根ID相似 + token 重叠。"""
    a, b = canonicalize_filename(pr_fn), canonicalize_filename(gt_fn)
    ratio = SequenceMatcher(None, a, b).ratio()
    ia = company_tokens_from_filename(pr_fn)
    ib = company_tokens_from_filename(gt_fn)
    overlap = len(ia & ib)
    return ratio >= 0.65 or overlap >= 3, ratio, overlap


In [48]:
def classify_reason(row):
    pr, gt = row["pr_filename"], row["gt_filename"]
    same_series, ratio, overlap = same_company_heur(pr, gt)
    pr_years = extract_years(pr)
    gt_years = extract_years(gt)

    if same_series:
        if set(pr_years) and set(gt_years) and set(pr_years) != set(gt_years):
            return "Edition/Year mismatch"
        return "Same-series mismatch"
    else:
        # 粗略：如果共有行业词（“数字化”，“数据中心”，“白酒”等）但公司词不重叠
        pr_tok = company_tokens_from_filename(pr)
        gt_tok = company_tokens_from_filename(gt)
        if len(pr_tok & gt_tok) <= 1:
            return "Cross-company confusion"
        return "Generic keyword bias"

bw["reason"] = bw.apply(classify_reason, axis=1)
print(bw["reason"].value_counts())


reason
Cross-company confusion    29
Name: count, dtype: int64


In [49]:
# 问题里是否出现年份，而 PR/GT 年份不一致？
def year_in_question(q):
    return bool(YEAR_RE.search(q or ""))

bw["q_has_year"] = bw["question"].apply(year_in_question)

def years_set(s): 
    return set(extract_years(s))

bw["pr_years"] = bw["pr_filename"].apply(years_set)
bw["gt_years"] = bw["gt_filename"].apply(years_set)
bw["year_mismatch"] = bw.apply(lambda r: (r["pr_years"] != r["gt_years"]) and (r["gt_years"] != set()), axis=1)

print(pd.crosstab(bw["reason"], bw["q_has_year"], margins=True))
print(pd.crosstab(bw["reason"], bw["year_mismatch"], margins=True))

# 如果有 top_rr / rerank_score 可用，看看“高置信错配”的占比
if "top_rr" in bw.columns:
    print("High-confidence wrong (top_rr>=0.9) by reason:")
    print(pd.crosstab(bw["reason"], (bw["top_rr"]>=0.9), margins=True))


q_has_year               False  True  All
reason                                   
Cross-company confusion     20     9   29
All                         20     9   29
year_mismatch            False  True  All
reason                                   
Cross-company confusion     15    14   29
All                         15    14   29
High-confidence wrong (top_rr>=0.9) by reason:
top_rr                   False  All
reason                             
Cross-company confusion     29   29
All                         29   29


In [23]:
mask = (eval_df["filename_match"]==1) & (eval_df["page_match"]==0)
def _off1(r):
    try:
        return abs(int(r["pr_page"])-int(r["gt_page"]))==1
    except: return False
off1_ratio = float(mask.sum() and sum(_off1(r) for _,r in eval_df[mask].iterrows())/mask.sum())
off1_ratio


0.3333333333333333

In [38]:
# %%
# Telemetry spread
dbg = pd.json_normalize(raw_df["debug"].dropna())
print("Telemetry rows:", len(dbg))
tele = pd.concat([raw_df[["idx"]], dbg], axis=1)

# How many went through rerank path?
tele["path"].value_counts(dropna=False)

# Rerank HTTP health
tele["rerank_http"].value_counts(dropna=False).head(10)

# Did rerank actually return a ranked list?
tele["rerank_ok"].value_counts(dropna=False)

# Candidate pool sizes & final selection sizes
tele[["cand_n","ranked_n","page_vote_n","final_n"]].describe()


Telemetry rows: 59


Unnamed: 0,cand_n,ranked_n,page_vote_n,final_n
count,59.0,59.0,59.0,59.0
mean,120.0,120.0,1.677966,1.677966
std,0.0,0.0,0.899055,0.899055
min,120.0,120.0,1.0,1.0
25%,120.0,120.0,1.0,1.0
50%,120.0,120.0,1.0,1.0
75%,120.0,120.0,2.0,2.0
max,120.0,120.0,4.0,4.0


In [40]:
# %%
mismatch = tele[
    (tele["path"]=="rerank")
    & ((tele["top_file"]!=tele["model_file"]) | (tele["top_page"]!=tele["model_page"]))
]
mismatch.shape[0], mismatch.head(5)


(0,
 Empty DataFrame
 Columns: [idx, path, rerank_ok, rerank_attempts, rerank_http, cand_n, ranked_n, page_vote_n, final_n, top_file, top_page, model_file, model_page]
 Index: [])

In [41]:
# %%
merged = eval_df.merge(tele, on="idx", how="left", suffixes=("","_dbg"))

# Rerank vs baseline quality
merged["path"].value_counts()
merged.groupby("path")[["score","answer_jaccard","filename_match","page_match"]].mean()

# Does a higher top rerank_score imply better answers?
def top_rerank_score(row):
    rc = (raw_df.loc[raw_df["idx"]==row["idx"], "retrieval_chunks"].values or [None])[0]
    if not rc: return None
    s = rc[0].get("rerank_score") if isinstance(rc, list) and rc else None
    return s
merged["top_rr"] = merged.apply(top_rerank_score, axis=1)
pd.cut(merged["top_rr"], bins=[-1,0.3,0.6,0.8,0.9,1.1]).value_counts().sort_index(), \
merged.groupby(pd.cut(merged["top_rr"], bins=[-1,0.3,0.6,0.8,0.9,1.1]))[["score","filename_match","page_match"]].mean()


  merged.groupby(pd.cut(merged["top_rr"], bins=[-1,0.3,0.6,0.8,0.9,1.1]))[["score","filename_match","page_match"]].mean()


(top_rr
 (-1.0, 0.3]     1
 (0.3, 0.6]      0
 (0.6, 0.8]      2
 (0.8, 0.9]      2
 (0.9, 1.1]     54
 Name: count, dtype: int64,
                 score  filename_match  page_match
 top_rr                                           
 (-1.0, 0.3]  0.158451             0.0    0.000000
 (0.3, 0.6]        NaN             NaN         NaN
 (0.6, 0.8]   0.382627             1.0    0.000000
 (0.8, 0.9]   0.492353             0.5    0.500000
 (0.9, 1.1]   0.398331             0.5    0.425926)

In [27]:
# %%
sim = merged.copy()

# use top_file/top_page when present; else keep pr_*
sim["pr_file_locked_norm"] = np.where(sim["top_file"].notna(), sim["top_file"].map(normalize_filename), sim["pr_filename"].map(normalize_filename))
sim["pr_page_locked"]      = np.where(sim["top_page"].notna(), sim["top_page"], sim["pr_page"])

file_exact_locked = (sim["pr_file_locked_norm"] == sim["gt_filename_norm"]).mean()
page_exact_locked = (pd.to_numeric(sim["pr_page_locked"], errors="coerce") == pd.to_numeric(sim["gt_page"], errors="coerce")).mean()

# recompute a hypothetical score with locked file/page, same weights as your scorer (approx: add deltas keeping jaccard same)
W_FILE, W_PAGE, W_ANS = 0.5, 0.2, 0.3
score_locked = (
    W_ANS * sim["answer_jaccard"]
  + W_FILE * (sim["pr_file_locked_norm"] == sim["gt_filename_norm"]).astype(float)
  + W_PAGE * (pd.to_numeric(sim["pr_page_locked"], errors="coerce") == pd.to_numeric(sim["gt_page"], errors="coerce")).astype(float)
).mean()

print({
    "orig_file_exact": float(merged["filename_match"].mean()),
    "orig_page_exact": float(merged["page_match"].mean()),
    "locked_file_exact": float(file_exact_locked),
    "locked_page_exact": float(page_exact_locked),
    "orig_mean_score": float(merged["score"].mean()),
    "locked_mean_score_est": float(score_locked)
})


{'orig_file_exact': 0.5084745762711864, 'orig_page_exact': 0.4067796610169492, 'locked_file_exact': 0.5084745762711864, 'locked_page_exact': 0.4067796610169492, 'orig_mean_score': 0.3969204776660378, 'locked_mean_score_est': 0.43645737134538537}


In [28]:
# %%
def qtype(q: str):
    q = q or ""
    if re.search(r"(图|表|图表|见图|见表)", q): return "Figure/Table"
    if re.search(r"(\d+%|同比|环比|增长|下降|多少|估值|PS|PE)", q): return "Numeric"
    return "Conceptual"

eval_df["qtype"] = eval_df["question"].map(qtype)
eval_df["qtype"].value_counts(), eval_df.groupby("qtype")[["score","answer_jaccard","filename_match","page_match"]].mean()


(qtype
 Conceptual      39
 Numeric         14
 Figure/Table     6
 Name: count, dtype: int64,
                  score  answer_jaccard  filename_match  page_match
 qtype                                                             
 Conceptual    0.428758        0.319054        0.589744    0.487179
 Figure/Table  0.563765        0.377530        0.833333    0.666667
 Numeric       0.236726        0.366308        0.142857    0.071429)

In [42]:
# %%
# Very low score but high rerank confidence
hard_1 = merged[(merged["score"]<0.15) & (merged["top_rr"]>0.8)].head(30)[
    ["idx","question","score","answer_jaccard","filename_match","page_match","top_rr","top_file","top_page","model_file","model_page","gt_filename"]
]

# Filename off by small edit distance (quick heuristic: common prefix >= 10 chars)
def long_prefix(a,b):
    a,b=normalize_filename(a),normalize_filename(b)
    n=min(len(a),len(b)); i=0
    while i<n and a[i]==b[i]: i+=1
    return i
hard_2 = merged[(merged["filename_match"]==0)].assign(
    prefix_len=lambda d: d.apply(lambda r: long_prefix(r["pr_filename"], r["gt_filename"]), axis=1)
).sort_values("prefix_len", ascending=False).head(30)[["idx","question","pr_filename","gt_filename","prefix_len"]]

hard_1.to_json("hard_lowscore_highconf.json", force_ascii=False, orient="records", indent=2)
hard_2.to_json("hard_fn_mismatch_prefix.json", force_ascii=False, orient="records", indent=2)
print("Saved two debug files.")


Saved two debug files.
