In [None]:
# === New version: aggregate by text_level ===

import json
import re
from pathlib import Path
from typing import List, Dict, Any, Tuple, Optional
import pandas as pd
import fitz # PyMuPDF

# C:\Users\User\Desktop\Microsoft_GraphRAG\UnstructuredData_Transformation_Pipeline
# === Cell 1: Imports & Paths (can be reused as-is) ===
# Short code of the document to process; it is interpolated into every path below.
DOC_CODE = "g33a"
# Content-list JSON that is parsed into blocks.
INPUT_JSON = fr"C:\Users\User\Desktop\Microsoft_GraphRAG\UnstructuredData_Transformation_Pipeline\out_md_mineru\{DOC_CODE}\auto\{DOC_CODE}_content_list.json"
# Source PDF; only its embedded metadata (title, author) is read.
INPUT_PDF = fr"C:\Users\User\Desktop\Microsoft_GraphRAG\UnstructuredData_Transformation_Pipeline\SPM\{DOC_CODE}.pdf"
# Document id = JSON file stem with the "_content_list" suffix cut off.
# NOTE(review): Path() only treats "\" as a separator on Windows, so on other
# platforms the stem would contain the whole path -- confirm this only runs on Windows.
DOC_ID = Path(INPUT_JSON).stem.split("_content_list")[0]
# Destination CSV, one row per aggregated clause.
OUT_AGGREGATED_CSV = fr"C:\Users\User\Desktop\Microsoft_GraphRAG\UnstructuredData_Transformation_Pipeline\SPM\csv_data\{DOC_ID}.csv"

# === Cell 2: Regex & Constants (reused) ===
# Previous numeric-only pattern, kept for reference:
# HEADING_RE = re.compile(r'^\s*(\d+(?:\.\d+)*)[.\)]?\s+(.*\S)?\s*$')

# New logic: hybrid approach that combines text_level with this regex.
# Group 1 is the clause id, group 2 (optional) is the title text after it.
# NOTE(review): re.IGNORECASE also applies to `[A-Z0-9]+`, so a lowercase word
# after "Annex" (e.g. "Annex overview") is captured as part of the id.
HEADING_RE = re.compile(
    r'^\s*('
    r'\d+(?:\.\d+)*'             # 1. Original dotted numeric ids (1.2, 3.4.5)
    r'|ANNEX\s+[A-Z0-9]+'        # 2. Added: ANNEX followed by digits or letters (ANNEX 1, ANNEX A)
    r'|INTERPRETATIVE\s+NOTES'   # 3. Added: the fixed phrase INTERPRETATIVE NOTES
    r')[:.\)]?(?:\s+(.*\S)?)?\s*$', # 4. Changed: the line may end right after the id (standalone heading)
    re.IGNORECASE
)

# Lower-case substrings searched for in a table body to recognise the page banner.
HKMA_BANNER_KEYWORDS = ["monetary authority", "supervisory policy manual", "香港金融管理局"]
# Lower-case substrings searched for in a table body to recognise navigation tables.
NAV_KEYWORDS = ["contents", "glossary", "home", "introduction"]
# Upper bound for bbox[1] for a block to count as "at the top of the page".
# NOTE(review): units follow whatever the upstream bbox uses -- confirm.
TOP_Y0_THRESHOLD = 100


# === Cell 3: Loaders & Basic Helpers (reused) ===
def load_content_list(path: str) -> List[Dict[str, Any]]:
    """Load a content-list JSON file and return its decoded contents.

    Args:
        path: Filesystem path of the JSON file (read as UTF-8).

    Returns:
        Whatever ``json.load`` decodes from the file; callers treat it as a
        list of block dictionaries.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    p = Path(path)
    # Raise explicitly instead of using ``assert``: assertions are removed
    # when Python runs with ``-O``, which would silently drop this check.
    if not p.exists():
        raise FileNotFoundError(f"File not found: {path}")
    with p.open("r", encoding="utf-8") as f:
        return json.load(f)

def bbox_is_top(bbox: List[int], y0_thr: int = TOP_Y0_THRESHOLD) -> bool:
    """Tell whether a bounding box starts at or above the ``y0_thr`` cutoff.

    Args:
        bbox: Bounding box sequence; only its second item is inspected
            (presumably the top y coordinate -- verify against the producer).
        y0_thr: Largest value of ``bbox[1]`` that still counts as "top".

    Returns:
        False for a missing box or one with fewer than four items, otherwise
        the result of ``bbox[1] <= y0_thr``.
    """
    has_four_coords = bool(bbox) and len(bbox) >= 4
    return has_four_coords and bbox[1] <= y0_thr

def is_hkma_banner_table(block: Dict[str, Any]) -> bool:
    """Detect a banner table: a top-of-page table mentioning a banner keyword.

    Args:
        block: Raw block dictionary; ``type``, ``bbox`` and ``table_body`` are read.

    Returns:
        True only when the block is a table, ``bbox_is_top`` accepts its bbox,
        and its lower-cased ``table_body`` contains any entry of
        ``HKMA_BANNER_KEYWORDS``.
    """
    if block.get("type") == "table" and bbox_is_top(block.get("bbox") or []):
        body_lower = (block.get("table_body") or "").lower()
        for keyword in HKMA_BANNER_KEYWORDS:
            if keyword in body_lower:
                return True
    return False

def is_nav_table(block: Dict[str, Any]) -> bool:
    """Detect a navigation table by keyword.

    Args:
        block: Raw block dictionary; ``type`` and ``table_body`` are read.

    Returns:
        True when the block is a table whose lower-cased ``table_body``
        contains at least one entry of ``NAV_KEYWORDS`` (substring match).
    """
    if block.get("type") == "table":
        haystack = (block.get("table_body") or "").lower()
        hits = [keyword for keyword in NAV_KEYWORDS if keyword in haystack]
        return bool(hits)
    return False

def normalize_text(s: Optional[str]) -> str:
    """Collapse every whitespace run to one space and trim both ends.

    Args:
        s: Text to clean; ``None`` and the empty string are accepted.

    Returns:
        The cleaned text, or ``""`` when ``s`` is falsy.
    """
    if s:
        collapsed = re.sub(r'\s+', ' ', s)
        return collapsed.strip()
    return ""


# === Cell 4: Build Blocks (reused) ===
def build_blocks(raw: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Turn raw content-list entries into cleaned blocks.

    Tables recognised by ``is_hkma_banner_table`` or ``is_nav_table`` are
    dropped. Every surviving entry becomes a dict with the keys ``type``,
    ``page_idx``, ``bbox``, ``text_level``, ``index`` and ``text``.

    Args:
        raw: Entries as loaded from the content-list JSON.

    Returns:
        The cleaned blocks in input order. ``index`` holds the entry's
        position in ``raw`` (so it keeps gaps where tables were dropped).
    """
    cleaned: List[Dict[str, Any]] = []
    for position, source in enumerate(raw):
        kind = source.get("type")
        is_table = kind == "table"
        if is_table and (is_hkma_banner_table(source) or is_nav_table(source)):
            continue

        if is_table:
            # Tables keep their markup; only whitespace is collapsed.
            text = re.sub(r"\s+", " ", (source.get("table_body") or "").strip())
        elif kind == "image":
            # Image content is ignored for now.
            text = ""
        else:
            # "text" blocks and any other type share the same normalisation.
            text = normalize_text(source.get("text"))

        cleaned.append({
            "type": kind,
            "page_idx": source.get("page_idx"),
            "bbox": source.get("bbox"),
            "text_level": source.get("text_level", None),
            "index": position,
            "text": text,
        })
    return cleaned


# === Cell 5: Heading Detection (reused) ===
def classify_heading(text: str) -> Optional[Tuple[str, int, str]]:
    """Parse a line as a heading using ``HEADING_RE``.

    Args:
        text: Candidate heading line.

    Returns:
        ``None`` when the pattern does not match; otherwise a tuple
        ``(clause_id, level, title)`` where ``level`` is the number of
        dot-separated parts in ``clause_id`` and ``title`` is the trimmed
        text after the id (``""`` when absent).
    """
    match = HEADING_RE.match(text)
    if match is None:
        return None
    clause_id = match.group(1)
    raw_title = match.group(2)
    title = raw_title.strip() if raw_title else ""
    depth = len(clause_id.split("."))
    return clause_id, depth, title

# def find_headings(blocks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
#     heads = []
#     for idx, b in enumerate(blocks):
#         if b["type"] != "text": continue
#         ch = classify_heading(b.get("text") or "")
#         if not ch: continue
#         clause_id, level, title = ch
#         heads.append({
#             "block_idx": idx, "clause_id": clause_id, "level": level,
#             "title": title, "page_idx": b.get("page_idx"), "bbox": b.get("bbox"),
#         })
#     return heads

def find_headings(blocks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Collect headings in strict mode, printing a debug trace along the way.

    A text block is accepted as a heading only when ``classify_heading``
    parses it AND its ``text_level`` equals 1. Blocks satisfying just one of
    the two conditions are reported on stdout and skipped.

    Args:
        blocks: Cleaned blocks; ``type``, ``text``, ``text_level``,
            ``page_idx`` and ``bbox`` are read.

    Returns:
        One dict per accepted heading with the keys ``block_idx``,
        ``clause_id``, ``level``, ``title``, ``page_idx`` and ``bbox``.
    """
    print("--- Debugging Heading Detection (Strict Mode) ---")
    found: List[Dict[str, Any]] = []

    for position, block in enumerate(blocks):
        if block["type"] != "text":
            continue

        stripped = block.get("text", "").strip()
        level_flag = block.get("text_level")
        parsed = classify_heading(stripped)
        is_top_level = level_flag == 1

        if parsed is None:
            if is_top_level:
                # text_level is 1 but the regex rejects the line (e.g. a "CONTENTS" title).
                print(f"[SKIP - No Regex] Block {position}: text_level=1 but no Regex match. Text: {stripped[:50]}...")
            continue

        if not is_top_level:
            # The regex matches but text_level is not 1 (e.g. 2.1, 2.2 paragraphs).
            print(f"[SKIP - Low Level] Block {position}: Matches Regex but text_level={level_flag}. Text: {stripped[:50]}...")
            continue

        # Strict rule satisfied: both signals agree.
        clause_id, level, title = parsed
        print(f"[MATCH] Block {position}: ID={clause_id} | Title={title[:30]}... | Level={level}")
        found.append({
            "block_idx": position,
            "clause_id": clause_id,
            "level": level,
            "title": title,
            "page_idx": block.get("page_idx"),
            "bbox": block.get("bbox"),
        })

    print(f"--- Total Headings Found: {len(found)} ---")
    return found

# === Cell 6: Slice bodies (MODIFIED) ===
def slice_body_with_tables(blocks: List[Dict[str, Any]], head_idx: int, next_head_idx: Optional[int]) -> Tuple[str, List[int], int, int]:
    """Gather the text and table content that follows a heading.

    Args:
        blocks: Cleaned blocks.
        head_idx: Index of the heading block; collection starts right after it.
        next_head_idx: Index of the next heading (exclusive stop), or ``None``
            to run to the end of ``blocks``.

    Returns:
        ``(body, body_indices, page_start, page_end)``: the non-empty pieces
        joined by blank lines, the indices of the blocks that contributed,
        the heading's ``page_idx``, and the ``page_idx`` of the last
        contributing block that has one (the heading's page when none does).
    """
    stop = len(blocks) if next_head_idx is None else next_head_idx
    first_page = blocks[head_idx].get("page_idx")
    last_page = first_page
    pieces: List[str] = []
    used: List[int] = []

    for idx in range(head_idx + 1, stop):
        block = blocks[idx]
        # Only "text" and "table" blocks contribute to the body.
        if block["type"] not in ("text", "table"):
            continue
        stripped = (block.get("text") or "").strip()
        if not stripped:
            continue
        pieces.append(stripped)
        used.append(idx)
        page = block.get("page_idx")
        if page is not None:
            last_page = page

    # Blocks are separated by a blank line.
    return "\n\n".join(pieces).strip(), used, first_page, last_page

def choose_longest_by_id(candidates: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Keep one row per ``clause_id``: the one with the largest ``body_len``.

    On a tie the earliest candidate wins. A missing ``body_len`` counts as 0.

    Args:
        candidates: Rows carrying at least a ``clause_id`` key.

    Returns:
        The surviving rows ordered by ``block_idx`` (rows without that key
        sort last).
    """
    winners: Dict[str, Dict[str, Any]] = {}
    for cand in candidates:
        key = cand["clause_id"]
        incumbent = winners.get(key)
        if incumbent is not None:
            longer = cand.get("body_len", 0) > incumbent.get("body_len", 0)
            if not longer:
                continue
        winners[key] = cand

    ordered = list(winners.values())
    ordered.sort(key=lambda row: row.get("block_idx", 10**9))
    return ordered


# === Cell 7: Build Clause Rows (MODIFIED) ===
def parent_of(cid: str) -> Optional[str]:
    """Return the id of the enclosing clause, or ``None`` for a top-level id.

    The parent is everything before the last ``"."``; an id without a dot
    has no parent.

    Args:
        cid: Clause id such as ``"3.4.5"``.

    Returns:
        The parent id (``"3.4"`` for ``"3.4.5"``) or ``None``.
    """
    prefix, dot, _last = cid.rpartition(".")
    if not dot:
        return None
    return prefix

def build_clause_rows(blocks: List[Dict[str, Any]], heads: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Build one output row per heading, then de-duplicate by clause id.

    Each heading owns the blocks up to the next heading. Its row holds the
    heading line followed by the body collected by ``slice_body_with_tables``,
    plus 1-based page numbers derived from the 0-based ``page_idx`` values.

    Args:
        blocks: Cleaned blocks.
        heads: Headings as produced by ``find_headings``, in block order.

    Returns:
        The rows after ``choose_longest_by_id`` has kept the longest row for
        each ``clause_id``.
    """
    def to_page_number(page_idx):
        # Convert a 0-based page index to a 1-based page number; None stays None.
        return None if page_idx is None else (page_idx or 0) + 1

    # Stop index for every heading: the next heading's block, None for the last.
    stops = [later["block_idx"] for later in heads[1:]]
    stops.append(None)

    rows = []
    for head, stop_idx in zip(heads, stops):
        start_idx = head["block_idx"]
        heading_line = blocks[start_idx].get("text", "") or ""
        body, used_indices, first_page_idx, last_page_idx = slice_body_with_tables(blocks, start_idx, stop_idx)

        if body:
            full_text = (heading_line + "\n\n" + body).strip()
        else:
            full_text = heading_line

        rows.append({
            "document_id": DOC_ID,
            "clause_id": head["clause_id"],
            "level": head["level"],
            "parent_id": parent_of(head["clause_id"]),
            "title": head["title"],
            "heading_page": to_page_number(head.get("page_idx")),
            "heading_bbox": head.get("bbox"),
            "page_start": to_page_number(first_page_idx),
            "page_end": to_page_number(last_page_idx),
            "full_text": full_text,
            "body_len": len(full_text),
            "heading_block_idx": start_idx,
            "body_block_indices": used_indices,
            "block_idx": start_idx,
        })

    return choose_longest_by_id(rows)


# === Cell 8: Metadata Extraction (reused) ===
def extract_pdf_metadata(pdf_path: Path) -> dict:
    """Read the title and author from a PDF's embedded metadata.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A dict with the keys ``doc_title`` and ``author``; each value is
        ``""`` when the PDF has no metadata or the field is missing or None.
    """
    pdf = fitz.open(str(pdf_path))
    try:
        md = pdf.metadata or {}
    finally:
        # Always release the document handle, even if reading metadata fails.
        pdf.close()
    # ``or ""`` also covers keys that are present but hold None.
    return {"doc_title": md.get("title") or "", "author": md.get("author") or ""}

def extract_json_metadata(json_path: Path) -> dict:
    """Extract document-level metadata from the content-list JSON.

    The version string is taken from the first banner table (as recognised by
    ``is_hkma_banner_table``) that contains a ``V.<n> - dd.mm.yy``-style
    token. Purpose, classification and previous version are each the first
    line following their label in the concatenated text blocks.

    Args:
        json_path: Path of the content-list JSON file (read as UTF-8).

    Returns:
        A dict with the keys ``version``, ``purpose``, ``classification`` and
        ``previous_version``; a value is ``""`` when nothing was found.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        raw_blocks = json.load(f)

    # Join the raw (un-normalised) text so the newlines the label patterns
    # rely on are still present; a None "text" value is treated as empty.
    all_text = "\n".join(
        b.get("text") or "" for b in raw_blocks if b.get("type") == "text"
    )

    version = ""
    for b in raw_blocks:
        if not is_hkma_banner_table(b):
            continue
        m = re.search(r'V\.\d+\s*[–\-]\s*\d{2}\.\d{2}\.\d{2,4}', b.get("table_body") or "")
        if m:
            version = m.group(0)
            break

    def _line_after(label: str) -> str:
        # Return the stripped first line that follows `label`, or "" if absent.
        found = re.search(label + r'\s*\n(.+)', all_text)
        return found.group(1).strip() if found else ""

    return {
        "version": version,
        "purpose": _line_after(r'Purpose'),
        "classification": _line_after(r'Classification'),
        "previous_version": _line_after(r'Previous guidelines superseded'),
    }

def merge_metadata(pdf_path: Path, json_path: Path) -> dict:
    """Combine PDF-derived and JSON-derived metadata into one dict.

    Args:
        pdf_path: Path passed to ``extract_pdf_metadata``.
        json_path: Path passed to ``extract_json_metadata``.

    Returns:
        A new dict holding the PDF metadata overlaid with the JSON metadata
        (JSON values win on a key collision).
    """
    merged = dict(extract_pdf_metadata(pdf_path))
    merged.update(extract_json_metadata(json_path))
    return merged


# === Cell 9: Execution (MODIFIED) ===
# Runs the whole pipeline: load JSON -> clean blocks -> detect headings ->
# build one row per clause -> attach document metadata -> write the CSV.

# 1. Load and clean
raw = load_content_list(INPUT_JSON)
blocks = build_blocks(raw)

# 2. Find headings
heads = find_headings(blocks)
if not heads:
    raise RuntimeError("No numeric headings found. Check HEADING_RE or the input content.")
print(f"Found {len(heads)} headings.")

# 3. Assemble clauses (tables are included in the body text)
clause_rows = build_clause_rows(blocks, heads)
print(f"Clauses after TOC-dedup (tables included): {len(clause_rows)}")

# 4. Sort
# Previous numeric sort, kept for reference (int() would fail on non-numeric ids
# such as "ANNEX 1"):
# def clause_sort_key(r: Dict[str, Any]):
#     parts = tuple(int(x) for x in r["clause_id"].split("."))
#     return (parts, r["level"])
# clause_rows_sorted = sorted(clause_rows, key=clause_sort_key)

# Keep document order by sorting on the heading's block position.
clause_rows_sorted = sorted(clause_rows, key=lambda x: x["heading_block_idx"])

# 5. Build the DataFrame and flatten list-valued columns into strings for CSV
df = pd.DataFrame(clause_rows_sorted)
if "body_block_indices" in df.columns:
    # e.g. [10, 11, 12] -> "10|11|12"
    df["body_block_indices"] = df["body_block_indices"].apply(lambda x: "|".join(map(str, x)))
if "heading_bbox" in df.columns:
    # Comma-joined coordinates; an empty/missing bbox becomes "".
    df["heading_bbox"] = df["heading_bbox"].apply(lambda bb: ",".join(map(str, bb)) if bb else "")

# 6. Add document metadata (the same value is broadcast to every row)
doc_metadata = merge_metadata(Path(INPUT_PDF), Path(INPUT_JSON))
for k, v in doc_metadata.items():
    df[k] = v

# 7. Arrange the columns and save
cols = [
    "document_id", "clause_id", "level", "parent_id", "title",
    "heading_page", "page_start", "page_end", "full_text", "body_len",
    "doc_title", "author", "version", "purpose", "classification", "previous_version",
    "heading_bbox", "heading_block_idx", "body_block_indices",
]
# Make sure every expected column exists (missing ones are filled with "")
for col in cols:
    if col not in df.columns:
        df[col] = ""
# Selecting `cols` also drops any column not listed above (e.g. "block_idx").
df = df[cols]

df.to_csv(OUT_AGGREGATED_CSV, index=False, encoding="utf-8")
print(f"✅ Successfully saved merged data to: {OUT_AGGREGATED_CSV}")

# NOTE(review): `display` is not defined in this file -- presumably the
# IPython/Jupyter builtin; confirm before running outside a notebook.
display(df.head())

--- Debugging Heading Detection (Strict Mode) ---
[SKIP - No Regex] Block 0: text_level=1 but no Regex match. Text: SUPPLEMENT TO THE GUIDELINE...
[SKIP - No Regex] Block 4: text_level=1 but no Regex match. Text: CONTENTS...
[SKIP - No Regex] Block 5: text_level=1 but no Regex match. Text: Page...
[MATCH] Block 9: ID=1 | Title=Introduction... | Level=1
[SKIP - Low Level] Block 10: Matches Regex but text_level=None. Text: 1.1 The current HKMA Guideline on Prevention of Mo...
[SKIP - Low Level] Block 11: Matches Regex but text_level=None. Text: 1.2 A number of significant developments have take...
[SKIP - Low Level] Block 12: Matches Regex but text_level=None. Text: 1.3 The HKMA considers it necessary to revise its ...
[SKIP - Low Level] Block 13: Matches Regex but text_level=None. Text: 1.4 This Supplement mainly reflects the regulatory...
[SKIP - Low Level] Block 14: Matches Regex but text_level=None. Text: 1.5 Unless indicated otherwise, provisions in this...
[SKIP - Low Level] Block 

Unnamed: 0,document_id,clause_id,level,parent_id,title,heading_page,page_start,page_end,full_text,body_len,doc_title,author,version,purpose,classification,previous_version,heading_bbox,heading_block_idx,body_block_indices
0,g33a,1,1,,Introduction,3,3,3,1. Introduction\n\n1.1 The current HKMA Guidel...,2739,,,,,,,1468031797,9,10|11|12|13|14|15|16|17
1,g33a,2,1,,Customer acceptance policy,4,4,4,2. Customer acceptance policy\n\n2.1 This is a...,2219,,,,,,,1448044797,18,19|20|21|22|23|24|25|26|27
2,g33a,3,1,,Customer due diligence,4,4,7,3. Customer due diligence\n\n3.1 This section ...,6579,,,,,,,144817410833,28,29|30|31|32|33|34|35|37|38|39|40|41|42|43|44|4...
3,g33a,4,1,,Corporate customers,7,7,8,4. Corporate customers\n\n4.1 This section sup...,4503,,,,,,,146408389423,52,53|54|55|56|57|59|60|61|62|63|64
4,g33a,5,1,,Trust and nominee accounts,8,8,9,5. Trust and nominee accounts\n\n5.1 This sect...,999,,,,,,,144818452833,65,66|67|68|69
