# -*- coding: utf-8 -*-
"""
分類 new_opinion.urls_dic[*] 為 7 類（A~G）：
A Lexis Commentary
B Statutes / Legislation
C Cases
D Secondary Sources
E Regulations / Court Rules
F Constitution
G Others

使用方式：
1) pip install pymongo
2) 修改 MONGO_URI / DB_NAME
3) 先 DRY_RUN=True 跑一次確認，再改 False 正式更新
"""

# In [None]:
from pymongo import MongoClient, UpdateOne
import re

# ====== Basic settings ======
# Connection string handed to MongoClient; "mongo_url" is a placeholder value
# that has to be replaced with a real MongoDB URI before running.
MONGO_URI = "mongo_url"
DB_NAME = "copyright"
COLL_NAME = "RST_Preprocessed_SBS"

# Number of queued UpdateOne operations that triggers a bulk_write flush in main().
BATCH_SIZE = 500
DRY_RUN = False   # True = trial run: documents are scanned and counted, but nothing is written
PROJECTION = {"urls_dic": 1}  # fetch only the field this script needs, to speed up the scan

# ====== Rules: regular expressions ======
# "Party v. Party" case captions.
RE_CASE_V = re.compile(r"\b[A-Z][A-Za-z0-9.&' -]+ v\. [A-Z][A-Za-z0-9.&' -]+")
# Case captions without an opposing party.
RE_CASE_ALT = re.compile(r"\b(In re|Ex parte)\b", re.I)
# Volume number followed by a reporter abbreviation.  The match ends with a
# "not followed by a letter" lookahead rather than \b: a trailing \b can never
# succeed after an abbreviation that ends in "." when a space or the end of the
# string follows, which made "499 U.S. 340" unmatchable.  The lookahead also
# keeps "17 U.S.C." from being read as a "U.S." reporter citation.  Optional
# spaces accept the spaced spellings "F. Supp.", "S. Ct." and "L. Ed.".
RE_REPORTER = re.compile(
    r"\b(\d+\s+(U\.S\.|S\. ?Ct\.|L\. ?Ed\.|F\.3d|F\.2d|F\. ?Supp\. ?\d*|USPQ|N\.E\. ?\d*|N\.W\. ?\d*|P\. ?\d*|So\. ?\d*|A\. ?\d*|Cal\.|N\.Y\.|Mass\.|Tex\.))(?![A-Za-z])"
)

RE_USC = re.compile(r"\b\d+\s*U\.?S\.?C\.?\b", re.I)
RE_SECTION_WORD = re.compile(r"\bsection\s+\d+", re.I)
RE_CFR = re.compile(r"\b\d+\s*C\.?F\.?R\.?\b", re.I)
RE_RULES = re.compile(r"(fed\.\s*r\.)|(f\.r\.(civ|app|evid)\.\s*p\.)|(local\s+rule)", re.I)
RE_CONS = re.compile(r"u\.s\.\s*const\.", re.I)
RE_TREATISE = re.compile(r"(nimmer|patry|mccarthy|wright\s*&\s*miller|restatement|matthew\s+bender)", re.I)

# Character whitelist for "statute fragment" strings: whitespace, digits, "§",
# basic punctuation, ASCII letters and hyphens.  Letters are not limited here,
# so looks_like_statute_fragment() has to rule out other citation shapes itself.
RE_MOSTLY_NUMERIC_SYMBOLS = re.compile(r"^[\s\d§().,;a-zA-Z-]+$")


def looks_like_statute_fragment(text: str) -> bool:
    """Return True when *text* looks like a bare statute reference such as "101(a)(2)".

    The text must be non-empty after stripping, consist only of characters
    allowed by RE_MOSTLY_NUMERIC_SYMBOLS and contain at least one digit.
    Because that whitelist also admits ordinary words, strings that carry a
    more specific citation shape (case caption, reporter citation, C.F.R.,
    court rule, constitution, named treatise) are rejected so they are not
    mistaken for statute fragments.

    Args:
        text: Citation text; None or an empty string yields False.

    Returns:
        True if the text passes every check above, otherwise False.
    """
    t = (text or "").strip()
    if not t or not RE_MOSTLY_NUMERIC_SYMBOLS.match(t):
        return False
    if not re.search(r"\d", t):
        return False
    # Case-like text ("A v. B", "In re X", "499 U.S. 340") is not a statute.
    if RE_CASE_V.search(t) or RE_CASE_ALT.search(t) or RE_REPORTER.search(t):
        return False
    # Neither are regulations, court rules, the constitution or treatises.
    if RE_CFR.search(t) or RE_RULES.search(t) or RE_CONS.search(t) or RE_TREATISE.search(t):
        return False
    return True


def classify_category(raw_text: str, link: str):
    """Classify one citation entry into exactly one of the seven A-G categories.

    Categories:
      A Lexis Commentary
      B Statutes / Legislation
      C Cases
      D Secondary Sources
      E Regulations / Court Rules
      F Constitution
      G Others

    Evaluation order: headnote markers (A) first, then the ``collection=``
    parameter of the link, then text rules.  Among the text rules the specific
    ones (E, F, D) run before the generic statute test (B), because "§" and the
    fragment heuristic also fire on citations such as "37 C.F.R. § 201.1",
    "U.S. Const. art. I, § 8" or "4 Nimmer on Copyright § 13.03".

    Args:
        raw_text: Visible citation text; None is treated as "".
        link: Target URL of the citation; None is treated as "".  It is
            lower-cased before matching.

    Returns:
        A tuple ``(category, source_signal, confidence, rule_id)`` where
        source_signal is "link_collection", "raw_text_regex" or "none".
    """
    t = (raw_text or "").strip()
    l = (link or "").strip().lower()

    # --- A: Lexis Commentary (HN / LNHNREF / Headnote)
    if "lnhnref" in l or re.match(r"^hn\d+", t, re.I) or "headnote" in t.lower():
        return "Lexis Commentary", ("link_collection" if "lnhnref" in l else "raw_text_regex"), 1.0, "A_HN"

    # --- Link first: the collection parameter is treated as decisive ---
    if "collection=statutes-legislation" in l:
        return "Statutes / Legislation", "link_collection", 1.0, "B_link_statutes"

    if "collection=cases" in l:
        return "Cases", "link_collection", 1.0, "C_link_cases"

    if "collection=analytical-materials" in l:
        return "Secondary Sources", "link_collection", 1.0, "D_link_analytical"

    if "collection=law-reviews-journals" in l:
        return "Secondary Sources", "link_collection", 1.0, "D_link_lawreviews"  # journals also go to D

    if "collection=dockets" in l:
        return "Others", "link_collection", 1.0, "G_link_dockets"

    # --- No collection parameter: text rules, most specific first ---
    # E: Regulations / Court Rules
    if RE_CFR.search(t) or RE_RULES.search(t):
        return "Regulations / Court Rules", "raw_text_regex", 0.9, "E_regex"

    # F: Constitution
    if RE_CONS.search(t) or re.search(r"\bconstitution\b", t, re.I):
        return "Constitution", "raw_text_regex", 0.85, "F_regex"

    # D: Secondary Sources (names of treatises / practice guides)
    if RE_TREATISE.search(t):
        return "Secondary Sources", "raw_text_regex", 0.85, "D_regex"

    # B: Statutes / Legislation (generic "§" / U.S.C. / "section N" / bare fragment)
    if ("§" in t) or RE_USC.search(t) or RE_SECTION_WORD.search(t) or looks_like_statute_fragment(t):
        return "Statutes / Legislation", "raw_text_regex", 0.9, "B_regex"

    # C: Cases (v. / In re / Ex parte / reporter citation)
    if RE_CASE_V.search(t) or RE_CASE_ALT.search(t) or RE_REPORTER.search(t):
        return "Cases", "raw_text_regex", 0.85, "C_regex"

    # G: Others
    return "Others", "none", 0.3, "G_fallback"

def _confidence_matches(stored, expected) -> bool:
    """Return True when *stored* is numerically equal to *expected*.

    A stored value that cannot be converted to float (None, arbitrary text)
    counts as a mismatch instead of raising, so one malformed entry cannot
    abort the whole run.
    """
    try:
        return float(stored) == float(expected)
    except (TypeError, ValueError):
        return False


def _reclassify_entries(entries):
    """Re-run classify_category over the entries of one document.

    Returns ``(new_entries, changed)``.  Entries whose four classification
    fields already hold the computed values are reused as-is; stale entries
    are copied and overwritten.  Entries that are not dicts are passed through
    untouched because they have no fields to classify.
    """
    refreshed = []
    changed = False
    for entry in entries:
        if not isinstance(entry, dict):
            refreshed.append(entry)
            continue

        raw_text = entry.get("raw_text", "") or ""
        link = entry.get("link", "") or ""
        category, signal, conf, rule_id = classify_category(raw_text, link)

        # Only the category (the main requirement) and the diagnostic fields
        # are written; subtype / is_fragment are no longer produced.
        up_to_date = (
            entry.get("category") == category
            and entry.get("source_signal") == signal
            and _confidence_matches(entry.get("confidence", -1), conf)
            and entry.get("rule_id") == rule_id
        )
        if up_to_date:
            refreshed.append(entry)
            continue

        refreshed.append({
            **entry,
            "category": category,
            "source_signal": signal,
            "confidence": conf,
            "rule_id": rule_id,
        })
        changed = True
    return refreshed, changed


def main():
    """Classify every ``urls_dic`` entry of the configured collection.

    Scans all documents (fetching only the fields in PROJECTION), recomputes
    the classification of each entry and, unless DRY_RUN is set, rewrites the
    ``urls_dic`` array of every document that changed.  Writes are sent in
    unordered bulk batches of BATCH_SIZE operations.  A one-line summary is
    printed when the scan completes; with DRY_RUN the "updated" figure is the
    number of documents that would have been rewritten.
    """
    client = MongoClient(MONGO_URI)
    try:
        col = client[DB_NAME][COLL_NAME]

        total_docs = 0
        updated_docs = 0
        ops = []

        for doc in col.find({}, PROJECTION):
            total_docs += 1
            new_urls, dirty = _reclassify_entries(doc.get("urls_dic", []) or [])
            if not dirty:
                continue

            updated_docs += 1
            if DRY_RUN:
                continue

            ops.append(
                UpdateOne({"_id": doc["_id"]}, {"$set": {"urls_dic": new_urls}})
            )
            if len(ops) >= BATCH_SIZE:
                col.bulk_write(ops, ordered=False)
                ops = []

        # Flush the final partial batch.
        if ops and not DRY_RUN:
            col.bulk_write(ops, ordered=False)
    finally:
        # Release the connection pool even when the scan or a write fails.
        client.close()

    print(f"[DONE] scanned={total_docs} | updated={updated_docs} | DRY_RUN={DRY_RUN}")


if __name__ == "__main__":
    main()


# Recorded output: [DONE] scanned=2183 | updated=2183 | DRY_RUN=False
