# 02. 조문 파싱 (Self-contained)

파서 구현을 셀에 직접 포함합니다.

In [None]:
import json
import re
from dataclasses import dataclass, field
from pathlib import Path


def normalize_to_list(value):
    """Coerce a value that may be a dict, a list, or missing into a list.

    A list is returned unchanged (the same object, not a copy), a single
    dict is wrapped in a one-element list, and anything else -- including
    ``None`` -- yields an empty list.
    """
    if isinstance(value, list):
        return value
    return [value] if isinstance(value, dict) else []


def normalize_paragraph_num(raw: str) -> str:
    """Convert a circled paragraph numeral (e.g. ``①``) to a decimal string.

    Args:
        raw: Paragraph number as found in the source data. ``None`` and
            other falsy values are treated as an empty string; non-string
            values are converted with ``str``.

    Returns:
        The decimal number as a string for a single circled numeral in the
        range 1-50, otherwise the stripped input unchanged.
    """
    # Unicode stores circled numbers in three contiguous runs:
    # (first code point, last code point, number of the first code point).
    circled_ranges = (
        (0x2460, 0x2473, 1),   # ① .. ⑳
        (0x3251, 0x325F, 21),  # ㉑ .. ㉟
        (0x32B1, 0x32BF, 36),  # ㊱ .. ㊿
    )
    text = str(raw or "").strip()
    if len(text) == 1:
        code_point = ord(text)
        for first, last, base in circled_ranges:
            if first <= code_point <= last:
                return str(code_point - first + base)
    return text


def classify_law_type(law_name: str) -> str:
    """Classify a law by the marker that appears in its name.

    Markers are tested in order, so a name containing "시행규칙" is reported
    as "시행규칙" even if it also contains "시행령". Names with neither
    marker are classified as "법률".
    """
    for marker in ("시행규칙", "시행령"):
        if marker in law_name:
            return marker
    return "법률"


def parse_article_numbers(article: dict, header: str) -> tuple[str, str]:
    """Return the article's main number and branch number as strings.

    The values come from the ``조문번호`` / ``조문가지번호`` fields. When a
    field is empty, the number is recovered from the article label at the
    very start of ``header`` (e.g. ``제4조의2`` gives ``("4", "2")``).

    Args:
        article: Raw article mapping.
        header: Article text, expected to begin with the article label.

    Returns:
        ``(main, sub)``; either element is ``""`` when it cannot be determined.
    """
    main = str(article.get("조문번호", "") or "").strip()
    sub = str(article.get("조문가지번호", "") or "").strip()

    # Nothing to repair: main is known, and either sub is known too or the
    # header cannot contain a branch number ("의") at all.
    if main and (sub or "의" not in header):
        return main, sub

    # Anchor at the start of the header. An unanchored search could pick up a
    # cross-reference inside the body text (e.g. "...제10조의2를 준용한다")
    # and report it as this article's own number.
    m = re.match(r"\s*제\s*(\d+)\s*조(?:\s*의\s*(\d+))?", header)
    if m:
        main = main or m.group(1)
        sub = sub or (m.group(2) or "")

    return main, sub


@dataclass
class ArticleChunk:
    """One parsed article of a law, with its text and metadata.

    ``parse_article`` fills in the identification fields, ``content``,
    ``paragraphs``, ``effective_date`` and ``change_type``; every other
    field keeps its default there.
    """

    # Name of the law this article belongs to.
    law_name: str
    # Identifier of the law (parse_article stores it as a string).
    law_id: str
    # Result of classify_law_type: "시행규칙", "시행령" or "법률".
    law_type: str
    # Main article number, e.g. "4" for 제4조의2.
    article_num: str
    # Branch number, e.g. "2" for 제4조의2; empty when there is none.
    article_sub: str = ""
    # Article title taken from the "조문제목" field.
    article_title: str = ""
    # Header, paragraph, subparagraph and item texts joined with newlines.
    content: str = ""
    # Not set by the parser shown here -- presumably the content after
    # references/abbreviations are resolved by a later step (TODO confirm).
    content_resolved: str = ""
    # Nested structure: each paragraph is {"num", "content", "subs"}, each
    # sub is {"num", "content", "items"}, each item is {"num", "content"}.
    paragraphs: list[dict] = field(default_factory=list)
    # The three reference lists and the abbreviation map below are left empty
    # by the parser shown here; presumably populated downstream (TODO confirm).
    internal_refs: list = field(default_factory=list)
    external_refs: list = field(default_factory=list)
    parent_law_refs: list = field(default_factory=list)
    abbreviations: dict = field(default_factory=dict)
    # Taken from the "조문시행일자" field.
    effective_date: str = ""
    # Taken from the "조문제개정유형" field.
    change_type: str = ""


def _clean_text(value) -> str:
    """Return *value* as stripped text, treating ``None`` as an empty string."""
    return "" if value is None else str(value).strip()


def parse_article(article: dict, law_name: str, law_id: str):
    """Convert one raw article unit into an ``ArticleChunk``.

    Args:
        article: Raw article mapping. Only entries whose ``조문여부`` equals
            ``"조문"`` are parsed.
        law_name: Name of the law the article belongs to.
        law_id: Identifier of the law; stored as a string.

    Returns:
        An ``ArticleChunk`` whose ``content`` is the header plus every
        non-empty paragraph (항), subparagraph (호) and item (목) text in
        document order, joined by newlines; ``None`` for entries that are not
        articles.
    """
    if article.get("조문여부") != "조문":
        return None

    # Missing or null fields become "" rather than the literal text "None".
    header = _clean_text(article.get("조문내용"))
    content_parts = [header] if header else []

    article_num, article_sub = parse_article_numbers(article, header)

    paragraphs = []
    for para in normalize_to_list(article.get("항")):
        # The paragraph number is kept as found (e.g. a circled numeral);
        # normalize_paragraph_num can convert it to digits if needed.
        para_num = _clean_text(para.get("항번호"))
        para_content = _clean_text(para.get("항내용"))
        if para_content:
            content_parts.append(para_content)

        subs = []
        for sub in normalize_to_list(para.get("호")):
            # Numbers such as "1." are stored without the trailing period.
            sub_num = _clean_text(sub.get("호번호")).rstrip(".")
            sub_content = _clean_text(sub.get("호내용"))
            if sub_content:
                content_parts.append(sub_content)

            items = []
            for item in normalize_to_list(sub.get("목")):
                item_num = _clean_text(item.get("목번호")).rstrip(".")
                item_content = _clean_text(item.get("목내용"))
                if item_content:
                    content_parts.append(item_content)
                items.append({"num": item_num, "content": item_content})

            subs.append({"num": sub_num, "content": sub_content, "items": items})

        paragraphs.append({"num": para_num, "content": para_content, "subs": subs})

    return ArticleChunk(
        law_name=law_name,
        law_id=str(law_id),
        law_type=classify_law_type(law_name),
        article_num=article_num,
        article_sub=article_sub,
        article_title=_clean_text(article.get("조문제목")),
        # Only non-empty parts were collected, so no extra filtering is needed.
        content="\n".join(content_parts),
        paragraphs=paragraphs,
        effective_date=_clean_text(article.get("조문시행일자")),
        change_type=_clean_text(article.get("조문제개정유형")),
    )


def parse_law_data(data: dict):
    """Parse a whole law payload into a list of article chunks.

    Reads the law name and ID from ``data["법령"]["기본정보"]`` and the raw
    article units from ``data["법령"]["조문"]["조문단위"]``. Entries for which
    ``parse_article`` returns nothing are left out of the result.

    Raises:
        KeyError: If a required key is missing from the payload.
    """
    law = data["법령"]
    basic_info = law["기본정보"]
    name = basic_info["법령명_한글"]
    ident = str(basic_info["법령ID"])
    raw_articles = normalize_to_list(law["조문"].get("조문단위"))

    parsed = (parse_article(raw, name, ident) for raw in raw_articles)
    return [chunk for chunk in parsed if chunk]


In [None]:
# Load one raw law JSON file and parse it into article chunks.
raw_dir = Path('data/processed/raw')
files = sorted(raw_dir.glob('*.json'))
# NOTE(review): index 1 selects the second file in sorted order and raises
# IndexError when fewer than two JSON files exist -- confirm this is the
# intended sample.
payload = json.loads(files[1].read_text(encoding='utf-8'))
chunks = parse_law_data(payload)
print('chunk count:', len(chunks))
# NOTE(review): chunks[0] raises IndexError if parsing produced no chunks.
print(chunks[0].law_name, chunks[0].article_num, chunks[0].article_sub, chunks[0].article_title)
print('head:', chunks[0].content[:180])


In [None]:
# Bare expression: in a notebook this shows the parsed chunks as the cell's output.
chunks

In [None]:
from dataclasses import asdict


def chunk_to_dict(chunk: ArticleChunk) -> dict:
    """Return the chunk's fields as a plain dictionary.

    Delegates to ``dataclasses.asdict``, so the result is suitable for
    ``json.dumps`` as long as the field values themselves are serializable.
    """
    as_plain_dict = asdict(chunk)
    return as_plain_dict


def sanitize_filename(name: str) -> str:
    """Make ``name`` usable as a file name.

    Alphanumeric characters (including non-ASCII letters and digits) and
    ``_``, ``-``, ``.`` and spaces are kept; every other character becomes
    ``_``. Surrounding whitespace is then stripped and the remaining spaces
    are turned into underscores.
    """
    allowed_punctuation = "_- ."
    cleaned = "".join(
        ch if (ch.isalnum() or ch in allowed_punctuation) else "_"
        for ch in name
    )
    return cleaned.strip().replace(" ", "_")


In [None]:
# Save the chunks as JSON so their structure can be verified
out_dir = Path('data/processed/chunks')
out_dir.mkdir(parents=True, exist_ok=True)

chunk_dicts = [chunk_to_dict(c) for c in chunks]

# Name the output file after the first chunk's law; use a generic name when
# there are no chunks.
if chunks:
    law_id = chunks[0].law_id
    law_name = chunks[0].law_name.replace(' ', '_')
    filename = f'{law_id}_{sanitize_filename(law_name)}_chunks.json'
else:
    filename = 'chunks.json'

out_path = out_dir / filename
# ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
out_path.write_text(json.dumps(chunk_dicts, ensure_ascii=False, indent=2), encoding='utf-8')
print('saved:', out_path)
print('saved chunk count:', len(chunk_dicts))

# Sample for a quick check
chunk_dicts[0] if chunk_dicts else {}


In [None]:
# Peek at the fifth chunk; fall back to {} when fewer than five chunks exist
# (a plain non-empty check would still raise IndexError for 1-4 chunks).
chunk_dicts[4] if len(chunk_dicts) > 4 else {}