# 02. 자치법규(ordin) 파싱 (Self-contained)

- `LawService > 조문 > 조` 구조를 파싱합니다.
- `조내용` 텍스트에서 조 머리말(`제n조(제목)`)을 정규식으로 제거하고, `①`~`⑳` 기호를 기준으로 항을 분리합니다. (호 단위 분해는 아직 구현되어 있지 않습니다.)
- 결과는 `chunks_ordin` 폴더에 저장합니다.


In [None]:
import json
import re
from pathlib import Path
from dataclasses import dataclass, asdict, field


def normalize_to_list(v):
    """Coerce a JSON value that may be a single object or an array into a list.

    Args:
        v: Any value; typically a ``dict``, a ``list`` or ``None``.

    Returns:
        ``v`` itself when it is already a list, ``[v]`` when it is a dict,
        and an empty list for everything else (including ``None``).
    """
    if isinstance(v, list):
        return v
    return [v] if isinstance(v, dict) else []


def clean_text(s: str) -> str:
    """Collapse every run of whitespace into one space and trim both ends.

    Falsy input (``None``, ``''``, ``0``) is treated as the empty string;
    any other non-string value is converted with ``str()`` first.
    """
    raw = str(s or '')
    collapsed = re.sub(r'\s+', ' ', raw)
    return collapsed.strip()


def parse_article_number_from_title(title_or_content: str):
    """Extract the article number from a leading ``제n조`` / ``제n조의m`` header.

    Examples: ``제2조(정의)`` -> ``('2', '')``, ``제2조의2(정의)`` -> ``('2', '2')``.

    The header must appear at the start of the text (leading whitespace is
    ignored). Matching anywhere in the text would pick up cross-references
    such as "제5조에 따른 ..." inside an article body and report the wrong
    article number instead of letting the caller fall back to another source.

    Args:
        title_or_content: Article title or full article text; ``None`` and
            other falsy values are treated as the empty string.

    Returns:
        ``(main, sub)`` as digit strings. ``sub`` is ``''`` when the header
        has no "의m" branch number; both are ``''`` when no header is found.
    """
    text = str(title_or_content or '')
    header = re.match(r'\s*제\s*(\d+)\s*조(?:의\s*(\d+))?', text)
    if header is None:
        return '', ''
    return header.group(1), header.group(2) or ''


def parse_article_number_from_code(code):
    """Decode a numeric article code into ``(main, sub)`` article numbers.

    The last two digits are the branch ("의m") number and the rest is the
    main article number, e.g. ``000100`` -> Article 1, ``000201`` -> Article 2-1.

    Args:
        code: The code as a string or number, or a list whose first element
            is the code. Falsy values are treated as missing.

    Returns:
        ``(main, sub)`` as strings; ``sub`` is ``''`` when the branch number
        is zero. ``('', '')`` when the code is missing or not a plain
        decimal number.
    """
    value = code
    if isinstance(value, list):
        value = value[0] if value else ''
    value = str(value or '').strip()
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as '²' or '①' that int() rejects with ValueError.
    if not value.isdecimal():
        return '', ''
    main_n, sub_n = divmod(int(value), 100)
    return str(main_n), (str(sub_n) if sub_n > 0 else '')


def remove_article_header(text: str) -> str:
    """Strip a leading ``제n조(title)`` / ``제n조의m(title)`` header from article text.

    Only a header that has a parenthesised title is removed; the title is
    taken to end at the first ``)``. Text without such a header is returned
    trimmed but otherwise unchanged.
    """
    stripped = str(text or '').strip()
    # Drop the leading '제n조(title)' header, if there is one.
    header = re.match(r'^제\s*\d+\s*조(?:의\s*\d+)?\s*\([^)]*\)\s*', stripped)
    if header is not None:
        stripped = stripped[header.end():]
    return stripped.strip()


# Circled-number markers ① through ⑳ (presumably the paragraph/항 markers).
# Nothing in the visible code references this constant; split_paragraphs
# matches the same characters through the regex class [①-⑳].
CIRCLED = '①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳'


def split_paragraphs(body: str):
    """Split article body text into paragraphs at circled-number markers (① to ⑳).

    Args:
        body: Article text, normally with the ``제n조(title)`` header already
            removed. Falsy values are treated as the empty string.

    Returns:
        A list of ``{'num': marker, 'content': text}`` dicts in document
        order. ``content`` keeps its own marker. When the text has no marker
        at all, the result is one entry with ``num == ''`` holding the whole
        text. Any text that precedes the first marker is kept as a leading
        entry with ``num == ''`` instead of being silently discarded.
    """
    text = str(body or '').strip()
    marks = list(re.finditer(r'[①-⑳]', text))
    if not marks:
        return [{'num': '', 'content': text}]

    out = []
    # Keep whatever comes before the first marker; dropping it loses content.
    preamble = text[:marks[0].start()].strip()
    if preamble:
        out.append({'num': '', 'content': preamble})

    # Each paragraph runs from its marker up to the next marker (or the end).
    ends = [m.start() for m in marks[1:]] + [len(text)]
    for mark, end in zip(marks, ends):
        out.append({'num': mark.group(0), 'content': text[mark.start():end].strip()})
    return out


In [None]:
@dataclass
class OrdinChunk:
    """One article of a local ordinance together with ordinance-level metadata.

    Instances are created by ``parse_ordin_data`` (one per article entry that
    is not a heading) and serialised with ``dataclasses.asdict``.
    """

    # Ordinance name and ID, read from the '자치법규명' / '자치법규ID' fields.
    law_name: str
    law_id: str
    # parse_ordin_data always passes '조례' here.
    law_type: str = '조례'
    # Main article number as a digit string; '' when it could not be determined.
    article_num: str = ''
    # Branch number (the m in "제n조의m"); '0' when the article has none.
    article_sub: str = '0'
    # Article title from the '조제목' field.
    article_title: str = ''
    # Raw '조내용' text, stripped, with the article header still included.
    content: str = ''
    # {'num': marker, 'content': text} dicts as produced by split_paragraphs.
    paragraphs: list[dict] = field(default_factory=list)
    # Ordinance-level values from '시행일자', '제개정정보' and '지자체기관명'.
    effective_date: str = ''
    change_type: str = ''
    municipality: str = ''
    # Most recent heading entry (조문여부 == 'N') seen before this article.
    chapter_title: str = ''


def parse_ordin_data(data: dict):
    """Convert one parsed ordinance JSON document into a list of ``OrdinChunk``.

    Reads ordinance metadata from ``LawService > 자치법규기본정보`` and the
    article entries from ``LawService > 조문 > 조`` (a single dict or a list).

    Entries whose ``조문여부`` is ``N`` are headings (chapter/section titles):
    they produce no chunk, but their text is remembered and attached to the
    following articles as ``chapter_title``.

    Args:
        data: The decoded JSON document.

    Returns:
        One ``OrdinChunk`` per article entry, in document order.
    """
    def _text(mapping, key):
        # Missing, None and other falsy values all become ''.
        return str(mapping.get(key, '') or '').strip()

    service = data.get('LawService', {}) or {}
    basic_info = service.get('자치법규기본정보', {}) or {}

    # Values shared by every chunk of this ordinance.
    shared = {
        'law_name': _text(basic_info, '자치법규명'),
        'law_id': _text(basic_info, '자치법규ID'),
        'effective_date': _text(basic_info, '시행일자'),
        'change_type': _text(basic_info, '제개정정보'),
        'municipality': _text(basic_info, '지자체기관명'),
    }

    article_section = service.get('조문', {}) or {}

    chunks = []
    chapter_title = ''
    for article in normalize_to_list(article_section.get('조')):
        if not isinstance(article, dict):
            continue

        title = _text(article, '조제목')
        raw_content = _text(article, '조내용')

        # 조문여부 == 'N' marks a chapter/section heading: not indexed as a
        # chunk, only carried forward as metadata.
        if _text(article, '조문여부').upper() == 'N':
            heading = clean_text(title or raw_content)
            if heading:
                chapter_title = heading
            continue

        # Article number: try the content first, then the title, and fall
        # back to the numeric code last.
        number, sub_number = parse_article_number_from_title(raw_content)
        if not number:
            number, sub_number = parse_article_number_from_title(title)
        if not number:
            number, sub_number = parse_article_number_from_code(article.get('조문번호'))

        chunks.append(
            OrdinChunk(
                law_type='조례',
                article_num=str(number or ''),
                article_sub=str(sub_number or '0'),
                article_title=title,
                content=raw_content,
                paragraphs=split_paragraphs(remove_article_header(raw_content)),
                chapter_title=chapter_title,
                **shared,
            )
        )

    return chunks




def resolve_ordin_raw_dir() -> Path:
    """Locate the raw ordinance JSON directory relative to the working directory.

    Two locations are tried in order: the path as seen from the project
    folder, then the path as seen from a directory three levels above it.

    Returns:
        The first candidate path that exists.

    Raises:
        FileNotFoundError: If neither candidate exists.
    """
    candidates = [
        Path('data/processed/raw/ordin'),
        Path('notebooks/research_mvp/data/processed/raw/ordin'),
    ]
    found = next((path for path in candidates if path.exists()), None)
    if found is None:
        raise FileNotFoundError(f'ordin raw 폴더를 찾지 못했습니다: {candidates}')
    return found


def parse_all_ordin(raw_dir: Path, limit: int | None = None):
    files = sorted(raw_dir.glob('ordin_*.json'))
    if limit is not None:
        files = files[:limit]

    all_chunks = []
    per_law = {}
    for p in files:
        try:
            data = json.loads(p.read_text(encoding='utf-8'))
            chunks = parse_ordin_data(data)
            all_chunks.extend(chunks)
            if chunks:
                lid = chunks[0].law_id.zfill(7) if str(chunks[0].law_id).isdigit() else chunks[0].law_id
                per_law[lid] = chunks
        except Exception as e:
            print(f'[warn] parse failed: {p.name} / {e}')

    return all_chunks, per_law


def save_ordin_chunks(all_chunks, per_law, out_dir: str = 'data/processed/chunks_ordin'):
    """Write the parsed chunks to JSON files under ``out_dir``.

    Produces ``all_ordin_chunks.json`` with every chunk, plus one
    ``{law_id}_{law_name}_chunks.json`` file per entry of ``per_law``
    ('/' and spaces in the name become '_'). The directory is created if
    it does not exist.

    Args:
        all_chunks: Every chunk, as dataclass instances.
        per_law: Mapping of law ID to that law's chunks.
        out_dir: Destination directory.

    Returns:
        The written paths, with the combined file first.
    """
    def _dump(target: Path, items) -> Path:
        # Serialise dataclass instances as pretty-printed UTF-8 JSON.
        payload = json.dumps([asdict(item) for item in items], ensure_ascii=False, indent=2)
        target.write_text(payload, encoding='utf-8')
        return target

    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    written = [_dump(target_dir / 'all_ordin_chunks.json', all_chunks)]
    for law_id, law_chunks in per_law.items():
        # An empty chunk list has no name to use, so fall back to the ID.
        label = law_chunks[0].law_name if law_chunks else law_id
        file_stem = str(label).replace('/', '_').replace(' ', '_')
        written.append(_dump(target_dir / f'{law_id}_{file_stem}_chunks.json', law_chunks))

    return written


In [None]:
# Run the pipeline: locate the raw files, parse them, then write the chunk files.
raw_dir = resolve_ordin_raw_dir()
print('raw_dir:', raw_dir)

all_ordin_chunks, per_law_chunks = parse_all_ordin(raw_dir, limit=50)  # pass limit=None to process every file
print('parsed chunks:', len(all_ordin_chunks))
print('laws:', len(per_law_chunks))

saved_paths = save_ordin_chunks(all_ordin_chunks, per_law_chunks)
print('saved files:', len(saved_paths))
print('sample:', [str(x) for x in saved_paths[:5]])

# Inspect the first parsed chunk as a quick sanity check.
if all_ordin_chunks:
    s = all_ordin_chunks[0]
    print('sample article:', s.article_num, s.article_sub, s.article_title)
    print('sample content head:', clean_text(s.content)[:180])
    print('sample paragraphs:', s.paragraphs[:2])
