# 01. 법령 원문 수집 (Self-contained)

이 노트북은 함수 정의부터 실행까지 모두 포함합니다.

In [None]:
import json
import os
from pathlib import Path

import requests
from dotenv import load_dotenv

# Load variables from a .env file into the process environment so the
# os.getenv("OC") lookups in the fetch helpers can find the OC value.
load_dotenv()

# Endpoint queried by both fetch_law_json and fetch_ordin_json.
LAW_SERVICE_URL = "http://www.law.go.kr/DRF/lawService.do"
# Law IDs fetched when fetch_and_save_laws is called without arguments.
DEFAULT_LAW_IDS = ("1823", "2118")


def fetch_law_json(law_id: str, oc: str | None = None) -> dict:
    oc_value = oc or os.getenv("OC", "")
    if not oc_value:
        raise ValueError("Missing OC. Set OC in environment or pass explicitly.")

    params = {"OC": oc_value, "target": "eflaw", "ID": law_id, "type": "JSON"}
    response = requests.get(LAW_SERVICE_URL, params=params, timeout=30)
    response.raise_for_status()
    return response.json()


def fetch_and_save_laws(law_ids: tuple[str, ...] = DEFAULT_LAW_IDS, output_dir: str = "data/processed/raw") -> list[Path]:
    """Fetch each law and write its raw JSON payload into ``output_dir``.

    Each file is named ``{law_id}_{law_name}.json``. ``law_name`` is read from
    ``payload["법령"]["기본정보"]["법령명_한글"]``; spaces and path separators
    in it are replaced with underscores, and the law ID is used instead when
    the name is missing, null, or empty.

    Args:
        law_ids: IDs passed one by one to ``fetch_law_json``.
        output_dir: Directory that receives the files; created if absent.

    Returns:
        Paths of the files written, in the order of ``law_ids``.

    Raises:
        KeyError: If a payload lacks the ``"법령"`` or ``"기본정보"`` key.
        ValueError, requests.HTTPError: Propagated from ``fetch_law_json``.
    """
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    saved_files = []
    for law_id in law_ids:
        payload = fetch_law_json(law_id=law_id)
        basic_info = payload["법령"]["기본정보"]
        # `or law_id` also covers a null/empty name, which would otherwise
        # crash on .replace() or yield a file name with a dangling underscore.
        law_name = str(basic_info.get("법령명_한글") or law_id)
        # A path separator inside the name would redirect the write into a
        # subdirectory that does not exist, so neutralise it like a space.
        for unsafe in (" ", "/", "\\"):
            law_name = law_name.replace(unsafe, "_")
        target = out_dir / f"{law_id}_{law_name}.json"
        target.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
        saved_files.append(target)
    return saved_files


# ---- Local ordinance (ordin) collection helpers ----
DEFAULT_ORDIN_CSV = '../../data/법령검색목록_자치법규_건축.csv'


def load_seoul_ordin_ids(csv_path: str = DEFAULT_ORDIN_CSV) -> list[str]:
    """Return the unique ordinance IDs of rows whose 지자체명 contains '서울'.

    The first line of the file is skipped (``skiprows=1``), so the column
    header is expected on the second line.

    Args:
        csv_path: Path to the CSV file, read as UTF-8 with an optional BOM.

    Returns:
        Stripped, non-empty, de-duplicated ID strings in file order.

    Raises:
        ValueError: If the '지자체명' or '자치법규ID' column is missing.
    """
    import pandas as pd

    # Read every column as text: with numeric inference, an ID column that
    # has any blank cell becomes float and IDs turn into e.g. '2000111.0'.
    df = pd.read_csv(csv_path, encoding='utf-8-sig', skiprows=1, dtype=str)
    required = {'지자체명', '자치법규ID'}
    miss = required - set(df.columns)
    if miss:
        raise ValueError(f'CSV 컬럼 누락: {miss}, 현재={list(df.columns)}')

    seoul = df[df['지자체명'].astype(str).str.contains('서울', na=False)]
    # Drop missing IDs BEFORE casting to str; casting first turns NaN into
    # the literal string 'nan', which dropna() can no longer remove.
    ids = seoul['자치법규ID'].dropna().astype(str).str.strip()
    ids = ids[ids != '']
    return ids.drop_duplicates().tolist()


def fetch_ordin_json(ordin_id: str, oc: str | None = None):
    oc = oc or os.getenv('OC', '')
    if not oc:
        raise ValueError('Missing OC in environment (.env)')

    params = {
        'OC': oc,
        'target': 'ordin',
        'ID': str(ordin_id),
        'type': 'JSON',
    }
    r = requests.get(LAW_SERVICE_URL, params=params, timeout=30)
    r.raise_for_status()
    return r.json()


def fetch_and_save_ordin(
    ordin_ids: list[str],
    out_dir: str = 'data/processed/raw/ordin',
):
    """Fetch every ordinance in ``ordin_ids`` and store each as ``ordin_<id>.json``.

    A failure while fetching or writing one ID does not stop the run: the ID
    and the error text are collected instead. When at least one ID failed,
    the collected failures are also written to ``_failed_ordin.json`` inside
    ``out_dir``.

    Args:
        ordin_ids: IDs to download; each is converted with ``str()``.
        out_dir: Directory that receives the files; created if absent.

    Returns:
        A ``(saved, failed)`` tuple: the list of written paths, and a list of
        ``{'ordin_id': ..., 'error': ...}`` dicts for the IDs that failed.
    """
    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    def _dump(obj, destination):
        # Shared JSON writer for both the payloads and the failure report.
        destination.write_text(json.dumps(obj, ensure_ascii=False, indent=2), encoding='utf-8')

    saved, failed = [], []
    for raw_id in ordin_ids:
        ordin_key = str(raw_id)
        try:
            payload = fetch_ordin_json(ordin_key)
            destination = target_dir / f'ordin_{ordin_key}.json'
            _dump(payload, destination)
        except Exception as exc:
            # Best-effort batch: record the problem and move on to the next ID.
            failed.append({'ordin_id': ordin_key, 'error': str(exc)})
        else:
            saved.append(destination)

    if failed:
        _dump(failed, target_dir / '_failed_ordin.json')
    return saved, failed


In [None]:
# Fetch the default laws, then print each saved path, whether the file
# exists, and its size in bytes.
paths = fetch_and_save_laws()
for p in paths:
    print(p, p.exists(), p.stat().st_size)

In [None]:
# STEP ordin-1: extract the Seoul local-ordinance IDs
# (the path passed here is the same string as DEFAULT_ORDIN_CSV).
seoul_ordin_ids = load_seoul_ordin_ids('../../data/법령검색목록_자치법규_건축.csv')
print('서울 자치법규ID 개수:', len(seoul_ordin_ids))
print('샘플:', seoul_ordin_ids[:10])


In [None]:
# STEP ordin-2: save the raw JSON of the Seoul local ordinances
# Start with a limit as a trial run, then rerun with the full list if needed.
limit = 30  # None means all IDs
targets = seoul_ordin_ids if limit is None else seoul_ordin_ids[:limit]

# Per-ID failures are collected in failed_items instead of aborting the run.
saved_paths, failed_items = fetch_and_save_ordin(targets)
print('saved:', len(saved_paths))
print('failed:', len(failed_items))
print('saved sample:', [str(p) for p in saved_paths[:5]])
if failed_items:
    print('failed sample:', failed_items[:3])


In [None]:
# Bare expression: shows the selected ID list as the notebook cell's output.
targets