# 04. [별표 1] JSON 구축 (Self-contained)

In [None]:
import json
import re
from pathlib import Path

try:
    import pdfplumber
except Exception:
    pdfplumber = None

# Input PDF of the appendix and the destination of the generated JSON file.
PDF_PATH = '../../data/[별표 1] 용도별 건축물의 종류(제3조의5 관련)(건축법 시행령).pdf'
OUT_PATH = 'data/processed/appendix1_terms.json'

# Hand-written entries. build_appendix1_json keeps these in preference to any
# parsed entry that shares the same (category, subcategory) key.
SEED_TERMS = [
    {
        "category": "문화 및 집회시설",
        "subcategory": "공연장",
        "aliases": ["문화시설", "집회시설"],
        "description": "공연, 집회, 관람 시설",
        "source_clause": "건축법 시행령 [별표 1]",
    },
    {
        "category": "문화 및 집회시설",
        "subcategory": "집회장",
        "aliases": ["전시장", "회의장"],
        "description": "회의/전시/행사 시설",
        "source_clause": "건축법 시행령 [별표 1]",
    },
]


def normalize_line(line: str) -> str:
    """Collapse each run of whitespace to one space and trim both ends."""
    single_spaced = re.sub(r"\s+", " ", line)
    return single_spaced.strip()


def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of all pages of the PDF, joined with newlines.

    Returns an empty string when ``pdfplumber`` is unavailable (the
    module-level name is falsy). Pages whose ``extract_text()`` result is
    empty or ``None`` are left out of the joined output.
    """
    if not pdfplumber:
        return ""
    with pdfplumber.open(pdf_path) as pdf:
        # Extract while the document is still open.
        per_page = [page.extract_text() for page in pdf.pages]
    # filter(None, ...) drops falsy results, i.e. pages with no text.
    return "\n".join(filter(None, per_page))


# Optional leading list marker ("1.", "가.", "-"), optional whitespace, then
# the remaining text of the line. Compiled once instead of on every line.
_LIST_MARKER_RE = re.compile(r"^(\d+\.|[가-힣]\.|\-)?\s*(.+)$")


def parse_terms(text: str) -> list:
    """Parse appendix text into term records grouped under category headings.

    Each line is whitespace-normalized; blank lines are skipped. A line that
    contains "시설" and is shorter than 40 characters becomes the current
    category. Every other line, once a category has been seen, becomes one
    term record if its text (list marker removed) is longer than two
    characters.

    The leading list marker is removed from category headings as well as from
    item lines, so a heading such as "5. 문화 및 집회시설" yields the category
    "문화 및 집회시설". Previously the marker was kept on headings only, which
    made parsed categories differ from the unnumbered category names used in
    SEED_TERMS.

    Args:
        text: Raw text of the appendix; may be empty.

    Returns:
        A list of dicts with the keys ``category``, ``subcategory`` (first 50
        characters of the item text), ``aliases`` (always an empty list),
        ``description`` (full item text) and ``source_clause``.
    """
    terms = []
    current_category = ""
    for raw in text.splitlines():
        line = normalize_line(raw)
        if not line:
            continue
        m = _LIST_MARKER_RE.match(line)
        # A non-empty normalized line always matches; fall back to the whole
        # line defensively.
        body = m.group(2) if m else line
        # NOTE(review): this heuristic also treats short item lines that
        # mention "시설" as headings - confirm against the real PDF text.
        if "시설" in line and len(line) < 40:
            current_category = body
            continue
        if current_category and len(body) > 2:
            terms.append({
                "category": current_category,
                "subcategory": body[:50],
                "aliases": [],
                "description": body,
                "source_clause": "건축법 시행령 [별표 1]",
            })
    return terms


def build_appendix1_json(pdf_path=PDF_PATH, out_path=OUT_PATH):
    """Build the appendix term list and write it to ``out_path`` as JSON.

    Terms parsed from the PDF text are combined with SEED_TERMS. Entries are
    keyed by ``(category, subcategory)``; when a parsed entry shares a key
    with a seed entry, the seed entry is kept.

    Args:
        pdf_path: Path of the PDF to read.
        out_path: Path of the JSON file to write; missing parent directories
            are created.

    Returns:
        The output location as a ``pathlib.Path``.
    """
    parsed_terms = parse_terms(extract_text_from_pdf(pdf_path))

    # Seeds go in first so they take precedence over parsed duplicates.
    by_key = {}
    for seed in SEED_TERMS:
        by_key[(seed['category'], seed['subcategory'])] = seed
    for term in parsed_terms:
        key = (term['category'], term['subcategory'])
        if key not in by_key:
            by_key[key] = term

    payload = {
        "source": "건축법 시행령 [별표 1]",
        "version": "mvp",
        "terms": list(by_key.values()),
    }

    target = Path(out_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False keeps the Korean text readable in the file.
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    target.write_text(serialized, encoding='utf-8')
    return target


In [None]:
# Build the JSON file with the default paths and print where it was written.
out = build_appendix1_json()
print(out)
# Read the file back from disk and parse it, confirming it is valid JSON.
data = json.loads(Path(out).read_text(encoding='utf-8'))
print('term count:', len(data['terms']))
# Preview the first two term records.
print(data['terms'][:2])


In [None]:
# Bare expression: as the last statement of a notebook cell it displays the
# full list of term records loaded above.
data['terms']