# 02 Build Graph Payload (Hierarchical Nodes)

- Source scope: `001823(건축법)`, `002118(건축법 시행령)`
- Node hierarchy: `Law -> Article -> Paragraph`
- Reference edge: `REF` (target can be Law / Article / Paragraph)

In [None]:
import json
from pathlib import Path

SOURCE_LAW_IDS = {'001823', '002118'}


def resolve_processed_root() -> Path:
    """Return the first existing processed-data directory.

    The candidate paths are tried in a fixed order and are relative to the
    current working directory.

    Raises:
        FileNotFoundError: if none of the candidate paths exists.
    """
    search_order = (
        'notebooks/research_mvp/data/processed',
        '../research_mvp/data/processed',
        'research_mvp/data/processed',
    )
    existing = (Path(p) for p in search_order if Path(p).exists())
    found = next(existing, None)
    if found is None:
        raise FileNotFoundError('processed root not found')
    return found


# Locate the processed-data directory for whichever working directory the
# notebook was started from.
PROCESSED = resolve_processed_root()

# Anchor the output directory to the same folder that contains `research_mvp`.
# PROCESSED always ends in `research_mvp/data/processed`, so `parents[2]` is
# that containing folder. When running from the repository root this is
# exactly `notebooks/graphrag_mvp/data`; from the other supported working
# directories it avoids creating a stray nested `notebooks/graphrag_mvp/data`.
OUT_DIR = PROCESSED.parents[2] / 'graphrag_mvp' / 'data'
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Input locations: per-law chunk files and per-law reference maps.
CHUNKS_DIR = PROCESSED / 'chunks'
REF_MAP_DIR = PROCESSED / 'chunks_golden' / 'chunk_ref_map'

# Echo the resolved paths (and whether the inputs exist) for a quick check.
print('PROCESSED =', PROCESSED)
print('CHUNKS_DIR =', CHUNKS_DIR, CHUNKS_DIR.exists())
print('REF_MAP_DIR =', REF_MAP_DIR, REF_MAP_DIR.exists())


In [None]:
def load_source_chunks(chunks_dir: Path, source_law_ids: set[str]):
    """Collect the chunk records that belong to the given source laws.

    Every ``*_chunks.json`` file directly under ``chunks_dir`` is read in
    sorted filename order. A record is kept when the stripped string form of
    its ``law_id`` is a member of ``source_law_ids``.

    Files whose top-level JSON value is not a list, and list entries that are
    not dicts, are skipped instead of raising, matching how malformed entries
    are handled elsewhere in this notebook.

    Args:
        chunks_dir: Directory containing the ``*_chunks.json`` files.
        source_law_ids: Law ids to keep.

    Returns:
        A list of the matching record dicts, in file order then record order.
    """
    rows = []
    for chunk_file in sorted(chunks_dir.glob('*_chunks.json')):
        payload = json.loads(chunk_file.read_text(encoding='utf-8'))
        if not isinstance(payload, list):
            continue
        for record in payload:
            if not isinstance(record, dict):
                continue
            law_id = str(record.get('law_id', '')).strip()
            if law_id in source_law_ids:
                rows.append(record)
    return rows


def load_ref_map_for_sources(ref_map_dir: Path, source_law_ids: set[str]):
    """Merge the per-law reference-map files of the given source laws.

    For each law id, ``<ref_map_dir>/<law_id>.json`` is read if it exists and
    merged into one dict when its top-level JSON value is a dict. Missing
    files and non-dict payloads are ignored.

    Law ids are processed in sorted order so the merge result (key order, and
    which value wins if two files share a key) does not depend on set
    iteration order.

    Args:
        ref_map_dir: Directory containing the ``<law_id>.json`` files.
        source_law_ids: Law ids whose reference maps should be loaded.

    Returns:
        A single dict containing the entries of all loaded reference maps.
    """
    out = {}
    for law_id in sorted(source_law_ids):
        p = ref_map_dir / f'{law_id}.json'
        if not p.exists():
            continue
        obj = json.loads(p.read_text(encoding='utf-8'))
        if isinstance(obj, dict):
            out.update(obj)
    return out


# Load only the chunk records and reference-map entries of the source laws.
chunks = load_source_chunks(CHUNKS_DIR, SOURCE_LAW_IDS)
ref_map = load_ref_map_for_sources(REF_MAP_DIR, SOURCE_LAW_IDS)

# Report how much data was picked up.
print('source chunks:', len(chunks))
print('ref map keys:', len(ref_map))


In [None]:
# Circled numerals -> decimal strings.
# Covers the three Unicode runs of circled numbers:
#   U+2460..U+2473 (1-20), U+3251..U+325F (21-35), U+32B1..U+32BF (36-50).
CIRCLED_NUM_MAP = {
    chr(first_cp + offset): str(first_value + offset)
    for first_cp, first_value, count in (
        (0x2460, 1, 20),
        (0x3251, 21, 15),
        (0x32B1, 36, 15),
    )
    for offset in range(count)
}


def norm_para_num(v) -> str:
    """Normalize a paragraph number to a plain decimal string.

    Rules, in order:
      * a falsy or whitespace-only value becomes ``'0'``;
      * a value that is exactly one circled numeral is mapped through
        ``CIRCLED_NUM_MAP``;
      * otherwise the digits found in the string are concatenated, with any
        embedded circled numeral converted to its decimal form;
      * if no digit is found, the stripped string is returned unchanged.

    Circled numerals must be converted before the digit scan because
    ``str.isdigit()`` is true for the circled numerals 1-9 and false for the
    ones from 10 upward, so without the conversion they would either leak
    through as-is or be dropped.
    """
    s = str(v or '').strip()
    if not s:
        return '0'
    if s in CIRCLED_NUM_MAP:
        return CIRCLED_NUM_MAP[s]

    parts = []
    for ch in s:
        if ch in CIRCLED_NUM_MAP:
            parts.append(CIRCLED_NUM_MAP[ch])
        elif ch.isdigit():
            parts.append(ch)
    digits = ''.join(parts)
    return digits if digits else s


def normalize_text(v) -> str:
    """Return ``v`` as a stripped string; any falsy value becomes ``''``."""
    if not v:
        return ''
    return str(v).strip()


def normalize_name_key(name: str) -> str:
    """Build a stable synthetic key for a referenced law that has no law_id.

    The name is stripped, lower-cased and its spaces are replaced with
    underscores, then prefixed with ``name::``. An empty name yields ``''``.
    """
    cleaned = normalize_text(name)
    return ('name::' + cleaned.lower().replace(' ', '_')) if cleaned else ''


def make_article_key(law_key: str, article_num: str, article_sub: str = '0') -> str:
    """Return the article key ``<law_key>:<article_num>:<article_sub>``.

    ``article_num`` and ``article_sub`` are normalized with
    ``normalize_text``; an empty ``article_sub`` is written as ``'0'``.
    """
    num = normalize_text(article_num)
    sub = normalize_text(article_sub)
    if not sub:
        sub = '0'
    return f"{law_key}:{num}:{sub}"


def make_paragraph_key(law_key: str, article_num: str, article_sub: str, para_num: str) -> str:
    """Return the paragraph key ``<law_key>:<article_num>:<article_sub>:<para_num>``.

    The article parts are normalized with ``normalize_text`` (an empty
    ``article_sub`` is written as ``'0'``) and the paragraph number with
    ``norm_para_num``.
    """
    segments = [
        f"{law_key}",
        normalize_text(article_num),
        normalize_text(article_sub) or '0',
        norm_para_num(para_num),
    ]
    return ':'.join(segments)


def normalize_ref_articles(value):
    """Return the referenced article numbers as a list of non-empty strings.

    The reference's ``article`` value may be an int, a str or a list; a
    scalar is treated as a one-element list. Entries that normalize to an
    empty string are dropped, and ``None`` yields an empty list.
    """
    if value is None:
        return []
    items = value if isinstance(value, list) else [value]
    return [text for text in map(normalize_text, items) if text]


In [None]:
# In-memory graph store: nodes keyed by node id, edges as an append-only list.
nodes = {}
edges = []


def upsert_node(node_id: str, label: str, props: dict):
    """Create the node, or fill in missing properties of an existing one.

    A new node stores a copy of ``props``. For an existing node the label is
    left untouched and a property is written only when it is absent, ``None``
    or ``''`` on the stored node, so values already present are preserved.
    """
    existing = nodes.get(node_id)
    if existing is None:
        nodes[node_id] = {'id': node_id, 'label': label, 'props': dict(props)}
        return

    current = existing['props']
    for key, value in props.items():
        if current.get(key) in (None, ''):
            current[key] = value


def add_edge(src: str, rel: str, dst: str, props=None):
    """Append a ``src -[rel]-> dst`` edge; falsy ``props`` become an empty dict."""
    edge_props = props if props else {}
    edges.append({'from': src, 'type': rel, 'to': dst, 'props': edge_props})


def ensure_law_node(law_key: str, law_id: str, law_name: str, law_type: str = ''):
    """Upsert the ``Law`` node with id ``LAW:<law_key>``.

    ``law_id``, ``law_name`` and ``law_type`` are normalized with
    ``normalize_text`` before being stored.
    """
    law_props = {'law_key': law_key}
    for field, raw in (('law_id', law_id), ('law_name', law_name), ('law_type', law_type)):
        law_props[field] = normalize_text(raw)
    upsert_node(f'LAW:{law_key}', 'Law', law_props)


def ensure_article_node(law_key: str, law_id: str, law_name: str, article_num: str, article_sub: str = '0', article_title: str = '', content: str = ''):
    """Upsert an ``Article`` node and link it to its law.

    The node id is ``ART:<article_key>``. The ``LAW -[HAS_ARTICLE]-> ART``
    edge is added only when the node is created by this call; repeated calls
    for the same article (for example once per reference that targets it)
    only fill in missing properties and no longer append duplicate
    structural edges.

    Args:
        law_key: Key of the owning law (also used in the ``LAW:`` node id).
        law_id: Law id stored on the node; may be empty when unknown.
        law_name: Law name stored on the node.
        article_num: Article number.
        article_sub: Article sub-number; empty is stored as ``'0'``.
        article_title: Article title; may be empty.
        content: Article text; may be empty.

    Returns:
        The article key (without the ``ART:`` prefix).
    """
    akey = make_article_key(law_key, article_num, article_sub)
    node_id = f'ART:{akey}'
    # Decide before upserting: afterwards the node always exists.
    is_new = node_id not in nodes
    upsert_node(node_id, 'Article', {
        'article_key': akey,
        'law_key': law_key,
        'law_id': normalize_text(law_id),
        'law_name': normalize_text(law_name),
        'article_num': normalize_text(article_num),
        'article_sub': normalize_text(article_sub) or '0',
        'article_title': normalize_text(article_title),
        'content': normalize_text(content),
    })
    if is_new:
        add_edge(f'LAW:{law_key}', 'HAS_ARTICLE', node_id)
    return akey


def ensure_paragraph_node(law_key: str, law_id: str, law_name: str, article_num: str, article_sub: str, para_num: str, content: str = '', is_placeholder: bool = False):
    """Upsert a ``Paragraph`` node and link it to its article.

    The node id is ``PARA:<paragraph_key>``. The
    ``ART -[HAS_PARAGRAPH]-> PARA`` edge is added only when the node is
    created by this call, so repeated references to the same paragraph do not
    append duplicate structural edges. The article node itself is not created
    here; only its id is used as the edge source.

    Args:
        law_key: Key of the owning law.
        law_id: Law id stored on the node; may be empty when unknown.
        law_name: Law name stored on the node.
        article_num: Number of the owning article.
        article_sub: Sub-number of the owning article; empty is stored as ``'0'``.
        para_num: Paragraph number, normalized with ``norm_para_num``.
        content: Paragraph text; may be empty.
        is_placeholder: Stored as ``is_ref_placeholder`` on the node.

    Returns:
        The paragraph key (without the ``PARA:`` prefix).
    """
    pkey = make_paragraph_key(law_key, article_num, article_sub, para_num)
    node_id = f'PARA:{pkey}'
    # Decide before upserting: afterwards the node always exists.
    is_new = node_id not in nodes
    upsert_node(node_id, 'Paragraph', {
        'paragraph_key': pkey,
        'law_key': law_key,
        'law_id': normalize_text(law_id),
        'law_name': normalize_text(law_name),
        'article_num': normalize_text(article_num),
        'article_sub': normalize_text(article_sub) or '0',
        'paragraph_num': norm_para_num(para_num),
        'content': normalize_text(content),
        'is_ref_placeholder': bool(is_placeholder),
    })
    if is_new:
        akey = make_article_key(law_key, article_num, article_sub)
        add_edge(f'ART:{akey}', 'HAS_PARAGRAPH', node_id)
    return pkey


In [None]:
# 1) Build Law / Article / Paragraph nodes from the source-law chunk bodies.
law_key_by_name = {}
law_key_by_id = {}

for c in chunks:
    law_id = normalize_text(c.get('law_id'))
    law_name = normalize_text(c.get('law_name'))
    law_type = normalize_text(c.get('law_type'))
    article_num = normalize_text(c.get('article_num'))
    article_sub = normalize_text(c.get('article_sub')) or '0'

    # Source laws are keyed by their law_id; remember the key under both the
    # id and (when present) the name for later reference resolution.
    law_key = law_id
    law_key_by_id[law_id] = law_key
    if law_name:
        law_key_by_name[law_name] = law_key

    ensure_law_node(law_key, law_id, law_name, law_type)

    # Keyword arguments shared by the article node and all of its paragraphs.
    location = {
        'law_key': law_key,
        'law_id': law_id,
        'law_name': law_name,
        'article_num': article_num,
        'article_sub': article_sub,
    }
    ensure_article_node(
        **location,
        article_title=normalize_text(c.get('article_title')),
        content=normalize_text(c.get('content')),
    )

    paras = c.get('paragraphs', [])
    if not (isinstance(paras, list) and paras):
        # No paragraph list: store the whole article body as paragraph '0'.
        ensure_paragraph_node(
            **location,
            para_num='0',
            content=normalize_text(c.get('content')),
            is_placeholder=False,
        )
        continue

    for p in paras:
        if isinstance(p, dict):
            ensure_paragraph_node(
                **location,
                para_num=normalize_text(p.get('num')) or '0',
                content=normalize_text(p.get('content')),
                is_placeholder=False,
            )

print('after source graph -> nodes:', len(nodes), 'edges:', len(edges))
print('known laws by id:', len(law_key_by_id), 'by name:', len(law_key_by_name))


In [None]:
# 2) Build REF edges (target level: Law / Article / Paragraph)

def pick_law_key_from_ref(ref_law_name: str, src_law_key: str):
    """Resolve the law a reference points at.

    Returns a ``(law_key, known_law_id, law_name)`` tuple:
      * empty name -> ``(src_law_key, '', '')``, i.e. the source law itself;
      * name registered in ``law_key_by_name`` -> ``(key, key, name)``;
      * any other name -> a synthetic key from ``normalize_name_key`` with an
        empty law id, because the law lies outside the source scope.
    """
    name = normalize_text(ref_law_name)
    if not name:
        return src_law_key, '', ''
    try:
        key = law_key_by_name[name]
    except KeyError:
        # Law outside the source scope -> generate a synthetic law_key.
        return normalize_name_key(name), '', name
    return key, key, name


# Walk every source article and emit one REF edge per reference target.
# The edge always starts at the source Article node; its target is a Law,
# Article or Paragraph node depending on how specific the reference is.
# Targets that are not already in the graph are created with empty content.
ref_edges = 0
for c in chunks:
    src_law_id = normalize_text(c.get('law_id'))
    src_law_key = src_law_id  # source laws are keyed by their law_id
    src_law_name = normalize_text(c.get('law_name'))
    src_article = normalize_text(c.get('article_num'))
    src_sub = normalize_text(c.get('article_sub')) or '0'
    src_art_key = make_article_key(src_law_key, src_article, src_sub)
    src_node_id = f'ART:{src_art_key}'

    # The reference map is looked up by the source article key.
    refs = ref_map.get(src_art_key, {}) or {}
    if not isinstance(refs, dict):
        # Malformed entry: treat as "no references" instead of crashing.
        refs = {}
    internal_refs = refs.get('internal_refs', [])
    if not isinstance(internal_refs, list):
        internal_refs = []
    external_refs = refs.get('external_refs', [])
    if not isinstance(external_refs, list):
        external_refs = []

    for scope, ref_list in [('internal', internal_refs), ('external', external_refs)]:
        for r in ref_list:
            if not isinstance(r, dict):
                continue

            law_key, target_law_id_known, target_law_name = pick_law_key_from_ref(r.get('law_name', ''), src_law_key)
            if not target_law_name:
                target_law_name = src_law_name
            if not target_law_id_known and law_key == src_law_key:
                # A reference without a law name targets the source law, whose
                # id is known. Without this, placeholder Article/Paragraph
                # nodes of the same law were created with an empty law_id.
                target_law_id_known = src_law_id

            # Make sure the target law node exists.
            ensure_law_node(
                law_key=law_key,
                law_id=target_law_id_known,
                law_name=target_law_name,
                law_type='',
            )

            article_candidates = normalize_ref_articles(r.get('article'))
            para_val = normalize_text(r.get('paragraph')) or '0'
            target_sub = normalize_text(r.get('article_sub')) or '0'

            # Properties shared by every edge produced from this reference;
            # each edge gets its own dict with 'target_level' appended.
            edge_props = {
                'scope': scope,
                'raw': normalize_text(r.get('raw')),
                'item': normalize_text(r.get('item')),
            }

            if not article_candidates:
                # Law-only reference -> target: Law
                add_edge(
                    src_node_id,
                    'REF',
                    f'LAW:{law_key}',
                    {**edge_props, 'target_level': 'law'},
                )
                ref_edges += 1
                continue

            for art in article_candidates:
                # Make sure the target article node exists (empty content
                # when the article body is not available).
                tgt_art_key = ensure_article_node(
                    law_key=law_key,
                    law_id=target_law_id_known,
                    law_name=target_law_name,
                    article_num=art,
                    article_sub=target_sub,
                    article_title='',
                    content='',
                )

                if para_val != '0':
                    # Paragraph-level reference
                    tgt_para_key = ensure_paragraph_node(
                        law_key=law_key,
                        law_id=target_law_id_known,
                        law_name=target_law_name,
                        article_num=art,
                        article_sub=target_sub,
                        para_num=para_val,
                        content='',
                        is_placeholder=True,
                    )
                    add_edge(
                        src_node_id,
                        'REF',
                        f'PARA:{tgt_para_key}',
                        {**edge_props, 'target_level': 'paragraph'},
                    )
                else:
                    # Article-level reference
                    add_edge(
                        src_node_id,
                        'REF',
                        f'ART:{tgt_art_key}',
                        {**edge_props, 'target_level': 'article'},
                    )
                ref_edges += 1

print('ref edges added:', ref_edges)
print('final nodes:', len(nodes), 'final edges:', len(edges))


In [None]:
# Assemble the final payload: static metadata, the node and edge lists built
# above, and simple size statistics.
graph_payload = {
    'meta': {
        'source_law_ids': sorted(SOURCE_LAW_IDS),
        'node_labels': ['Law', 'Article', 'Paragraph'],
        'edge_types': ['HAS_ARTICLE', 'HAS_PARAGRAPH', 'REF'],
    },
    'nodes': list(nodes.values()),
    'edges': edges,
    'stats': {
        'node_count': len(nodes),
        'edge_count': len(edges),
    }
}

# Write the payload as UTF-8 JSON; ensure_ascii=False keeps non-ASCII text
# (such as Korean law names) as literal characters in the file.
out_path = OUT_DIR / 'legal_graph_payload_ref_only.json'
out_path.write_text(json.dumps(graph_payload, ensure_ascii=False, indent=2), encoding='utf-8')
print('saved:', out_path)
print('stats:', graph_payload['stats'])
