# 03 Ingest Neo4j (Law->Article->Paragraph + REF)

In [None]:
import json
import os
from pathlib import Path

from dotenv import load_dotenv
from neo4j import GraphDatabase

load_dotenv()

# Location of the graph payload (nodes + edges) that this notebook ingests.
PAYLOAD_PATH = Path('notebooks/graphrag_mvp/data/legal_graph_payload_ref_only.json')
if not PAYLOAD_PATH.exists():
    raise FileNotFoundError(f'missing payload: {PAYLOAD_PATH}')

with PAYLOAD_PATH.open(encoding='utf-8') as _payload_file:
    payload = json.load(_payload_file)

# Missing sections are treated as empty lists.
nodes, edges = payload.get('nodes', []), payload.get('edges', [])
print(f'payload nodes: {len(nodes)} edges: {len(edges)}')

# Connection settings come from the environment (populated by load_dotenv()).
URI = os.environ.get('NEO4J_URI', '').strip()
USER = os.environ.get('NEO4J_USER', 'neo4j').strip()
PWD = os.environ.get('NEO4J_PASSWORD', '').strip()
DB = os.environ.get('NEO4J_DATABASE', 'neo4j').strip()

# URI and password have no usable default, so stop early when either is blank.
for _setting_name, _setting_value in (('NEO4J_URI', URI), ('NEO4J_PASSWORD', PWD)):
    if not _setting_value:
        raise ValueError(f'{_setting_name} is empty')

# Debug helpers (left disabled):
# print('URI =', URI)
# print('USER =', USER)
# print('DB =', DB)


In [None]:
# Create the Neo4j driver from URI / USER / PWD and check that the server is
# reachable before any query is attempted.
driver = GraphDatabase.driver(URI, auth=(USER, PWD))
driver.verify_connectivity()
print('verify_connectivity: OK')


def run_query(cypher: str, **params):
    """Execute one Cypher statement against the database named by ``DB``.

    Args:
        cypher: Cypher text to run.
        **params: Query parameters, referenced as ``$name`` inside ``cypher``.

    Returns:
        A ``(records, summary, keys)`` tuple unpacked from
        ``driver.execute_query``.
    """
    # Hand the parameters over as one mapping instead of spreading them as
    # keyword arguments: the driver reserves keyword names ending in ``_``
    # (such as ``database_``) for its own configuration, so a query parameter
    # with such a name could not be forwarded otherwise.
    records, summary, keys = driver.execute_query(
        cypher,
        parameters_=params,
        database_=DB,
    )
    return records, summary, keys


def ingest_rows_in_batches(rows: list, batch_size: int, cypher: str, label: str) -> None:
    """Run ``cypher`` over ``rows`` in consecutive slices, printing progress.

    Each slice is passed to ``run_query`` as the ``rows`` query parameter.

    Args:
        rows: Sliceable sequence of rows to send.
        batch_size: Maximum number of rows per query; must be at least 1.
        cypher: Cypher statement that consumes the ``$rows`` parameter.
        label: Prefix used in the progress lines.

    Raises:
        ValueError: If ``rows`` is non-empty and ``batch_size`` is below 1.
    """
    total = len(rows)
    if total == 0:
        print(f'{label}: 0 rows (skip)')
        return
    # A zero step makes range() fail with an unrelated-looking message and a
    # negative step yields no batches at all, which would silently skip every
    # row. Reject both explicitly.
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size!r}')

    done = 0
    for start in range(0, total, batch_size):
        batch = rows[start:start + batch_size]
        _, summary, _ = run_query(cypher, rows=batch)
        done += len(batch)
        print(f'{label}: {done}/{total}, time(ms)={summary.result_available_after}')


In [None]:
# Legacy schema cleanup (run only once)
# - prevents conflicts with the former uniqueness constraint on Law.law_id
# - removes old-format Law nodes that have no law_key

# 1) Drop legacy constraints
cons, _, _ = run_query('SHOW CONSTRAINTS YIELD name, labelsOrTypes, properties RETURN name, labelsOrTypes, properties')
legacy = []
for c in cons:
    labels = c.get('labelsOrTypes') or []
    props = c.get('properties') or []
    if ('Law' in labels) and ('law_id' in props):
        legacy.append(c['name'])

for name in legacy:
    # The name is spliced into the statement text, so backtick-quote it (and
    # double any embedded backtick). Without quoting, a constraint name that
    # contains characters such as '-' or a space breaks the DROP statement.
    quoted_name = '`' + name.replace('`', '``') + '`'
    run_query(f'DROP CONSTRAINT {quoted_name} IF EXISTS')
    print('dropped legacy constraint:', name)

if not legacy:
    print('no legacy Law(law_id) constraint found')

# 2) Delete old Law nodes created by the previous schema (no law_key)
recs, summary, _ = run_query('MATCH (n:Law) WHERE n.law_key IS NULL DETACH DELETE n RETURN count(n) AS deleted')
print('deleted legacy Law nodes:', recs[0]['deleted'], 'time(ms)=', summary.result_available_after)


In [None]:
# Ensure one uniqueness constraint per node label on its `<label>_key` property
# (law_key, article_key, paragraph_key).
for node_label in ('Law', 'Article', 'Paragraph'):
    key_prop = f'{node_label.lower()}_key'
    constraint_cypher = (
        f'CREATE CONSTRAINT {key_prop}_unique IF NOT EXISTS '
        f'FOR (n:{node_label}) REQUIRE n.{key_prop} IS UNIQUE'
    )
    _, summary, _ = run_query(constraint_cypher)
    print('constraint ok, time(ms)=', summary.result_available_after)


In [None]:
# Bucket the property dicts of payload nodes by label. Nodes with any other
# label, or whose 'props' is not a dict, are left out.
by_label = {'Law': [], 'Article': [], 'Paragraph': []}
for node in nodes:
    bucket = by_label.get(node.get('label'))
    node_props = node.get('props', {})
    if bucket is None or not isinstance(node_props, dict):
        continue
    bucket.append(node_props)

for node_label, prop_rows in by_label.items():
    print(node_label, len(prop_rows))


In [None]:
# Number of rows sent per query.
BATCH = 500


def _node_upsert_cypher(label, key_prop, copied_props, extra_assignments=()):
    """Build the UNWIND/MERGE/SET statement that upserts nodes of ``label``.

    Nodes are merged on ``key_prop``; every name in ``copied_props`` is copied
    from the row onto the node, followed by any ``extra_assignments`` verbatim.
    """
    assignments = [f'n.{prop} = r.{prop}' for prop in copied_props]
    assignments.extend(extra_assignments)
    return (
        'UNWIND $rows AS r '
        f'MERGE (n:{label} {{{key_prop}: r.{key_prop}}}) '
        'SET ' + ', '.join(assignments)
    )


LAW_CYPHER = _node_upsert_cypher(
    'Law',
    'law_key',
    ('law_id', 'law_name', 'law_type'),
)

ARTICLE_CYPHER = _node_upsert_cypher(
    'Article',
    'article_key',
    (
        'law_key', 'law_id', 'law_name',
        'article_num', 'article_sub',
        'article_title', 'content',
    ),
)

PARA_CYPHER = _node_upsert_cypher(
    'Paragraph',
    'paragraph_key',
    (
        'law_key', 'law_id', 'law_name',
        'article_num', 'article_sub',
        'paragraph_num', 'content',
    ),
    # Rows without the flag are stored as false.
    extra_assignments=('n.is_ref_placeholder = coalesce(r.is_ref_placeholder, false)',),
)

# Upsert the nodes label by label: Law, then Article, then Paragraph.
for node_label, node_cypher in (
    ('Law', LAW_CYPHER),
    ('Article', ARTICLE_CYPHER),
    ('Paragraph', PARA_CYPHER),
):
    ingest_rows_in_batches(by_label[node_label], BATCH, node_cypher, node_label)


In [None]:
# Group payload edges by their 'type' field; edges without one go under ''.
edge_by_type = {}
for edge in edges:
    edge_type = edge.get('type', '')
    if edge_type not in edge_by_type:
        edge_by_type[edge_type] = []
    edge_by_type[edge_type].append(edge)

print({edge_type: len(group) for edge_type, group in edge_by_type.items()})


In [None]:
def parse_node(node_id: str) -> tuple:
    """Split a payload node id into ``(label, key)``.

    The prefixes ``LAW:``, ``ART:`` and ``PARA:`` map to the labels ``Law``,
    ``Article`` and ``Paragraph``; the key is whatever follows the prefix.

    Args:
        node_id: Node id as found in an edge's ``from`` / ``to`` field.

    Returns:
        ``(label, key)``, or ``('', '')`` when the id has no known prefix or
        is not a string at all.
    """
    # An edge whose endpoint is present but null arrives here as None; treat
    # it like any other unrecognised id instead of failing on .startswith().
    if not isinstance(node_id, str):
        return '', ''
    for prefix, label in (('LAW:', 'Law'), ('ART:', 'Article'), ('PARA:', 'Paragraph')):
        if node_id.startswith(prefix):
            return label, node_id[len(prefix):]
    return '', ''


def _edge_endpoints(edge):
    """Return ``((src_label, src_key), (dst_label, dst_key))`` for a payload edge."""
    return parse_node(edge.get('from', '')), parse_node(edge.get('to', ''))


# HAS_ARTICLE: only Law -> Article edges are kept.
ha_rows = [
    {'law_key': src_key, 'article_key': dst_key}
    for (src_label, src_key), (dst_label, dst_key)
    in map(_edge_endpoints, edge_by_type.get('HAS_ARTICLE', []))
    if src_label == 'Law' and dst_label == 'Article'
]

# HAS_PARAGRAPH: only Article -> Paragraph edges are kept.
hp_rows = [
    {'article_key': src_key, 'paragraph_key': dst_key}
    for (src_label, src_key), (dst_label, dst_key)
    in map(_edge_endpoints, edge_by_type.get('HAS_PARAGRAPH', []))
    if src_label == 'Article' and dst_label == 'Paragraph'
]

# REF: both endpoint labels travel with the row, since they vary per edge.
ref_rows = []
for edge in edge_by_type.get('REF', []):
    (src_label, src_key), (dst_label, dst_key) = _edge_endpoints(edge)
    edge_props = edge.get('props', {}) or {}
    # Skip edges whose source or target id could not be parsed.
    if not (src_label and dst_label):
        continue
    row = {
        'src_label': src_label,
        'src_key': src_key,
        'dst_label': dst_label,
        'dst_key': dst_key,
    }
    # Relationship properties are stringified; absent ones become ''.
    for field in ('scope', 'raw', 'item', 'target_level'):
        row[field] = str(edge_props.get(field, ''))
    ref_rows.append(row)

print('ha_rows', len(ha_rows), 'hp_rows', len(hp_rows), 'ref_rows', len(ref_rows))


In [None]:
# Link each Law to its Articles. Rows whose endpoints do not exist match
# nothing and therefore create nothing.
HAS_ARTICLE_CYPHER = (
    'UNWIND $rows AS r '
    'MATCH (a:Law {law_key:r.law_key}) '
    'MATCH (b:Article {article_key:r.article_key}) '
    'MERGE (a)-[:HAS_ARTICLE]->(b)'
)
# Link each Article to its Paragraphs.
HAS_PARAGRAPH_CYPHER = (
    'UNWIND $rows AS r '
    'MATCH (a:Article {article_key:r.article_key}) '
    'MATCH (b:Paragraph {paragraph_key:r.paragraph_key}) '
    'MERGE (a)-[:HAS_PARAGRAPH]->(b)'
)

# Key property used to look up a node of each label.
_REF_KEY_PROPS = {
    'Law': 'law_key',
    'Article': 'article_key',
    'Paragraph': 'paragraph_key',
}


def _ref_branch(src_label, dst_label):
    """Build the subquery branch that merges a REF for one (source, target) label pair."""
    src_prop = _REF_KEY_PROPS[src_label]
    dst_prop = _REF_KEY_PROPS[dst_label]
    return (
        '  WITH r '
        f'  MATCH (a:{src_label} {{{src_prop}:r.src_key}}) WHERE r.src_label = "{src_label}" '
        f'  MATCH (b:{dst_label} {{{dst_prop}:r.dst_key}}) WHERE r.dst_label = "{dst_label}" '
        '  MERGE (a)-[x:REF]->(b) '
        '  SET x.scope=r.scope, x.raw=r.raw, x.item=r.item, x.target_level=r.target_level '
        '  RETURN 1 AS done '
    )


# REF rows carry their endpoint labels as data, so the statement needs one
# UNION branch per label pair. The branches are generated for every
# combination of Law / Article / Paragraph: a row whose pair has no branch
# would match nothing and be dropped without any error, which is what
# happened to Paragraph-sourced and Law -> Paragraph references when only
# five pairs were spelled out by hand.
REF_CYPHER = (
    'UNWIND $rows AS r '
    'CALL { '
    + '  UNION '.join(
        _ref_branch(src_label, dst_label)
        for src_label in _REF_KEY_PROPS
        for dst_label in _REF_KEY_PROPS
    )
    + '} RETURN count(*) AS c'
)

# Create relationships in order: HAS_ARTICLE, HAS_PARAGRAPH, then REF.
for rel_rows, rel_cypher, rel_label in (
    (ha_rows, HAS_ARTICLE_CYPHER, 'HAS_ARTICLE'),
    (hp_rows, HAS_PARAGRAPH_CYPHER, 'HAS_PARAGRAPH'),
    (ref_rows, REF_CYPHER, 'REF'),
):
    ingest_rows_in_batches(rel_rows, BATCH, rel_cypher, rel_label)


In [None]:
# Sanity check: node counts per label and the total number of REF relationships.
COUNT_QUERIES = (
    'MATCH (n:Law) RETURN count(n) AS c',
    'MATCH (n:Article) RETURN count(n) AS c',
    'MATCH (n:Paragraph) RETURN count(n) AS c',
    'MATCH ()-[r:REF]->() RETURN count(r) AS c',
)
for count_query in COUNT_QUERIES:
    count_records, count_summary, _ = run_query(count_query)
    print(count_query, '=>', count_records[0]['c'], 'time(ms)=', count_summary.result_available_after)

# Print up to 20 REF relationships for a visual spot check.
sample_records, _, _ = run_query('MATCH (a)-[r:REF]->(b) RETURN labels(a) AS src_label, labels(b) AS dst_label, r.target_level AS target_level, r.raw AS raw LIMIT 20')
for sample in sample_records:
    print(sample.data())


In [None]:
# REF target_level backfill (in case the value is missing)
# When a REF has no usable target_level, derive it from the target node's label.
# The empty string counts as missing too: the REF rows built in this notebook
# carry '' (not null) when the payload edge has no target_level, so an
# IS NULL check alone would never repair those relationships.

records, summary, _ = run_query(
    "MATCH (a)-[r:REF]->(b) "
    "WHERE r.target_level IS NULL OR r.target_level = '' "
    "WITH r, labels(b) AS lb "
    "SET r.target_level = CASE "
    "  WHEN 'Paragraph' IN lb THEN 'paragraph' "
    "  WHEN 'Article' IN lb THEN 'article' "
    "  WHEN 'Law' IN lb THEN 'law' "
    "  ELSE 'unknown' END "
    "RETURN count(r) AS updated"
)
print('target_level backfilled:', records[0]['updated'], 'time(ms)=', summary.result_available_after)

# Coverage check: how many REF relationships have a target_level property set.
records, _, _ = run_query('MATCH ()-[r:REF]->() RETURN count(r) AS total_ref, count(r.target_level) AS with_target_level')
print('ref target_level coverage:', records[0])


In [None]:
# Close the Neo4j driver; no query can be run through it afterwards.
driver.close()
print('driver closed')
