# 01 Load Sources
GraphRAG 실험에 필요한 입력 데이터(chunks/ref/abbr/appendix)를 확인합니다.

In [None]:
import json
from pathlib import Path


def resolve_processed_root() -> Path:
    """Locate the ``research_mvp`` processed-data directory.

    Several paths relative to the current working directory are probed in
    priority order (presumably so the notebook works whether it is launched
    from the repository root, a sibling notebook folder, or the
    ``notebooks`` folder itself -- confirm against the project layout).

    Returns:
        The first candidate that is an existing directory, as the relative
        ``Path`` listed below (it is not resolved to an absolute path).

    Raises:
        FileNotFoundError: If none of the candidates is a directory.
    """
    candidates = (
        Path('notebooks/research_mvp/data/processed'),
        Path('../research_mvp/data/processed'),
        Path('research_mvp/data/processed'),
    )
    for candidate in candidates:
        # is_dir() rather than exists(): a stray regular file at one of these
        # locations cannot serve as a data root and must not be returned.
        if candidate.is_dir():
            return candidate
    raise FileNotFoundError('processed root not found')


# Resolve the input root first; the output directory is only created once
# the inputs have been found.
PROCESSED = resolve_processed_root()

OUT_DIR = Path('notebooks/graphrag_mvp/data')
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Echo both locations so the cell output records which paths were used.
print(f'PROCESSED = {PROCESSED}')
print(f'OUT_DIR = {OUT_DIR}')


In [None]:
def pick_chunks_path(root: Path) -> Path:
    """Return the first chunks JSON file found under *root*.

    Candidates are tried in this order:

    1. ``chunks_with_refs/all_chunks_with_refs.json``
    2. ``chunks_enriched/all_chunks_enriched.json``
    3. ``chunks/001823_건축법_chunks.json``

    Args:
        root: Directory that contains the chunk sub-directories.

    Returns:
        The path of the first candidate that is an existing regular file.

    Raises:
        FileNotFoundError: If none of the candidate files exists.
    """
    candidates = (
        root / 'chunks_with_refs' / 'all_chunks_with_refs.json',
        root / 'chunks_enriched' / 'all_chunks_enriched.json',
        root / 'chunks' / '001823_건축법_chunks.json',
    )
    for candidate in candidates:
        # is_file() rather than exists(): a directory with the same name
        # cannot be read as JSON, so fall through to the next candidate.
        if candidate.is_file():
            return candidate
    raise FileNotFoundError('no chunks json found')


# The chunks file is required (pick_chunks_path raises when none is found);
# the abbreviation map and appendix terms are only reported on here.
chunks_path = pick_chunks_path(PROCESSED)
abbr_path = PROCESSED.joinpath('abbr_maps_by_chunk.json')
appendix_path = PROCESSED.joinpath('appendix1_terms.json')

print(f'chunks_path = {chunks_path}')
print(f'abbr_path exists = {abbr_path.exists()}')
print(f'appendix_path exists = {appendix_path.exists()}')


In [None]:
# Load the chunks; when the top-level JSON value is an object, keep only its
# values so that `chunks` is a list.
chunks = json.loads(chunks_path.read_text(encoding='utf-8'))
chunks = list(chunks.values()) if isinstance(chunks, dict) else chunks

# Optional inputs: fall back to an empty container when the file is absent.
abbr_maps = (
    json.loads(abbr_path.read_text(encoding='utf-8'))
    if abbr_path.exists()
    else {}
)
appendix_terms = (
    json.loads(appendix_path.read_text(encoding='utf-8'))
    if appendix_path.exists()
    else []
)

print(f'chunks: {len(chunks)}')
print(f'abbr_maps: {len(abbr_maps)}')
print(f'appendix_terms: {len(appendix_terms)}')


In [None]:
# Quick sanity check: print identifying fields and reference counts for the
# first three chunks.
for i, c in enumerate(chunks[:3], 1):
    print(i, c.get('law_id'), c.get('law_name'), c.get('article_num'), c.get('article_sub', '0'))
    # `or []` covers both a missing key and a key stored as JSON null;
    # .get(key, []) alone returns None for the latter and len() would raise.
    print('  refs:', len(c.get('internal_refs') or []), len(c.get('external_refs') or []))


In [None]:
# Record which chunks file was used and how many items each source holds.
snapshot = {
    'chunks_path': str(chunks_path),
    'chunks_count': len(chunks),
    'abbr_count': len(abbr_maps),
    'appendix_count': len(appendix_terms),
}
# Build the target from OUT_DIR (created in the first cell) instead of
# repeating the directory literal, so the write location and the printed
# message cannot drift apart.
snapshot_path = OUT_DIR / 'source_snapshot.json'
snapshot_path.write_text(json.dumps(snapshot, ensure_ascii=False, indent=2), encoding='utf-8')
print('saved:', snapshot_path)
