## Qdrant Cloud 연결 설정

로컬 대신 클라우드를 쓰려면 아래 셀을 실행하기 전에 환경변수 `QDRANT_URL`(dashboard 주소가 아닌 API endpoint)과 `QDRANT_API_KEY`를 먼저 설정하세요.

In [None]:
import os



# 06. 에이전트 실행 (Self-contained)

그래프/툴 정의를 노트북 셀에서 직접 작성해 실행합니다.

In [None]:
import json
import re
import uuid
from pathlib import Path

from langchain_naver import ChatClovaX, ClovaXEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import FieldCondition, Filter, MatchValue
from qdrant_client.http.exceptions import UnexpectedResponse

try:
    from langgraph.graph import END, StateGraph
except Exception:
    END = None
    StateGraph = None


def load_appendix_terms(path='data/processed/appendix1_terms.json'):
    """Load the appendix term list from a UTF-8 encoded JSON file.

    Args:
        path: Location of the JSON document. Its top level is read with
            ``.get``, so it is expected to be a JSON object.

    Returns:
        The value stored under the ``'terms'`` key, or an empty list when the
        key is absent.
    """
    raw_json = Path(path).read_text(encoding='utf-8')
    document = json.loads(raw_json)
    return document.get('terms', [])


def lookup_appendix1_term(term_or_query: str, terms):
    """Find appendix terms relevant to a term or free-form query.

    Matching is tiered, and a term lands in at most one tier:

    1. exact - the query is a substring of the category or subcategory;
    2. alias - the query contains an alias, or an alias contains the query;
    3. fuzzy - Jaccard overlap between the query tokens and the tokens of the
       category, subcategory, aliases and description.

    Args:
        term_or_query: Text to look up; compared case-insensitively.
        terms: Iterable of term dicts with optional ``category``,
            ``subcategory``, ``aliases`` and ``description`` keys. ``None`` is
            treated as empty.

    Returns:
        All exact matches, then all alias matches, then at most the five
        best-scoring fuzzy matches.
    """
    q = term_or_query.lower().strip()
    if not q:
        return []

    token_split = re.compile(r'[^0-9A-Za-z가-힣]+')
    q_tokens = {tok for tok in token_split.split(q) if tok}

    exact = []
    alias = []
    fuzzy = []

    for t in terms or []:
        c = str(t.get('category', ''))
        s = str(t.get('subcategory', ''))
        # Blank aliases are dropped: '' is a substring of every query, so a
        # single empty alias would turn the term into a match for anything.
        # A missing/None alias list is treated as "no aliases".
        a = [x for x in map(str, t.get('aliases') or []) if x.strip()]
        d = str(t.get('description', ''))

        if q in c.lower() or q in s.lower():
            exact.append(t)
            continue
        if any(q in x.lower() or x.lower() in q for x in a):
            alias.append(t)
            continue

        doc_text = ' '.join([c, s, ' '.join(a), d]).lower()
        doc_tokens = {tok for tok in token_split.split(doc_text) if tok}
        if q_tokens and doc_tokens:
            score = len(q_tokens & doc_tokens) / len(q_tokens | doc_tokens)
            if score > 0:
                fuzzy.append((score, t))

    # list.sort is stable, so equally scored terms keep their input order.
    fuzzy.sort(key=lambda pair: pair[0], reverse=True)
    return exact + alias + [term for _, term in fuzzy[:5]]


def build_retriever(
    collection='building_law',
    qdrant_path='./qdrant_data',
    qdrant_url=None,
    qdrant_api_key=None,
    prefer_grpc=False,
):
    """Create a Qdrant client and a vector store bound to ``collection``.

    The endpoint and API key come from the arguments first and fall back to
    the ``QDRANT_URL`` / ``QDRANT_API_KEY`` environment variables. A non-empty
    endpoint selects a URL-based client; otherwise a path-based client on
    ``qdrant_path`` is used.

    Args:
        collection: Collection name to check and to bind the store to.
        qdrant_path: Storage path for the path-based client.
        qdrant_url: API endpoint; must not be a ``/dashboard`` URL.
        qdrant_api_key: API key, required whenever an endpoint is used.
        prefer_grpc: Forwarded to the URL-based client.

    Returns:
        A ``(client, store)`` tuple.

    Raises:
        ValueError: The endpoint contains ``/dashboard`` or has no API key.
        UnexpectedResponse: Re-raised after printing hints when the
            collection-exists check fails.
    """
    endpoint = (qdrant_url or os.getenv('QDRANT_URL') or '').strip()
    secret = (qdrant_api_key or os.getenv('QDRANT_API_KEY') or '').strip()

    if not endpoint:
        client = QdrantClient(path=qdrant_path)
        banner = (('Qdrant mode: LOCAL',), ('Qdrant path:', qdrant_path))
    else:
        if '/dashboard' in endpoint:
            raise ValueError('QDRANT_URL은 dashboard URL이 아니라 API endpoint여야 합니다. 예: https://<cluster>.cloud.qdrant.io:6333')
        if not secret:
            raise ValueError('QDRANT_URL 사용 시 QDRANT_API_KEY도 설정해야 합니다.')
        client = QdrantClient(url=endpoint, api_key=secret, prefer_grpc=prefer_grpc)
        banner = (('Qdrant mode: CLOUD',), ('Qdrant url :', endpoint))
    for parts in banner:
        print(*parts)

    # Connectivity check; print troubleshooting hints before re-raising.
    try:
        found = client.collection_exists(collection)
        print(f'collection "{collection}" exists:', found)
    except UnexpectedResponse as err:
        hint_lines = (
            ('Qdrant request failed',),
            ('status/code:', err),
            ('hint:',),
            ('- QDRANT_URL 형식 확인 (https://<cluster>.cloud.qdrant.io:6333)',),
            ('- QDRANT_API_KEY 권한 확인 (읽기/쓰기)',),
            ('- 컬렉션명 확인:', collection),
        )
        for parts in hint_lines:
            print(*parts)
        raise

    vector_store = QdrantVectorStore(
        client=client,
        collection_name=collection,
        embedding=ClovaXEmbeddings(model='bge-m3'),
    )
    return client, vector_store


def search_law_chunks(store, query, k=4):
    """Run a similarity search and return the hits as plain dicts.

    Args:
        store: Object exposing ``similarity_search(query, k=...)`` whose
            results carry ``page_content`` and ``metadata`` attributes.
        query: Search text.
        k: Number of results requested from the store.

    Returns:
        One ``{'content': ..., 'metadata': ...}`` dict per hit, in the order
        the store returned them.
    """
    hits = []
    for doc in store.similarity_search(query, k=k):
        hits.append({'content': doc.page_content, 'metadata': doc.metadata})
    return hits




def parse_article_token(article: str) -> tuple[str, str]:
    """Split an article token such as ``'4'`` or ``'4의2'`` into its parts.

    Args:
        article: Article token; falsy values are treated as an empty string.

    Returns:
        ``(main, sub)``. ``sub`` is ``''`` when there is no ``의N`` suffix.
        When the token is not of the form ``N`` or ``N의M`` the stripped token
        is returned unchanged as ``main`` with an empty ``sub``.
    """
    token = str(article or '').strip()
    matched = re.fullmatch(r'(\d+)(?:의(\d+))?', token)
    if matched is None:
        return token, ''
    main_num, sub_num = matched.groups(default='')
    return main_num or '', sub_num or ''


def make_chunk_key(law_id: str, article_num: str, article_sub: str | int | None = '') -> str:
    lid = str(law_id).strip().zfill(6)
    sub = str(article_sub or '').strip()
    return f"{lid}:{str(article_num).strip()}:{sub if sub else '0'}"


def get_article(client, collection, law_id, article_num):
    """Fetch the stored chunk(s) of one article of one law.

    Lookup happens in two stages:

    1. Direct retrieval by deterministic point id. Each candidate source key
       is hashed with ``uuid5(NAMESPACE_URL, key)``; the first candidate that
       returns points wins.
    2. Fallback: scroll through the collection and filter payloads in memory
       on ``law_id`` / ``article_num`` / ``article_sub``.

    Args:
        client: Qdrant client exposing ``retrieve`` and ``scroll``.
        collection: Collection name.
        law_id: Law identifier; stripped and zero-padded to six characters.
        article_num: Article token such as ``'4'`` or ``'4의2'``.

    Returns:
        A list of ``{'content': ..., 'metadata': payload}`` dicts; empty when
        nothing matches.
    """
    law_id = str(law_id).strip().zfill(6)
    main, sub = parse_article_token(str(article_num))

    candidate_keys = []
    if main:
        candidate_keys.append(make_chunk_key(law_id, main, sub))
        # Backward compatibility with key formats used by older indexed data.
        if sub:
            candidate_keys.append(f"{law_id}:{main}의{sub}")
        candidate_keys.append(f"{law_id}:{main}")
    else:
        candidate_keys.append(f"{law_id}:{str(article_num).strip()}")

    # 1) Retrieve by deterministic point id candidates.
    for source_key in candidate_keys:
        point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, source_key))
        try:
            points = client.retrieve(
                collection_name=collection,
                ids=[point_id],
                with_payload=True,
                with_vectors=False,
            )
        except Exception:
            # Deliberate best effort: a failed id lookup must not prevent the
            # remaining candidates or the scroll fallback from running.
            continue
        if points:
            return [
                {'content': (p.payload or {}).get('content_original', ''), 'metadata': (p.payload or {})}
                for p in points
            ]

    def _norm_sub(value) -> str:
        # '0' is the "no sub-number" marker used in chunk keys; treat it,
        # None and '' as the same thing so plain articles can match.
        text = str(value or '').strip()
        return '' if text == '0' else text

    wanted_sub = _norm_sub(sub)
    wanted_raw = str(article_num).strip()

    # 2) Fallback: scroll and filter in memory (works even without a payload
    #    index). Follow the returned page offset so collections larger than
    #    one page are searched completely.
    out = []
    offset = None
    while True:
        points, offset = client.scroll(
            collection_name=collection,
            limit=5000,
            offset=offset,
            with_payload=True,
            with_vectors=False,
        )
        for p in points:
            payload = p.payload or {}
            pl_law = str(payload.get('law_id', '')).strip().zfill(6)
            if pl_law != law_id:
                continue
            pl_num = str(payload.get('article_num', '')).strip()
            if main:
                matched = pl_num == main and _norm_sub(payload.get('article_sub')) == wanted_sub
            else:
                matched = pl_num == wanted_raw
            if matched:
                out.append({'content': payload.get('content_original', ''), 'metadata': payload})
        if offset is None or not points:
            break

    return out


In [None]:
# (legacy graph helpers removed)
# 아래쪽 cell의 build_reference_graph_data / render_reference_graph 를 사용하세요.


## Step A. 상태 스키마


In [None]:
from typing import TypedDict


class GraphState(TypedDict, total=False):
    """State schema for the agent graph (notebook section "Step A").

    ``total=False`` makes every key optional, so a state dict may carry any
    subset of these fields.
    """

    # Original user question text.
    user_query: str
    # Keywords naming what the question asks for; see extract_targets_with_llm.
    targets: list[str]
    # Per-target dictionaries keyed by target string.
    # NOTE(review): presumably retrieval hits, ref decisions and retrieval
    # debug info for each target; the producers are not visible here.
    target_hits: dict[str, list[dict]]
    target_decisions: dict[str, list[dict]]
    target_retrieve_debug: dict[str, dict]
    hits: list[dict]
    # NOTE(review): presumably references still waiting to be expanded and the
    # keys of references already handled; confirm against the graph nodes.
    pending_refs: list[dict]
    seen_ref_keys: list[str]
    # Collected chunks; llm_need_refs_for_state_context reads 'content' and
    # 'metadata' from each entry.
    contexts: list[dict]
    appendix: list[dict]
    # NOTE(review): presumably the current and maximum reference hop depth.
    hop_count: int
    max_hops: int
    answer: str
    ref_batch_decisions: list[dict]


## Step B. 런타임 초기화


In [None]:

# Default model settings per LLM role. get_runtime_llm falls back to the
# 'answer_generator' entry for roles that are not listed here.
DEFAULT_LLM_CONFIGS = {
    # Extracts what the question wants to determine (building line, FAR, ...).
    'target_extractor': dict(model='HCX-005', temperature=0.1, max_tokens=512),
    # Judges per chunk whether to expand refs, with priority and reason.
    'ref_expander': dict(model='HCX-005', temperature=0.1, max_tokens=512),
    # Decides how many refs to expand first on each hop.
    'ref_batch_planner': dict(model='HCX-005', temperature=0.0, max_tokens=500),
    # Generates the final answer.
    'answer_generator': dict(model='HCX-005', temperature=0.3, max_tokens=2048),
}


def _build_llm_from_cfg(cfg: dict) -> ChatClovaX:
    """Instantiate a ``ChatClovaX`` model from a config dict.

    Args:
        cfg: May contain ``model``, ``temperature`` and ``max_tokens``.
            Missing keys default to ``'HCX-005'``, ``0.0`` and ``1024``.

    Returns:
        The configured chat model.
    """
    model_name = cfg.get('model', 'HCX-005')
    temperature = float(cfg.get('temperature', 0.0))
    token_limit = int(cfg.get('max_tokens', 1024))
    return ChatClovaX(model=model_name, temperature=temperature, max_tokens=token_limit)


def get_runtime_llm(runtime: dict, role: str) -> ChatClovaX:
    """Return the chat model for ``role``, building and caching it on first use.

    The effective config starts from ``DEFAULT_LLM_CONFIGS[role]`` (or the
    ``'answer_generator'`` defaults for an unknown role) and is then overlaid
    with ``runtime['llm_configs'][role]`` when present.

    Args:
        runtime: Runtime dict; the cache lives under its ``'_llm_cache'`` key.
        role: Role name used as the cache and config key.

    Returns:
        The cached or newly built model.
    """
    llm_cache = runtime.setdefault('_llm_cache', {})
    try:
        return llm_cache[role]
    except KeyError:
        pass

    overrides_by_role = runtime.get('llm_configs', {}) or {}
    base_cfg = DEFAULT_LLM_CONFIGS.get(role, DEFAULT_LLM_CONFIGS['answer_generator'])
    effective_cfg = {**base_cfg, **(overrides_by_role.get(role, {}) or {})}

    llm_cache[role] = _build_llm_from_cfg(effective_cfg)
    return llm_cache[role]


def init_runtime(llm_configs: dict | None = None):
    """Build the runtime dict shared by the agent helpers.

    Creates the retriever with its default arguments, loads the appendix
    terms, and merges ``llm_configs`` over a copy of ``DEFAULT_LLM_CONFIGS``.

    Args:
        llm_configs: Optional per-role override dicts. Roles that are not in
            the defaults are added as new entries.

    Returns:
        A dict with ``client``, ``store``, ``terms``, ``llm_configs`` and an
        empty ``_llm_cache``.
    """
    # build_retriever picks cloud or local mode by itself.
    client, store = build_retriever()
    terms = load_appendix_terms()

    merged = {role: dict(defaults) for role, defaults in DEFAULT_LLM_CONFIGS.items()}
    for role, override in (llm_configs or {}).items():
        merged.setdefault(role, {}).update(override or {})

    return {
        'client': client,
        'store': store,
        'terms': terms,
        'llm_configs': merged,
        '_llm_cache': {},
    }


## Step C. 질문에서 target 추출 (LLM)


In [None]:
def extract_targets_with_llm(llm, user_query: str) -> list[str]:
    """Ask the LLM which items the question wants to determine.

    The reply is parsed as JSON, first as a whole and then from the first
    ``{...}`` span found in it. If that yields no usable targets, a fixed
    keyword list is matched against the question instead.

    Args:
        llm: Object exposing ``invoke(prompt)``; the reply's ``content``
            attribute is used when present, otherwise ``str(reply)``.
        user_query: Question text.

    Returns:
        Deduplicated targets in first-seen order, or ``['일반']`` when none
        were found.
    """
    prompt = f"""
당신은 건축법 질의에서 '무엇을 구하려는지'를 추출하는 분류기입니다.

규칙:
1) 출력은 반드시 JSON 하나만 출력
2) 스키마: {{"targets": ["...", "..."]}}
3) targets에는 계산/판단 대상 키워드를 넣는다 (예: 건축선, 용적률, 건폐율, 주차대수, 높이제한, 접도조건)
4) 중복 제거
5) 없으면 {{"targets": ["일반"]}}

질문:
{user_query}
""".strip()

    reply = llm.invoke(prompt)
    text = getattr(reply, 'content', str(reply)).strip()

    def _try_json(candidate: str):
        # Returns (parsed_ok, value) so a legitimate JSON null is not
        # mistaken for a parse failure.
        try:
            return True, json.loads(candidate)
        except Exception:
            return False, None

    parsed_ok, parsed = _try_json(text)
    if not parsed_ok:
        brace_span = re.search(r'\{[\s\S]*\}', text)
        if brace_span:
            _, parsed = _try_json(brace_span.group(0))

    targets = []
    if isinstance(parsed, dict) and isinstance(parsed.get('targets'), list):
        targets = [
            entry.strip()
            for entry in parsed['targets']
            if isinstance(entry, str) and entry.strip()
        ]

    if not targets:
        # Keyword fallback when the LLM output is unusable.
        keywords = ('건축선', '용적률', '건폐율', '주차', '높이제한', '접도조건', '일조권', '도로사선')
        targets = [kw for kw in keywords if kw in user_query]

    # dict.fromkeys drops duplicates while keeping first-seen order.
    unique_targets = list(dict.fromkeys(targets))
    return unique_targets or ['일반']


In [None]:
# Initialize the runtime (this is not an example run).
runtime = init_runtime(
    llm_configs=dict(
        target_extractor=dict(temperature=0.1, max_tokens=512),
        ref_expander=dict(temperature=0.1, max_tokens=512),
        ref_batch_planner=dict(temperature=0.1, max_tokens=100),
        answer_generator=dict(temperature=0.3, max_tokens=2048),
    )
)


## Step D. 공통 유틸 (dedup / key)


In [None]:
def context_key(item: dict) -> str:
    """Return the ``law_id:article_num`` key of a context item.

    Args:
        item: Context dict with an optional ``metadata`` dict. Non-dict items
            and missing, ``None`` or non-dict metadata are treated as empty.

    Returns:
        ``f"{law_id}:{article_num}"``; absent fields render as ``'None'``.
    """
    # NOTE(review): article_sub is not part of the key, so e.g. article 4 and
    # article 4의2 of the same law share one key - confirm this is intended.
    meta = item.get('metadata') if isinstance(item, dict) else None
    if not isinstance(meta, dict):
        # An explicit ``'metadata': None`` used to raise AttributeError here.
        meta = {}
    return f"{meta.get('law_id')}:{meta.get('article_num')}"


In [None]:
def dedup_contexts(items: list[dict]) -> list[dict]:
    """Collapse context items that share the same ``context_key``.

    Args:
        items: Context dicts.

    Returns:
        One item per key. Keys keep the position of their first occurrence,
        while the item kept for a key is its last occurrence.
    """
    latest_by_key = {context_key(entry): entry for entry in items}
    return list(latest_by_key.values())


## Step D-1. 깊은 참조 추적 유틸 (LLM + Rule)


In [None]:
# Law names that can be resolved to a law id; extract_ref_candidates looks up
# the ``law_name`` of external references here.
IN_SCOPE_LAW_NAME_TO_ID = {
    law_name: law_id
    for law_name, law_id in (
        ('건축법', '001823'),
        ('건축법 시행령', '002118'),
    )
}


def normalize_abbr_map(abbr_obj) -> dict:
    """Return the abbreviation mapping regardless of how it is wrapped.

    Args:
        abbr_obj: Either the mapping itself or a dict holding it under the
            ``'약어'`` key.

    Returns:
        The nested ``'약어'`` dict when present and a dict, the input itself
        when it is any other dict, and ``{}`` for non-dict input.
    """
    if isinstance(abbr_obj, dict):
        nested = abbr_obj.get('약어')
        return nested if isinstance(nested, dict) else abbr_obj
    return {}


def extract_ref_candidates(meta: dict) -> list[dict]:
    """Turn the reference lists in chunk metadata into ref candidates.

    ``meta['internal_refs']`` entries point into the chunk's own law and need
    both a law id and an article. ``meta['external_refs']`` entries are kept
    only when their law can be resolved to a law id; their article may be
    empty. Duplicates (same law, article, paragraph, item and kind) are
    dropped.

    Args:
        meta: Chunk metadata with optional ``law_id``, ``internal_refs`` and
            ``external_refs``.

    Returns:
        Candidate dicts with ``law_id``, ``article``, ``paragraph``, ``item``,
        ``source_ref`` (the original ref dict) and ``source``. Candidates with
        an article come first; within each group internal refs precede
        external ones.
    """
    raw_law_id = str(meta.get('law_id', '') or '').strip()
    # ''.zfill(6) is '000000', which is truthy: padding unconditionally made
    # the "law id present" guard below dead and produced candidates pointing
    # at a non-existent law. Pad only a real id.
    law_id = raw_law_id.zfill(6) if raw_law_id else ''
    out = []
    seen = set()

    def parse_para_item_fallback(r: dict) -> tuple[str, str]:
        # Prefer the structured fields; otherwise pull them from the raw text.
        p = str(r.get('paragraph', '') or '').strip()
        it = str(r.get('item', '') or '').strip()
        if p or it:
            return p, it
        raw = str(r.get('raw', '') or '')
        m1 = re.search(r'제\s*(\d+)\s*항', raw)
        m2 = re.search(r'제\s*(\d+)\s*호', raw)
        return (m1.group(1) if m1 else ''), (m2.group(1) if m2 else '')

    def resolve_external_law_id(r: dict) -> str:
        # Returns '' when the referenced law cannot be resolved.
        lname = str(r.get('law_name', '') or '').strip()
        raw = str(r.get('raw', '') or '')
        if lname in IN_SCOPE_LAW_NAME_TO_ID:
            return IN_SCOPE_LAW_NAME_TO_ID[lname]
        if '대통령령' in raw or '시행령' in raw:
            return '002118'
        # NOTE(review): '법\s*제\d+' also matches other statutes whose name
        # ends in '법' (e.g. '주차장법 제19조') - confirm that is acceptable.
        if re.search(r'\b법\b|법\s*제\d+', raw):
            return '001823'
        if '이 법' in raw:
            return law_id
        return ''

    def add_candidate(ref_law_id: str, r: dict, article: str, kind: str, source: str) -> None:
        paragraph, item = parse_para_item_fallback(r)
        key = (ref_law_id, article, paragraph, item, kind)
        if key in seen:
            return
        seen.add(key)
        out.append({
            'law_id': ref_law_id,
            'article': article,
            'paragraph': paragraph,
            'item': item,
            'source_ref': r,
            'source': source,
        })

    for r in meta.get('internal_refs', []) or []:
        if not isinstance(r, dict):
            continue
        # ``or ''`` keeps an explicit None from becoming the string 'None'.
        article = str(r.get('article', '') or '').strip()
        if not (law_id and article):
            continue
        add_candidate(law_id, r, article, 'internal', 'internal')

    for r in meta.get('external_refs', []) or []:
        if not isinstance(r, dict):
            continue
        mapped_law_id = resolve_external_law_id(r)
        if not mapped_law_id:
            continue
        article = str(r.get('article', '') or '').strip()
        source = 'external-in-scope' if mapped_law_id in ['001823', '002118'] else 'external'
        add_candidate(mapped_law_id, r, article, 'external', source)

    out.sort(key=lambda x: (0 if str(x.get('article', '')).strip() else 1, x.get('source') != 'internal'))
    return out



In [None]:
def rule_expand_analysis(content: str, meta: dict) -> dict:
    """Rule-based analysis of whether a chunk's references warrant expansion.

    Expansion is recommended when any of these hold: the metadata carries
    abbreviations, the text contains a statutory cue phrase, or there are at
    least two internal references.

    Args:
        content: Chunk text; ``None`` is treated as empty.
        meta: Chunk metadata.

    Returns:
        A dict with ``expand``, the human-readable ``reasons``, the sorted
        unique ``cue_matches``, and the counts ``abbr_count``,
        ``ref_candidates_count`` and ``internal_ref_count``.
    """
    candidates = extract_ref_candidates(meta)
    abbreviations = normalize_abbr_map(meta.get('abbreviations', {}))
    body = content or ''

    cue_hits = re.findall(
        r'(이하|에\s*따른|에\s*의한|에서\s*정하는|대통령령으로\s*정하는|국토교통부령으로\s*정하는)',
        body,
    )
    unique_cues = sorted(set(cue_hits))
    internal_total = len(meta.get('internal_refs', []) or [])

    notes = ['ref 후보 없음' if not candidates else f"ref 후보 {len(candidates)}개"]

    # Each trigger pairs "did it fire" with the note recorded when it did.
    triggers = (
        (bool(abbreviations), f"축약어 {len(abbreviations)}개 존재"),
        (bool(cue_hits), f"법령 해석 cue 발견: {unique_cues}"),
        (internal_total >= 2, f"internal_refs {internal_total}개 (>=2)"),
    )
    notes.extend(note for fired, note in triggers if fired)
    should_expand = any(fired for fired, _ in triggers)

    if candidates and not should_expand:
        notes.append('rule 기준으로는 확장 필요성 낮음')

    return {
        'expand': bool(should_expand),
        'reasons': notes,
        'cue_matches': unique_cues,
        'abbr_count': len(abbreviations),
        'ref_candidates_count': len(candidates),
        'internal_ref_count': internal_total,
    }


def rule_should_expand_chunk(content: str, meta: dict) -> bool:
    """Return only the ``expand`` verdict of ``rule_expand_analysis``."""
    analysis = rule_expand_analysis(content, meta)
    return analysis['expand']


def llm_should_expand_chunk(llm, user_query: str, target: str, content: str, meta: dict, return_debug: bool = False):
    """Ask the LLM whether the current chunk's references should be expanded.

    The reply is parsed as JSON, first as a whole and then from the first
    ``{...}`` span. Korean key aliases are accepted. Anything that cannot be
    interpreted falls back to ``expand=False`` / ``priority=0``.

    Args:
        llm: Object exposing ``invoke(prompt)``; the reply's ``content``
            attribute is used when present, otherwise ``str(reply)``.
        user_query: Question text.
        target: Target keyword being answered.
        content: Chunk text.
        meta: Chunk metadata; up to five internal and five external refs are
            shown to the model.
        return_debug: When true, also return a debug dict.

    Returns:
        ``(expand, priority, reason)`` with priority clamped to 0..2, or
        ``((expand, priority, reason), debug)`` when ``return_debug`` is set.
    """
    meta = meta or {}
    refs_preview = {
        'internal_refs': (meta.get('internal_refs', []) or [])[:5],
        'external_refs': (meta.get('external_refs', []) or [])[:5],
    }

    prompt = f"""
당신은 건축법률 RAG 참조 확장 판단기다.
현재 청크를 읽고, 이 청크의 참조조항을 추가 추적해야 하는지 판단하라.

규칙:
- 출력은 JSON만: {{"expand": true/false, "priority": 0|1|2, "reason": "..."}}
- expand=true 기준: 질문/타깃 답변에 참조조항 해석이 핵심인 경우
- priority: 2(매우중요), 1(중요), 0(낮음)
- reason에는 "왜 참조를 확장/비확장해야 하는지"를 한 문장 이상 구체적으로 **한국어**로 쓴다.

질문: {user_query}
타깃: {target}
청크: {(content or '')}
refs_preview: {refs_preview}
""".strip()

    raw = llm.invoke(prompt)
    text = getattr(raw, 'content', str(raw)).strip()

    parse_error = ''
    obj = {}
    try:
        obj = json.loads(text)
    except Exception:
        m = re.search(r'\{[\s\S]*\}', text)
        if m:
            try:
                obj = json.loads(m.group(0))
            except Exception as e:
                parse_error = str(e)
                obj = {}
        else:
            parse_error = 'json object not found'

    # Valid JSON is not necessarily an object: a bare list, string or boolean
    # would crash on .get() below, so treat it like a parse failure.
    if not isinstance(obj, dict):
        parse_error = f'json root is {type(obj).__name__}, expected object'
        obj = {}

    # Key alias fallback (in case the model answers with Korean keys).
    expand_v = obj.get('expand', obj.get('확장', obj.get('need_expand', False)))
    pri_v = obj.get('priority', obj.get('우선순위', obj.get('importance', 0)))
    reason_v = obj.get('reason', obj.get('이유', obj.get('근거', '')))

    if isinstance(expand_v, str):
        # bool('false') is True; read explicit negative spellings as False.
        expand = expand_v.strip().lower() not in ('', 'false', '0', 'no', 'n', 'none', 'null', '아니오', '아니요')
    else:
        expand = bool(expand_v)

    # isdecimal() only accepts characters int() can parse; isdigit() also
    # accepts e.g. superscript digits, which make int() raise.
    pri_text = str(pri_v).strip()
    priority = int(pri_text) if pri_text.isdecimal() else 0
    reason = str(reason_v or '').strip()

    # If no reason was given, keep a short slice of the raw reply so the
    # decision can still be debugged.
    if not reason:
        reason = (text[:300] + '...') if len(text) > 300 else text

    result = (expand, max(0, min(priority, 2)), reason)
    if not return_debug:
        return result

    debug = {
        'prompt_preview': prompt[:1200],
        'raw_text': text,
        'parsed_obj': obj,
        'parse_error': parse_error,
    }
    return result, debug





def llm_select_ref_batch_size(
    llm,
    user_query: str,
    targets: list[str],
    hop_count: int,
    max_hops: int,
    pending_refs: list[dict],
    max_batch: int = 4,
    return_debug: bool = False,
):
    """Ask the LLM how many pending refs to expand on this hop.

    When the reply carries no usable integer, a heuristic based on the number
    of pending refs is used instead. The result is always clamped to
    ``1..max_batch``.

    Args:
        llm: Object exposing ``invoke(prompt)``; the reply's ``content``
            attribute is used when present, otherwise ``str(reply)``.
        user_query: Question text.
        targets: Target keywords.
        hop_count: Current hop number.
        max_hops: Hop limit; the remaining hops are shown to the model.
        pending_refs: Refs not yet expanded; the first 12 are previewed.
        max_batch: Upper bound for the returned count.
        return_debug: When true, also return a debug dict.

    Returns:
        ``(count, reason)``, or ``((count, reason), debug)`` when
        ``return_debug`` is set.
    """
    # Preview of the candidates that have not been expanded yet.
    preview = []
    for r in (pending_refs or [])[:12]:
        preview.append({
            'law_id': r.get('law_id'),
            'article': r.get('article'),
            'priority': r.get('priority', 0),
            'source': r.get('source', ''),
            'decision_reason': str(r.get('decision_reason', ''))[:120],
        })

    prompt = f"""
당신은 법률 RAG의 ref 확장 플래너다.
이번 hop에서 ref를 몇 개까지 우선 확장할지 정한다.

규칙:
- 출력은 JSON만: {{"expand_count": 정수, "reason": "..."}}
- expand_count 범위: 1 ~ {max_batch}
- 중요도(priority), 질문과의 관련성, 남은 hop({max_hops - hop_count})을 고려
- 보수적으로 판단하되, 핵심 ref를 놓치지 않도록 한다.

질문: {user_query}
targets: {targets}
현재 hop: {hop_count}
max_hops: {max_hops}
pending_ref_count: {len(pending_refs or [])}
pending_preview: {preview}
""".strip()

    raw = llm.invoke(prompt)
    text = getattr(raw, 'content', str(raw)).strip()

    obj = {}
    parse_error = ''
    try:
        obj = json.loads(text)
    except Exception:
        m = re.search(r'\{[\s\S]*\}', text)
        if m:
            try:
                obj = json.loads(m.group(0))
            except Exception as e:
                parse_error = str(e)
                obj = {}
        else:
            parse_error = 'json object not found'

    # Valid JSON is not necessarily an object: a bare list, string or number
    # would crash on .get() below, so treat it like a parse failure.
    if not isinstance(obj, dict):
        parse_error = f'json root is {type(obj).__name__}, expected object'
        obj = {}

    n = obj.get('expand_count', obj.get('count', obj.get('n', 1)))
    reason = str(obj.get('reason', obj.get('이유', '')) or '').strip()

    # isdecimal() only accepts characters int() can parse; isdigit() also
    # accepts e.g. superscript digits, which make int() raise.
    n_text = str(n).strip()
    if n_text.isdecimal():
        n = int(n_text)
    else:
        # Fallback heuristic: scale the batch with the size of the backlog.
        p = len(pending_refs or [])
        if p >= 10:
            n = 4
        elif p >= 6:
            n = 3
        elif p >= 3:
            n = 2
        else:
            n = 1
        reason = reason or f'fallback_heuristic(pending={p})'

    n = max(1, min(int(n), int(max_batch)))

    if not return_debug:
        return n, reason

    debug = {
        'raw_text': text,
        'parsed_obj': obj,
        'parse_error': parse_error,
        'prompt_preview': prompt[:1500],
    }
    return (n, reason), debug




def get_ref_docs_cached(runtime, law_id: str, article: str):
    """Fetch an article's documents through a per-runtime cache.

    Args:
        runtime: Runtime dict; the cache lives under ``'_ref_doc_cache'`` and
            the Qdrant client under ``'client'``.
        law_id: Law identifier; stripped before use.
        article: Article token; stripped before use.

    Returns:
        The cached list for ``"<law_id>:<article>"``. On a miss the article is
        looked up in the ``'building_law'`` collection and the result is
        stored, with a falsy result stored as ``[]``.
    """
    lid = str(law_id).strip()
    art = str(article).strip()
    cache_key = f"{lid}:{art}"

    doc_cache = runtime.setdefault('_ref_doc_cache', {})
    if cache_key not in doc_cache:
        fetched = get_article(runtime['client'], 'building_law', lid, art)
        doc_cache[cache_key] = fetched or []
    return doc_cache[cache_key]



def build_ref_docs_preview(
    docs: list[dict],
    paragraph: str = '',
    item: str = '',
    ref_key: str = '',
    max_docs: int = 2,
    max_chars: int = 260,
):
    """Build short previews of referenced documents, focused on one paragraph.

    When ``paragraph`` is given, the matching paragraph is taken from the
    structured ``metadata['paragraphs']`` list if possible, otherwise sliced
    out of the text between its circled-number marker and the next one. The
    article header line is put in front of the paragraph when both exist.

    Args:
        docs: Document dicts with ``content`` or ``page_content`` and
            ``metadata``; only the first ``max_docs`` are used.
        paragraph: Requested paragraph as ``'제2항'``, ``'2항'``, ``'②'`` or
            ``'2'``.
        item: Requested item; echoed in the output only.
        ref_key: ``'law_id:article[:sub]'`` fallback for documents whose
            metadata lacks a law id or article number.
        max_docs: Maximum number of documents to preview.
        max_chars: Maximum length of every preview string.

    Returns:
        One dict per previewed document. With a requested paragraph,
        ``content_preview`` is the targeted preview and ``paragraph_match``
        says whether one was found; without one, ``content_preview`` is the
        beginning of the full text and ``paragraph_match`` is ``True``.
    """
    requested_paragraph = str(paragraph or '').strip()
    requested_item = str(item or '').strip()

    # Paragraph number -> circled marker, and the reverse lookup.
    number_to_symbol = {
        str(idx): symbol
        for idx, symbol in enumerate('①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳', start=1)
    }
    symbol_to_number = {symbol: number for number, symbol in number_to_symbol.items()}

    def _normalize_paragraph(value) -> str:
        # Reduce '제2항' / '2항' / '②' to '2'; anything else passes through.
        token = str(value or '').strip()
        if not token:
            return ''
        for pattern in (r'제\s*(\d+)\s*항', r'(\d+)\s*항'):
            found = re.search(pattern, token)
            if found:
                return found.group(1)
        return symbol_to_number.get(token, token)

    def _slice_paragraph(full_text, number) -> str:
        # Text from the paragraph's marker up to the next circled marker.
        marker = number_to_symbol.get(str(number), '')
        if not marker:
            return ''
        body = str(full_text or '')
        start = body.find(marker)
        if start < 0:
            return ''
        later_markers = [
            pos
            for pos in (body.find(sym, start + 1) for sym in number_to_symbol.values())
            if pos >= 0
        ]
        end = min(later_markers, default=len(body))
        return body[start:end].strip()

    def _article_header(full_text) -> str:
        # Header line such as '제46조(건축선의 지정)'.
        body = str(full_text or '').strip()
        if not body:
            return ''
        non_blank = [line.strip() for line in body.splitlines() if str(line).strip()]
        if not non_blank:
            return ''
        if re.match(r'^제\s*\d+(?:의\d+)?조', non_blank[0]):
            return non_blank[0]
        found = re.search(r'(제\s*\d+(?:의\d+)?조[^\n]*)', body)
        return found.group(1).strip() if found else ''

    wanted = _normalize_paragraph(requested_paragraph)
    previews = []

    for doc in (docs or [])[:max_docs]:
        is_mapping = isinstance(doc, dict)
        meta = (doc.get('metadata', {}) or {}) if is_mapping else {}

        content = ''
        if is_mapping:
            content = (
                doc.get('content')
                or doc.get('page_content')
                or meta.get('content_original')
                or meta.get('page_content')
                or ''
            )

        law_id = str(meta.get('law_id', '') or '').zfill(6) if meta.get('law_id') else ''
        article_num = str(meta.get('article_num', '') or '')
        article_sub = str(meta.get('article_sub', '0') or '0')

        # Fill gaps in the identifiers from the ref key.
        if not (law_id and article_num) and ref_key and ':' in ref_key:
            parts = ref_key.split(':')
            law_id = law_id or parts[0].zfill(6)
            article_num = article_num or (parts[1] if len(parts) > 1 else '')
            article_sub = article_sub or (parts[2] if len(parts) > 2 else '0')

        targeted = ''
        structured = meta.get('paragraphs', []) or []
        if wanted:
            # 1) Structured paragraphs first.
            if isinstance(structured, list):
                hit = next(
                    (
                        para
                        for para in structured
                        if isinstance(para, dict)
                        and _normalize_paragraph(str(para.get('num', '')).strip()) == wanted
                    ),
                    None,
                )
                if hit is not None:
                    targeted = str(hit.get('content', '') or '')
            # 2) Fallback: slice by circled marker straight from the text.
            if not targeted:
                targeted = _slice_paragraph(content, wanted)

        header = _article_header(content)
        full_preview = str(content).replace('\n', ' ')[:max_chars]

        if wanted and targeted and header:
            targeted_with_header = f"{header}\n{targeted}".strip()
        else:
            targeted_with_header = targeted
        targeted_preview = ''
        if targeted_with_header:
            targeted_preview = str(targeted_with_header).replace('\n', ' ')[:max_chars]

        final_preview = targeted_preview if wanted else (targeted_preview or full_preview)
        paragraph_match = bool(targeted_preview) if wanted else True

        previews.append({
            'chunk_key': f"{law_id}:{article_num}:{article_sub}",
            'law_name': meta.get('law_name', ''),
            'article_num': article_num,
            'article_title': meta.get('article_title', ''),
            'paragraph': requested_paragraph,
            'item': requested_item,
            'targeted_preview': targeted_preview,
            'content_preview': final_preview,
            'raw_content_preview': full_preview,
            'matched_paragraph_norm': wanted,
            'paragraph_match': paragraph_match,
        })
    return previews



def llm_should_follow_ref_by_content(
    llm,
    user_query: str,
    target: str,
    parent_chunk_preview: str,
    ref_key: str,
    ref_docs_preview: list[dict],
    return_debug: bool = False,
):
    """Ask the LLM whether a reference should be followed from the parent chunk.

    Despite the name, the referenced text is not shown to the model: only the
    number of previews is put into the prompt. The decision is made from the
    parent chunk, and the returned reason is prefixed accordingly.

    Args:
        llm: Object exposing ``invoke(prompt)``; the reply's ``content``
            attribute is used when present, otherwise ``str(reply)``.
        user_query: Question text.
        target: Target keyword being answered.
        parent_chunk_preview: Text of the referencing chunk; the first 700
            characters are sent.
        ref_key: Identifier of the reference under consideration.
        ref_docs_preview: Previews of the referenced documents; only their
            count is used.
        return_debug: When true, also return a debug dict.

    Returns:
        ``(follow, priority, reason)`` with priority clamped to 0..2, or
        ``((follow, priority, reason), debug)`` when ``return_debug`` is set.
    """
    # Important: this decision is made without reading the ref's body.
    # ref_docs_preview is passed for debugging/display only; its content is
    # not included in the prompt.
    ref_preview_count = len(ref_docs_preview or [])

    prompt = f"""
당신은 법률 참조 추적 판단기다.
반드시 현재 chunk 맥락만 보고, 해당 ref를 추적해야 하는지 판단하라.

중요 규칙:
- ref 조문 본문 내용은 아직 읽지 않은 상태로 판단한다.
- 아래 입력 중 ref_docs_preview 내용은 사용하지 않는다.
- 출력 reason에는 ref 본문에 대한 구체 인용을 쓰지 않는다.

출력 JSON만:
{{"follow": true/false, "priority": 0|1|2, "reason": "..."}}

판단 기준:
- 현재 chunk를 이해/적용하려면 참조 해석이 필수면 follow=true
- 필수성이 낮거나 현재 chunk만으로 충분하면 follow=false

query: {user_query}
target: {target}
parent_chunk_preview: {(parent_chunk_preview or '')[:700]}
ref_key: {ref_key}
ref_preview_count: {ref_preview_count}
""".strip()

    raw = llm.invoke(prompt)
    text = getattr(raw, 'content', str(raw)).strip()

    obj = {}
    parse_error = ''
    try:
        obj = json.loads(text)
    except Exception:
        m = re.search(r'\{[\s\S]*\}', text)
        if m:
            try:
                obj = json.loads(m.group(0))
            except Exception as e:
                parse_error = str(e)
                obj = {}
        else:
            parse_error = 'json object not found'

    # Valid JSON is not necessarily an object: a bare list, string or boolean
    # would crash on .get() below, so treat it like a parse failure.
    if not isinstance(obj, dict):
        parse_error = f'json root is {type(obj).__name__}, expected object'
        obj = {}

    follow_v = obj.get('follow', obj.get('추적', obj.get('expand', False)))
    pri_v = obj.get('priority', obj.get('우선순위', 0))
    reason_v = obj.get('reason', obj.get('이유', ''))

    if isinstance(follow_v, str):
        # bool('false') is True; read explicit negative spellings as False.
        follow = follow_v.strip().lower() not in ('', 'false', '0', 'no', 'n', 'none', 'null', '아니오', '아니요')
    else:
        follow = bool(follow_v)

    # isdecimal() only accepts characters int() can parse; isdigit() also
    # accepts e.g. superscript digits, which make int() raise.
    pri_text = str(pri_v).strip()
    priority = int(pri_text) if pri_text.isdecimal() else 0
    reason = str(reason_v or '').strip()
    if not reason:
        reason = (text[:250] + '...') if len(text) > 250 else text

    # Prefix marking that the reason was produced without the ref's body.
    reason = f"precheck_without_ref_content: {reason}"

    result = (follow, max(0, min(priority, 2)), reason)
    if not return_debug:
        return result

    debug = {
        'raw_text': text,
        'parsed_obj': obj,
        'parse_error': parse_error,
        'prompt_preview': prompt[:1400],
    }
    return result, debug




def evaluate_ref_candidates_from_chunk(runtime, user_query: str, target: str, doc: dict, max_refs_per_chunk: int = 6) -> list[dict]:
    """Review a chunk's reference candidates and decide which to follow.

    The LLM is first asked once whether the chunk needs references at all. If
    not, every shortlisted candidate is returned as ``follow=False``.
    Otherwise each candidate's documents are fetched and previewed, and the
    LLM decides per candidate.

    Args:
        runtime: Runtime dict used for LLM access and document lookup.
        user_query: Question text.
        target: Target keyword being answered.
        doc: Chunk dict with ``content`` and ``metadata``.
        max_refs_per_chunk: Number of candidates reviewed.

    Returns:
        One review dict per shortlisted candidate, carrying the candidate's
        identifiers plus ``follow``, ``priority``, ``decision_reason``,
        ``ref_docs_preview`` and ``raw_ref``. Empty when the chunk has no
        candidates.
    """
    content = str(doc.get('content', ''))
    meta = doc.get('metadata', {}) or {}
    src_key = context_key(doc)

    candidates = extract_ref_candidates(meta)
    if not candidates:
        return []

    def _review(cand: dict, follow: bool, priority: int, reason: str, previews: list) -> dict:
        # Every outcome shares this record shape.
        return {
            'law_id': str(cand.get('law_id', '')).strip(),
            'article': str(cand.get('article', '')).strip(),
            'paragraph': str(cand.get('paragraph', '')).strip(),
            'item': str(cand.get('item', '')).strip(),
            'source': cand.get('source', ''),
            'source_chunk_key': src_key,
            'follow': follow,
            'priority': priority,
            'decision_reason': reason,
            'ref_docs_preview': previews,
            'raw_ref': (cand.get('source_ref') or {}).get('raw', ''),
        }

    try:
        need_ref, need_reason = llm_need_refs_from_current_context(
            get_runtime_llm(runtime, 'ref_expander'),
            user_query=user_query,
            target=target,
            context_preview=(content or '')[:900],
        )
    except Exception as err:
        # If the gate itself fails, fall through to per-candidate review.
        need_ref, need_reason = True, f'llm_error:{err}'

    shortlisted = candidates[:max_refs_per_chunk]

    if not need_ref:
        return [
            _review(cand, False, 0, f'chunk_context_sufficient: {need_reason}', [])
            for cand in shortlisted
        ]

    reviews = []
    for cand in shortlisted:
        law_id = str(cand.get('law_id', '')).strip()
        article = str(cand.get('article', '')).strip()
        ref_key = f"{law_id}:{article or '__law__'}"

        ref_docs = get_ref_docs_for_candidate(runtime, law_id, article, user_query=user_query, target=target)
        ref_preview = build_ref_docs_preview(
            ref_docs,
            paragraph=str(cand.get('paragraph', '')).strip(),
            item=str(cand.get('item', '')).strip(),
            ref_key=ref_key,
            max_docs=2,
            max_chars=260,
        )

        if not ref_preview:
            reviews.append(_review(cand, False, 0, 'ref_docs_not_found', []))
            continue

        try:
            (follow, pri, reason), _dbg = llm_should_follow_ref_by_content(
                get_runtime_llm(runtime, 'ref_expander'),
                user_query=user_query,
                target=target,
                parent_chunk_preview=content,
                ref_key=ref_key,
                ref_docs_preview=ref_preview,
                return_debug=True,
            )
        except Exception as err:
            follow, pri, reason = False, 0, f'llm_error:{err}'

        reviews.append(_review(cand, bool(follow), int(pri or 0), reason, ref_preview))

    return reviews



def propose_refs_from_chunk(runtime, user_query: str, target: str, doc: dict, max_refs_per_chunk: int = 6) -> list[dict]:
    """Return only the reviewed references that should be followed.

    Args:
        runtime: Runtime dict passed to the review step.
        user_query: Question text.
        target: Target keyword being answered.
        doc: Chunk dict with ``content`` and ``metadata``.
        max_refs_per_chunk: Number of candidates reviewed.

    Returns:
        One dict per review with a truthy ``follow``. The review's
        ``decision_reason`` is exposed under both ``decision_reason`` and
        ``llm_reason``; ``paragraph`` and ``item`` are not carried over.
    """
    reviews = evaluate_ref_candidates_from_chunk(
        runtime,
        user_query=user_query,
        target=target,
        doc=doc,
        max_refs_per_chunk=max_refs_per_chunk,
    )
    return [
        {
            'law_id': review.get('law_id'),
            'article': review.get('article'),
            'source': review.get('source', ''),
            'source_chunk_key': review.get('source_chunk_key', ''),
            'priority': int(review.get('priority', 0)),
            'decision_reason': review.get('decision_reason', ''),
            'llm_reason': review.get('decision_reason', ''),
            'ref_docs_preview': review.get('ref_docs_preview', []),
            'raw_ref': review.get('raw_ref', ''),
        }
        for review in reviews
        if review.get('follow')
    ]


def llm_need_refs_from_current_context(llm, user_query: str, target: str, context_preview: str, return_debug: bool = False):
    """Ask the LLM whether the current context needs references to be followed.

    The reply is parsed as JSON, first as a whole and then from the first
    ``{...}`` span. A reply that cannot be interpreted counts as "no
    reference needed".

    Args:
        llm: Object exposing ``invoke(prompt)``; the reply's ``content``
            attribute is used when present, otherwise ``str(reply)``.
        user_query: Question text.
        target: Target keyword(s) being answered.
        context_preview: Text describing the context gathered so far.
        return_debug: When true, also return a debug dict.

    Returns:
        ``(need_ref, reason)``, or ``((need_ref, reason), debug)`` when
        ``return_debug`` is set. A missing reason is replaced by the first
        220 characters of the raw reply.
    """
    prompt = f"""
너는 법률 QA의 ref 필요성 판단기다.
반드시 현재 컨텍스트만 보고 판단하라. ref 내용은 아직 보지 않는다.

출력 JSON만:
{{"need_ref": true/false, "reason": "..."}}

판단 기준:
- 현재 컨텍스트만으로 질문의 판단/계산/결론이 가능하면 need_ref=false
- 참조 법령/조항의 해석이 있어야 현재 컨텍스트를 이해할 수 있으면 need_ref=true

query: {user_query}
target: {target}
current_context_preview: {context_preview}
""".strip()

    raw = llm.invoke(prompt)
    text = getattr(raw, 'content', str(raw)).strip()

    obj = {}
    parse_error = ''
    try:
        obj = json.loads(text)
    except Exception:
        m = re.search(r'\{[\s\S]*\}', text)
        if m:
            try:
                obj = json.loads(m.group(0))
            except Exception as e:
                parse_error = str(e)
                obj = {}
        else:
            parse_error = 'json object not found'

    # Valid JSON is not necessarily an object: a bare list, string or boolean
    # would crash on .get() below, so treat it like a parse failure.
    if not isinstance(obj, dict):
        parse_error = f'json root is {type(obj).__name__}, expected object'
        obj = {}

    need_v = obj.get('need_ref', obj.get('need_refs', obj.get('expand', False)))
    if isinstance(need_v, str):
        # bool('false') is True; read explicit negative spellings as False.
        need = need_v.strip().lower() not in ('', 'false', '0', 'no', 'n', 'none', 'null', '아니오', '아니요')
    else:
        need = bool(need_v)
    reason = str(obj.get('reason', obj.get('이유', '')) or '').strip() or (text[:220] if text else '')

    result = (need, reason)
    if not return_debug:
        return result
    return result, {'raw_text': text, 'parsed_obj': obj, 'parse_error': parse_error, 'prompt_preview': prompt[:1200]}


def llm_need_refs_for_state_context(runtime, state: dict):
    """Ask the 'ref_expander' LLM whether the state's contexts need reference expansion.

    Summarises at most the first six entries of ``state['contexts']`` and
    passes the stringified summary to ``llm_need_refs_from_current_context``,
    returning that function's result unchanged.
    """
    def _summarise(entry):
        meta = entry.get('metadata', {}) or {}
        flat_text = (entry.get('content') or '').replace('\n',' ')
        return {
            'chunk_key': f"{meta.get('law_id')}:{meta.get('article_num')}:{meta.get('article_sub','0')}",
            'law_name': meta.get('law_name',''),
            'article_num': meta.get('article_num',''),
            'article_title': meta.get('article_title',''),
            'excerpt': flat_text[:220],
        }

    leading_contexts = list(state.get('contexts', []) or [])[:6]
    preview_rows = [_summarise(entry) for entry in leading_contexts]

    expander_llm = get_runtime_llm(runtime, 'ref_expander')
    question = state.get('user_query','')
    # An empty target list falls back to the generic label.
    target_label = ', '.join(state.get('targets',[])) or '일반'
    return llm_need_refs_from_current_context(
        expander_llm,
        user_query=question,
        target=target_label,
        context_preview=str(preview_rows),
    )


def retrieve_related_chunks_in_law(runtime, user_query: str, target: str, law_id: str, k: int = 2):
    """Collect up to ``k`` distinct search hits that belong to the given law.

    Four query variants are searched in order against ``runtime['store']``.
    Hits are kept only when their ``law_id`` metadata (stripped, zero-padded to
    six characters) equals the requested one; duplicates are detected with
    ``context_key``. Returns as soon as ``k`` hits have been gathered.
    """
    wanted_law = str(law_id).strip().zfill(6)
    fetch_size = max(8, k * 4)
    search_texts = (
        user_query,
        target,
        f"{user_query} {target}",
        f"{target} 관련 조문",
    )

    collected = []
    taken_keys = set()
    for text in search_texts:
        for hit in search_law_chunks(runtime['store'], text, k=fetch_size):
            hit_meta = hit.get('metadata', {}) or {}
            hit_law = str(hit_meta.get('law_id','')).strip().zfill(6)
            if hit_law != wanted_law:
                continue
            hit_key = context_key(hit)
            if hit_key in taken_keys:
                continue
            taken_keys.add(hit_key)
            collected.append(hit)
            if len(collected) >= k:
                return collected
    return collected


def get_ref_docs_for_candidate(runtime, law_id: str, article: str, user_query: str, target: str):
    """Fetch the documents a reference candidate points to.

    With an article, the lookup goes through ``get_ref_docs_cached``; with only
    a law id, related chunks of that law are searched (two at most). A blank
    law id yields an empty list.
    """
    law_key = str(law_id).strip()
    article_key = str(article).strip()
    if law_key and article_key:
        return get_ref_docs_cached(runtime, law_key, article_key)
    if law_key:
        return retrieve_related_chunks_in_law(runtime, user_query=user_query, target=target, law_id=law_key, k=2)
    return []



## Step E. Target 단위 Agent (Single Target Worker)


In [None]:
def run_target_agent(runtime, user_query: str, target: str, k: int = 6) -> dict:
    """Retrieve chunks for one target and decide which of their references to queue.

    Steps:
      1. Search ``runtime['store']`` with three query variants and de-duplicate.
      2. If fewer than ``k`` hits remain, run backfill queries until ``k`` is reached.
      3. Keep the first ``k`` hits and review each one's reference candidates
         via ``evaluate_ref_candidates_from_chunk``.
      4. Queue every candidate flagged ``follow`` once per (law_id, article)
         pair and record a decision row per chunk.

    Returns:
        dict with ``target``, ``hits``, ``pending_refs`` (sorted by priority,
        highest first), ``chunk_decisions`` and ``retrieve_debug``.
    """
    store = runtime['store']
    fanout_k = max(k, 4)
    query_plan = [
        (f"{target}", fanout_k),
        (f"{user_query}", fanout_k),
        (f"{user_query} 관련 핵심: {target}", fanout_k),
    ]

    hits = []
    for text, size in query_plan:
        hits.extend(search_law_chunks(store, text, k=size))
    hits = dedup_contexts(hits)

    # Backfill only while the de-duplicated hit list is still short of k.
    backfill_k = max(k * 2, 8)
    backfill_queries = (
        f"{target} 건축법 조문",
        f"{target} 건축법 시행령 조문",
        f"{user_query} 법령 조문",
    )
    for text in backfill_queries:
        if len(hits) >= k:
            break
        hits.extend(search_law_chunks(store, text, k=backfill_k))
        hits = dedup_contexts(hits)

    hits = hits[:k]

    pending_refs = []
    queued_pairs = set()
    chunk_decisions = []

    for hit in hits:
        meta = hit.get('metadata', {}) or {}
        chunk_key = f"{meta.get('law_id')}:{meta.get('article_num')}:{meta.get('article_sub', '0')}"

        reviews = evaluate_ref_candidates_from_chunk(
            runtime,
            user_query=user_query,
            target=target,
            doc=hit,
            max_refs_per_chunk=6,
        )

        proposed_refs = []
        reject_reasons = []
        for review in reviews:
            if not review.get('follow'):
                reject_reasons.append(str(review.get('decision_reason', '')))
                continue

            reason_text = review.get('decision_reason', '')
            cand = {
                'law_id': review.get('law_id'),
                'article': review.get('article', ''),
                'source': review.get('source', ''),
                'source_chunk_key': chunk_key,
                'priority': int(review.get('priority', 0)),
                'decision_reason': reason_text,
                'llm_reason': reason_text,
                'ref_docs_preview': review.get('ref_docs_preview', []),
                'raw_ref': review.get('raw_ref', ''),
            }
            pair = (str(cand['law_id']), str(cand['article']))
            if pair in queued_pairs:
                continue
            queued_pairs.add(pair)
            pending_refs.append(cand)
            proposed_refs.append(f"{cand.get('law_id')}:{cand.get('article') or '__law__'}")

        should_expand = bool(proposed_refs)
        if reject_reasons:
            # Rejection reasons (first two non-empty) take precedence in the log.
            llm_reason = '; '.join([text for text in reject_reasons if text][:2])
        elif should_expand:
            llm_reason = 'needs_ref'
        else:
            llm_reason = 'chunk_context_sufficient'

        top_priority = max([0] + [int(review.get('priority',0)) for review in reviews])
        chunk_decisions.append({
            'chunk_key': chunk_key,
            'target': target,
            'ref_candidates_count': len(reviews),
            'rule_expand': None,
            'rule_reasons': [],
            'llm_expand': should_expand,
            'llm_priority': top_priority,
            'llm_reason': llm_reason,
            'llm_raw_text': '',
            'should_expand': should_expand,
            'decision_reason': llm_reason,
            'proposed_refs': proposed_refs,
        })

    ordered_refs = sorted(pending_refs, key=lambda ref: int(ref.get('priority', 0)), reverse=True)

    return {
        'target': target,
        'hits': hits,
        'pending_refs': ordered_refs,
        'chunk_decisions': chunk_decisions,
        'retrieve_debug': {'requested_k': k, 'returned_k': len(hits), 'query_plan': query_plan},
    }



## Step F. Multi-Agent Fan-out / Fan-in


In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed


In [None]:
def run_multi_target_agents(runtime, user_query: str, targets: list[str], k: int = 6) -> dict:
    """Fan out one ``run_target_agent`` per target on a thread pool and merge the results.

    Args:
        runtime: Runtime object passed through to every target agent.
        user_query: The user's question.
        targets: Target labels; an empty or missing list falls back to ``['일반']``.
        k: Number of hits requested per target.

    Returns:
        dict with per-target ``target_hits`` / ``target_decisions`` /
        ``target_retrieve_debug``, the de-duplicated merged ``contexts`` and
        the merged ``pending_refs`` sorted by priority (highest first).
    """
    # Drop duplicate targets (order-preserving) so the same agent is not run twice.
    targets = list(dict.fromkeys(targets or ['일반']))

    max_workers = min(4, max(1, len(targets)))
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futures = [ex.submit(run_target_agent, runtime, user_query, t, k) for t in targets]
        # Collect in submission order rather than completion order: the merge
        # order decides context ordering and which duplicate ref is kept, so it
        # must not depend on thread timing.
        outputs = [f.result() for f in futures]

    target_hits = {o['target']: o['hits'] for o in outputs}
    target_decisions = {o['target']: o.get('chunk_decisions', []) for o in outputs}
    target_retrieve_debug = {o['target']: o.get('retrieve_debug', {}) for o in outputs}

    all_contexts = []
    all_pending_refs = []
    seen_ref = set()

    for o in outputs:
        all_contexts.extend(o['hits'])
        for r in o['pending_refs']:
            # Keep only the first occurrence of each (law_id, article) pair.
            rk = (r.get('law_id'), r.get('article'))
            if rk in seen_ref:
                continue
            seen_ref.add(rk)
            all_pending_refs.append(r)

    all_pending_refs.sort(key=lambda x: int(x.get('priority', 0)), reverse=True)

    return {
        'target_hits': target_hits,
        'target_decisions': target_decisions,
        'contexts': dedup_contexts(all_contexts),
        'pending_refs': all_pending_refs,
        'target_retrieve_debug': target_retrieve_debug,
    }


## Step F-1. 디버깅: Retrieve/확장/Hop 추적


In [None]:
def summarize_target_hits(target_hits: dict[str, list[dict]]) -> list[dict]:
    """Return one ``{'target', 'retrieved_chunks'}`` row per target, sorted by target name."""
    hits_by_target = target_hits or {}
    summary = [
        {'target': name, 'retrieved_chunks': len(chunks)}
        for name, chunks in hits_by_target.items()
    ]
    return sorted(summary, key=lambda row: row['target'])


In [None]:
def inspect_target_hits(target_hits: dict[str, list[dict]], target: str, limit: int = 10) -> list[dict]:
    """Build a compact debug row for each of the first ``limit`` hits of ``target``.

    Args:
        target_hits: Mapping of target label to its retrieved documents.
        target: Label whose hits are inspected; unknown labels give ``[]``.
        limit: Maximum number of documents to report.

    Returns:
        One dict per document with its chunk key, law/article metadata,
        internal/external reference counts and the first 180 characters of
        its content.
    """
    rows = []
    # A target mapped to None is treated like a target with no hits.
    docs = (target_hits or {}).get(target, []) or []
    for d in docs[:limit]:
        # `metadata` may be present but None; guard it as the other helpers do.
        m = d.get('metadata', {}) or {}
        rows.append({
            'chunk_key': f"{m.get('law_id')}:{m.get('article_num')}:{m.get('article_sub', '0')}",
            'law_name': m.get('law_name'),
            'article_num': m.get('article_num'),
            'article_sub': m.get('article_sub', ''),
            'article_title': m.get('article_title'),
            'internal_ref_cnt': len(m.get('internal_refs', []) or []),
            'external_ref_cnt': len(m.get('external_refs', []) or []),
            'content_head': (d.get('content', '') or '')[:180],
        })
    return rows


In [None]:
def inspect_expand_decisions(runtime, user_query: str, target: str, docs: list[dict], limit: int = 10) -> list[dict]:
    """Re-run the per-reference review for each document and report it as debug rows.

    Each of the first ``limit`` documents is reviewed with
    ``evaluate_ref_candidates_from_chunk`` (up to 10 refs per chunk).

    Args:
        runtime: Runtime object passed through to the reviewer.
        user_query: The user's question.
        target: Target label recorded on every row.
        docs: Documents to inspect; ``None`` is treated as empty.
        limit: Maximum number of documents to inspect.

    Returns:
        One dict per document with all reviewed candidates
        (``ref_candidates_preview``) and the subset flagged ``follow``
        (``proposed_refs``).
    """
    rows = []
    for d in (docs or [])[:limit]:
        # `metadata` may be present but None; guard it as the other helpers do.
        m = d.get('metadata', {}) or {}

        reviews = evaluate_ref_candidates_from_chunk(
            runtime,
            user_query=user_query,
            target=target,
            doc=d,
            max_refs_per_chunk=10,
        )

        # Only the candidates the reviewer decided to follow.
        proposed = [
            {
                'law_id': x.get('law_id'),
                'article': x.get('article'),
                'paragraph': x.get('paragraph', ''),
                'item': x.get('item', ''),
                'source': x.get('source'),
                'priority': x.get('priority', 0),
                'decision_reason': x.get('decision_reason', ''),
                'raw_ref': x.get('raw_ref', ''),
                'ref_docs_preview': x.get('ref_docs_preview', []),
            }
            for x in reviews if x.get('follow')
        ]

        rows.append({
            'chunk_key': f"{m.get('law_id')}:{m.get('article_num')}:{m.get('article_sub', '0')}",
            'target': target,
            'ref_candidates_count': len(reviews),
            'ref_candidates_preview': [
                {
                    'law_id': x.get('law_id'),
                    'article': x.get('article'),
                    'paragraph': x.get('paragraph', ''),
                    'item': x.get('item', ''),
                    'source': x.get('source'),
                    'raw': x.get('raw_ref', ''),
                    'follow': x.get('follow', False),
                    'priority': x.get('priority', 0),
                    'reason': x.get('decision_reason', ''),
                    'ref_docs_preview': x.get('ref_docs_preview', []),
                }
                for x in reviews
            ],
            'llm_expand': any(x.get('follow') for x in reviews),
            'llm_priority': max([int(x.get('priority', 0)) for x in reviews] + [0]),
            'llm_reason': 'ref별 본문+항/호 기반 판단 적용',
            # The llm_* debug fields below are fixed placeholders in this helper.
            'llm_raw_text': '',
            'llm_parsed_obj': {},
            'llm_parse_error': '',
            'llm_prompt_preview': '',
            'proposed_ref_cnt': len(proposed),
            'proposed_refs': proposed,
        })
    return rows



In [None]:
def show_expand_detail(rows: list[dict], idx: int = 0):
    """Print the expansion-decision fields of ``rows[idx]``, one ``name: value`` line each.

    Prints ``rows is empty`` and returns when ``rows`` is empty. Fields missing
    from the row are printed as ``None``.
    """
    if not rows:
        print('rows is empty')
        return

    row = rows[idx]
    field_names = (
        'chunk_key',
        'target',
        'rule_expand',
        'rule_reasons',
        'rule_cue_matches',
        'ref_candidates_count',
        'ref_candidates_preview',
        'llm_expand',
        'llm_priority',
        'llm_reason',
        'llm_parse_error',
        'llm_raw_text',
        'proposed_ref_cnt',
        'proposed_refs',
    )
    for field in field_names:
        print(f'{field}:', row.get(field))


## Step F-2. 초세부 디버깅 (청크 1개 단위)


In [None]:
def pick_doc_for_debug(target_hits: dict[str, list[dict]], target: str, idx: int = 0) -> dict:
    """Return the ``idx``-th hit of ``target``, clamping ``idx`` into the valid range.

    An unknown target or an empty hit list yields ``{}``.
    """
    candidates = (target_hits or {}).get(target, [])
    if not candidates:
        return {}
    last_index = len(candidates) - 1
    clamped = min(max(idx, 0), last_index)
    return candidates[clamped]


### 디버깅 실행 예시 2: chunk별 ref 확장 판단 확인


## Step G. Graph Node 함수


In [None]:
def node_extract_targets(state: GraphState, runtime) -> GraphState:
    """Graph node: extract targets from the user query and reset the hop counters.

    Stores the extracted targets, sets ``hop_count`` to 0 and keeps an existing
    ``max_hops`` (defaulting to 3).
    """
    extractor_llm = get_runtime_llm(runtime, 'target_extractor')
    state['targets'] = extract_targets_with_llm(extractor_llm, state.get('user_query', ''))
    state.update(hop_count=0, max_hops=state.get('max_hops', 3))
    return state


In [None]:
def node_retrieve_multi_agent(state: GraphState, runtime) -> GraphState:
    """Graph node: run the multi-target retrieval and copy its results into the state.

    ``hits`` keeps the merged contexts as returned, while ``contexts`` and
    ``pending_refs`` receive fresh list copies; ``seen_ref_keys`` is reset.
    """
    fan_in = run_multi_target_agents(
        runtime,
        state.get('user_query', ''),
        state.get('targets', []),
        k=6,
    )
    merged_contexts = fan_in['contexts']
    state.update({
        'target_hits': fan_in['target_hits'],
        'target_decisions': fan_in.get('target_decisions', {}),
        'target_retrieve_debug': fan_in.get('target_retrieve_debug', {}),
        'hits': merged_contexts,
        'contexts': list(merged_contexts),
        'pending_refs': list(fan_in['pending_refs']),
        'seen_ref_keys': [],
    })
    return state


In [None]:
def node_reference_tracker(state: GraphState, runtime) -> GraphState:
    """Graph node: perform one reference-expansion hop.

    Flow:
      1. Return immediately when no refs are pending.
      2. Ask the LLM whether refs are needed at all; if not, clear the queue,
         log a ``skip_ref`` entry and count the hop.
      3. Check up to 12 not-yet-seen pending refs: fetch their documents and
         let the LLM decide whether to follow each one.
      4. Let the batch planner choose how many of the ranked refs to expand,
         fetch those, merge their documents into ``contexts`` and queue the
         new refs proposed from the fetched documents.
      5. Update ``contexts``, ``pending_refs``, ``seen_ref_keys``,
         ``hop_count`` and append an entry to ``ref_batch_decisions``.

    Args:
        state: Graph state; read and mutated in place.
        runtime: Runtime object passed to the retrieval and LLM helpers.

    Returns:
        The same ``state`` object.
    """
    def ref_key_of(ref):
        # Canonical "law_id:article" key; law-level refs use the '__law__' marker.
        return f"{ref.get('law_id')}:{ref.get('article') or '__law__'}"

    pending = list(state.get('pending_refs', []))
    if not pending:
        return state

    try:
        need_ref, need_reason = llm_need_refs_for_state_context(runtime, state)
    except Exception as e:
        # Best effort: on LLM failure assume refs are needed and keep going.
        need_ref, need_reason = True, f'llm_error:{e}'

    if not need_ref:
        state['pending_refs'] = []
        state['seen_ref_keys'] = list(state.get('seen_ref_keys', []))
        logs = list(state.get('ref_batch_decisions', []))
        logs.append({
            'hop': int(state.get('hop_count', 0)),
            'expand_count': 0,
            'batch_reason': f'skip_ref: {need_reason}',
            'picked_refs': [],
            'checked_refs': [],
            'fetched_chunks': [],
            'llm_raw_text': '',
            'llm_parsed_obj': {},
            'llm_parse_error': '',
        })
        state['ref_batch_decisions'] = logs
        state['hop_count'] = int(state.get('hop_count', 0)) + 1
        return state

    seen_ref_keys = list(state.get('seen_ref_keys', []))
    seen_set = set(seen_ref_keys)

    candidates = [r for r in pending if ref_key_of(r) not in seen_set]

    if not candidates:
        # Every queued ref was already expanded. Keeping them queued would make
        # the router re-enter this node (and re-query the LLM) on every hop
        # until max_hops without doing any work, so the queue is cleared.
        state['pending_refs'] = []
        state['seen_ref_keys'] = list(seen_set)
        state['hop_count'] = int(state.get('hop_count', 0)) + 1
        return state

    hop_count = int(state.get('hop_count', 0))
    max_hops = int(state.get('max_hops', 3))
    query_text = state.get('user_query', '')
    target_hint = ', '.join(state.get('targets', [])) or '일반'

    checked = []
    for r in candidates[:12]:
        # `or ''` keeps a None law_id/article from becoming the string 'None'.
        law_id = str(r.get('law_id') or '').strip()
        article = str(r.get('article') or '').strip()
        ref_key = f"{law_id}:{article or '__law__'}"
        docs = get_ref_docs_for_candidate(runtime, law_id=law_id, article=article, user_query=query_text, target=target_hint)
        ref_preview = build_ref_docs_preview(docs, ref_key=ref_key)
        if not ref_preview:
            checked.append({**r, 'follow': False, 'priority': 0, 'check_reason': 'ref_docs_not_found', 'ref_docs_preview': []})
            continue
        try:
            follow, pri, why = llm_should_follow_ref_by_content(
                get_runtime_llm(runtime, 'ref_expander'),
                user_query=query_text,
                target=target_hint,
                parent_chunk_preview='',
                ref_key=ref_key,
                ref_docs_preview=ref_preview,
            )
        except Exception as e:
            follow, pri, why = False, 0, f'llm_error:{e}'
        checked.append({**r, 'follow': bool(follow), 'priority': int(pri or 0), 'check_reason': why, 'ref_docs_preview': ref_preview})

    # Followed refs first, then by priority; this ranked list is what the
    # batch planner sees.
    checked.sort(key=lambda x: (1 if x.get('follow') else 0, int(x.get('priority',0))), reverse=True)
    candidates = checked

    try:
        (expand_count, batch_reason), batch_debug = llm_select_ref_batch_size(
            get_runtime_llm(runtime, 'ref_batch_planner'),
            user_query=query_text,
            targets=state.get('targets', []),
            hop_count=hop_count,
            max_hops=max_hops,
            pending_refs=candidates,
            max_batch=4,
            return_debug=True,
        )
    except Exception as e:
        expand_count, batch_reason = 1, f'llm_error:{e}'
        batch_debug = {'raw_text': '', 'parsed_obj': {}, 'parse_error': str(e), 'prompt_preview': ''}

    # NOTE(review): at least one ref is always picked, even when none was
    # flagged `follow` — presumably to guarantee queue progress; confirm intent.
    picked = candidates[:max(1, min(expand_count, len(candidates)))]
    picked_keys = [ref_key_of(r) for r in picked]

    # Remove exactly as many queue entries per (law_id, article) as were picked.
    picked_counter = {}
    for r in picked:
        rk = (str(r.get('law_id')), str(r.get('article')))
        picked_counter[rk] = picked_counter.get(rk, 0) + 1

    remaining = []
    for r in pending:
        rk = (str(r.get('law_id')), str(r.get('article')))
        if picked_counter.get(rk, 0) > 0:
            picked_counter[rk] -= 1
            continue
        remaining.append(r)
    pending = remaining

    add_refs = []
    merged_contexts = list(state.get('contexts', []))
    fetched_chunks = []

    for ref in picked:
        ref_key = ref_key_of(ref)
        seen_set.add(ref_key)

        law_id = str(ref.get('law_id') or '').strip()
        article = str(ref.get('article') or '').strip()
        if not law_id:
            fetched_chunks.append({'ref_key': ref_key, 'fetched_count': 0, 'fetched_keys': [], 'fetched_previews': [], 'note': 'invalid law_id'})
            continue

        docs = get_ref_docs_for_candidate(runtime, law_id=law_id, article=article, user_query=query_text, target=target_hint)

        row = {
            'ref_key': ref_key,
            'fetched_count': len(docs or []),
            'fetched_keys': [],
            'fetched_previews': [],
            'source': ref.get('source', ''),
            'priority': ref.get('priority', 0),
            'decision_reason': str(ref.get('decision_reason', ''))[:200],
        }

        if not docs:
            fetched_chunks.append(row)
            continue

        for d in docs:
            m = d.get('metadata', {}) or {}
            ckey = f"{m.get('law_id')}:{m.get('article_num')}:{m.get('article_sub', '0')}"
            row['fetched_keys'].append(ckey)
            row['fetched_previews'].append((d.get('content') or '').replace('\n', ' ')[:180])

        fetched_chunks.append(row)
        merged_contexts = dedup_contexts(merged_contexts + list(docs))

        # Refs found inside the fetched documents feed the next hop.
        for d in docs:
            add_refs.extend(
                propose_refs_from_chunk(
                    runtime,
                    user_query=query_text,
                    target=target_hint,
                    doc=d,
                    max_refs_per_chunk=6,
                )
            )

    # De-duplicate new proposals against the queue and everything already
    # expanded, using the same key format as `seen_set` so law-level
    # ('__law__') refs and non-string ids match as well.
    exist = {ref_key_of(x) for x in pending} | seen_set
    for r in add_refs:
        rk = ref_key_of(r)
        if rk in exist:
            continue
        pending.append(r)
        exist.add(rk)

    pending.sort(key=lambda x: int(x.get('priority', 0)), reverse=True)

    state['contexts'] = merged_contexts
    state['pending_refs'] = pending
    state['seen_ref_keys'] = list(seen_set)
    state['hop_count'] = hop_count + 1

    logs = list(state.get('ref_batch_decisions', []))
    logs.append({
        'hop': hop_count,
        'expand_count': len(picked),
        'batch_reason': batch_reason,
        'picked_refs': picked_keys,
        'checked_refs': [
            {
                'ref_key': ref_key_of(x),
                'follow': x.get('follow', False),
                'priority': x.get('priority', 0),
                'check_reason': x.get('check_reason', ''),
                'source': x.get('source', ''),
                'ref_docs_preview': x.get('ref_docs_preview', []),
            }
            for x in checked
        ],
        'fetched_chunks': fetched_chunks,
        'llm_raw_text': batch_debug.get('raw_text', ''),
        'llm_parsed_obj': batch_debug.get('parsed_obj', {}),
        'llm_parse_error': batch_debug.get('parse_error', ''),
        'need_ref_reason': need_reason,
    })
    state['ref_batch_decisions'] = logs

    return state



In [None]:
def route_ref_loop(state: GraphState) -> str:
    """Route back to 'reference_tracker' while refs are pending and hops remain, else to 'appendix'."""
    if not state.get('pending_refs'):
        return 'appendix'
    hops_done = state.get('hop_count', 0)
    hop_limit = state.get('max_hops', 3)
    return 'reference_tracker' if hops_done < hop_limit else 'appendix'


In [None]:
def node_appendix(state: GraphState, runtime) -> GraphState:
    """Graph node: attach appendix terms matching the extracted targets.

    For each target the first three matches from ``lookup_appendix1_term`` are
    taken; terms are kept once per (category, subcategory) pair, in encounter
    order, and stored under ``state['appendix']``.
    """
    picked_terms = []
    used_keys = set()
    for target in state.get('targets', []):
        top_matches = lookup_appendix1_term(target, runtime['terms'])[:3]
        for term in top_matches:
            term_key = (term.get('category'), term.get('subcategory'))
            if term_key not in used_keys:
                used_keys.add(term_key)
                picked_terms.append(term)
    state['appendix'] = picked_terms
    return state


In [None]:
def node_generate(state: GraphState, runtime) -> GraphState:
    """Graph node: generate the final answer from the collected contexts.

    Builds an evidence list from at most the first 14 contexts (metadata plus
    a 420-character excerpt), prompts the 'answer_generator' LLM and stores the
    reply text under ``state['answer']``.

    Args:
        state: Graph state; ``contexts``, ``target_hits``, ``user_query`` and
            ``appendix`` are read.
        runtime: Runtime object used to resolve the LLM.

    Returns:
        The same ``state`` object with ``answer`` set.
    """
    contexts = state.get('contexts', []) or []
    target_hits = {k: len(v) for k, v in (state.get('target_hits', {}) or {}).items()}
    user_query = state.get('user_query', '')

    # Pass excerpts of the source text along so the answer can be grounded in
    # the conditions of the query.
    evidence = []
    for c in contexts[:14]:
        # `metadata` may be present but None; guard it as the other helpers do.
        m = c.get('metadata', {}) or {}
        txt = (c.get('content') or '').replace('\n', ' ').strip()
        evidence.append({
            'law_name': m.get('law_name'),
            'law_id': m.get('law_id'),
            'article_num': m.get('article_num'),
            'article_sub': m.get('article_sub', '0'),
            'article_title': m.get('article_title'),
            'excerpt': txt[:420],
        })

    prompt = f"""
당신은 건축법률 답변 생성기다.
반드시 사용자 질문(query)의 조건을 기준으로 답하라.

출력 규칙:
1) 먼저 query에서 조건을 명시적으로 정리한다.
2) 근거 조항을 최소 1개 제시하고, 각 조항에 대해 근거 excerpt를 함께 인용한다.
3) 수치/거리(예: 3m)를 말할 때는 반드시 evidence excerpt에 같은 값이 있어야 한다.
4) evidence에 없는 수치/조건은 추정하지 말고 '자료상 확인 불가'로 표시한다.
5) 마지막에 '추가 필요 입력값'을 bullet로 제시한다.
6) 한국어로 답변한다.

query: {user_query}
target_hits_count: {target_hits}
appendix: {state.get('appendix', [])}
evidence: {evidence}
""".strip()

    ans = get_runtime_llm(runtime, 'answer_generator').invoke(prompt)
    # Fall back to the string form when the reply has no `content` attribute.
    state['answer'] = getattr(ans, 'content', str(ans))
    return state



## Step H. Graph 조립/실행


In [None]:
def build_graph_rag(runtime):
    """Assemble and compile the RAG state graph bound to ``runtime``.

    Pipeline: extract_targets -> retrieve_multi_agent -> reference_tracker
    (looping on itself while ``route_ref_loop`` says so) -> appendix ->
    generate -> END.

    Raises:
        ImportError: when ``StateGraph`` is ``None`` (langgraph import failed).
    """
    if StateGraph is None:
        raise ImportError('langgraph가 필요합니다. conda run -n natna pip install langgraph')

    graph = StateGraph(GraphState)

    # Each node is wrapped so the graph can call it with the state alone.
    graph.add_node('extract_targets', lambda s: node_extract_targets(s, runtime))
    graph.add_node('retrieve_multi_agent', lambda s: node_retrieve_multi_agent(s, runtime))
    graph.add_node('reference_tracker', lambda s: node_reference_tracker(s, runtime))
    graph.add_node('appendix', lambda s: node_appendix(s, runtime))
    graph.add_node('generate', lambda s: node_generate(s, runtime))

    graph.set_entry_point('extract_targets')
    for upstream, downstream in (
        ('extract_targets', 'retrieve_multi_agent'),
        ('retrieve_multi_agent', 'reference_tracker'),
    ):
        graph.add_edge(upstream, downstream)

    # The tracker either loops on itself or hands over to the appendix step.
    loop_routes = {
        'reference_tracker': 'reference_tracker',
        'appendix': 'appendix',
    }
    graph.add_conditional_edges('reference_tracker', route_ref_loop, loop_routes)

    for upstream, downstream in (
        ('appendix', 'generate'),
        ('generate', END),
    ):
        graph.add_edge(upstream, downstream)

    return graph.compile()


## Step H-1. 수동 실행 모드 (노드별 한 단계씩)


In [None]:
def run_graph_once(user_query: str, max_hops: int = 3):
    """Initialise a runtime, build the graph, run one query and return a result summary.

    The summary holds the extracted targets, per-target hit counts, number of
    contexts, appendix terms, the answer, the contexts themselves, leftover
    pending refs, seen ref keys and the hop count.
    """
    compiled = build_graph_rag(init_runtime())
    final_state = compiled.invoke({'user_query': user_query, 'max_hops': max_hops})

    hits_by_target = final_state.get('target_hits', {}) or {}
    hit_counts = {name: len(chunks) for name, chunks in hits_by_target.items()}

    summary = {
        'targets': final_state.get('targets', []),
        'target_hits_count': hit_counts,
        'retrieved': len(final_state.get('contexts', [])),
        'appendix': final_state.get('appendix', []),
        'answer': final_state.get('answer', ''),
        'contexts': final_state.get('contexts', []),
        'pending_refs': final_state.get('pending_refs', []),
        'seen_ref_keys': final_state.get('seen_ref_keys', []),
        'hop_count': final_state.get('hop_count', 0),
    }
    return summary


In [None]:
def run_hop_debug(runtime, user_query: str, targets: list[str], max_hops: int = 5, k: int = 6) -> dict:
    """Run the reference tracker hop by hop outside the graph and log each hop.

    Args:
        runtime: Runtime object passed to the retrieval and tracker helpers.
        user_query: The user's question.
        targets: Target labels for the initial retrieval.
        max_hops: Maximum number of tracker invocations.
        k: Number of hits requested per target in the initial retrieval.

    Returns:
        dict with the final ``state`` and ``hop_logs``. Each hop log records
        queue and context sizes before and after the hop. ``picked_ref`` is the
        head of the queue before the hop; ``picked_refs`` and ``batch_reason``
        are what the tracker actually logged for that hop (empty when it
        logged nothing).
    """
    # Initial fan-out / fan-in retrieval.
    init = run_multi_target_agents(runtime, user_query=user_query, targets=targets, k=k)
    state = {
        'user_query': user_query,
        'targets': targets,
        'target_hits': init['target_hits'],
        'hits': init['contexts'],
        'contexts': list(init['contexts']),
        'pending_refs': list(init['pending_refs']),
        'seen_ref_keys': [],
        'hop_count': 0,
        'max_hops': max_hops,
    }

    hop_logs = []
    for hop in range(max_hops):
        before_pending = len(state.get('pending_refs', []))
        before_ctx = len(state.get('contexts', []))

        if before_pending == 0:
            hop_logs.append({
                'hop': hop,
                'before_pending': before_pending,
                'after_pending': before_pending,
                'before_contexts': before_ctx,
                'after_contexts': before_ctx,
                'picked_ref': None,
                'note': 'no pending refs, stop',
            })
            break

        # Head of the queue before the hop. The tracker re-ranks candidates and
        # may expand several refs, so this is only a hint; the refs actually
        # expanded are read from the tracker's own batch log below.
        next_ref = state['pending_refs'][0]
        picked_key = f"{next_ref.get('law_id')}:{next_ref.get('article')}"
        batches_before = len(state.get('ref_batch_decisions', []) or [])

        state = node_reference_tracker(state, runtime)

        after_pending = len(state.get('pending_refs', []))
        after_ctx = len(state.get('contexts', []))

        # The tracker does not append a batch entry on every path, so only
        # trust the last entry when the log actually grew during this hop.
        batch_logs = state.get('ref_batch_decisions', []) or []
        latest_batch = batch_logs[-1] if len(batch_logs) > batches_before else {}

        hop_logs.append({
            'hop': hop,
            'before_pending': before_pending,
            'after_pending': after_pending,
            'before_contexts': before_ctx,
            'after_contexts': after_ctx,
            'picked_ref': picked_key,
            'picked_refs': list(latest_batch.get('picked_refs', []) or []),
            'batch_reason': latest_batch.get('batch_reason', ''),
            'new_contexts': after_ctx - before_ctx,
        })

    return {
        'state': state,
        'hop_logs': hop_logs,
    }


### 디버깅 실행 예시 3: hop별로 ref를 어떻게 타는지 확인


In [None]:
# 인터랙티브 reference 그래프 (PyVis, notebook inline srcdoc)
import hashlib
import html as _html


def build_reference_graph_data(contexts):
    """Turn retrieved contexts into nodes and edges of a reference graph.

    Every context becomes an article node, plus one node per entry of its
    ``paragraphs`` metadata. Its ``internal_refs`` and ``external_refs``
    metadata become edges; targets that are not among the contexts get
    placeholder nodes (group ``unfetched``), and references to laws outside
    the two in-scope laws get ``external:`` nodes.

    Args:
        contexts: Iterable of dicts with ``metadata`` and ``content``; items
            that are not dicts are treated as having neither.

    Returns:
        ``(nodes, edges)``: ``nodes`` maps node id to a dict with ``id``,
        ``label``, ``title`` and ``group``; ``edges`` is a de-duplicated list
        of ``(source_id, target_id, edge_type)`` tuples.
    """
    nodes = {}
    edges = []

    # Laws whose articles are addressable by chunk key in this graph.
    in_scope = {
        '건축법': '001823',
        '건축법 시행령': '002118',
    }
    law_id_to_name = {v: k for k, v in in_scope.items()}

    circled_to_num = {
        '①': '1', '②': '2', '③': '3', '④': '4', '⑤': '5',
        '⑥': '6', '⑦': '7', '⑧': '8', '⑨': '9', '⑩': '10',
        '⑪': '11', '⑫': '12', '⑬': '13', '⑭': '14', '⑮': '15',
        '⑯': '16', '⑰': '17', '⑱': '18', '⑲': '19', '⑳': '20',
    }

    def norm_para(v: str) -> str:
        # Normalise a paragraph marker ("제3항", "③", "3") to its digits; '' if none.
        t = str(v or '').strip()
        if not t:
            return ''
        m = re.search(r'제\s*(\d+)\s*항', t)
        if m:
            return m.group(1)
        if t in circled_to_num:
            return circled_to_num[t]
        m = re.search(r'(\d+)', t)
        return m.group(1) if m else ''

    def safe_meta(item):
        if isinstance(item, dict):
            return item.get('metadata', {}) or {}
        return {}

    def safe_content(item):
        if isinstance(item, dict):
            return str(item.get('content', ''))
        return ''

    def add_article_placeholder_if_missing(chunk_key: str):
        # Create an "(미조회)" node for an article that is referenced but not in contexts.
        if chunk_key in nodes:
            return
        parts = chunk_key.split(':')
        if len(parts) < 3:
            return
        lid, anum, asub = parts[0], parts[1], parts[2]
        lname = law_id_to_name.get(lid, lid)
        sub_txt = f"의{asub}" if asub not in ['', '0'] else ''
        nodes[chunk_key] = {
            'id': chunk_key,
            'label': f"{lname}\n제{anum}조{sub_txt}\n(미조회)",
            'title': f"[{chunk_key}] {lname} 제{anum}조{sub_txt} - 아직 contexts에 없음",
            'group': 'unfetched',
        }

    for idx, item in enumerate(contexts):
        meta = safe_meta(item)
        content = safe_content(item)

        # Zero-pad only a non-empty id: ''.zfill(6) would give '000000', which
        # is truthy and would defeat the `if law_id ...` checks below.
        raw_law_id = str(meta.get('law_id', '') or '').strip()
        law_id = raw_law_id.zfill(6) if raw_law_id else ''
        article_num = str(meta.get('article_num', '') or '')
        article_sub = str(meta.get('article_sub', '') or '0')

        if law_id and article_num:
            article_key = f"{law_id}:{article_num}:{article_sub if article_sub else '0'}"
        else:
            # No usable law/article identity: fall back to a positional key.
            article_key = f"doc:{idx}"

        law_name = str(meta.get('law_name', law_id_to_name.get(law_id, 'unknown')))
        title = str(meta.get('article_title', ''))
        sub_txt = f"의{article_sub}" if article_sub not in ['', '0'] else ''

        paragraphs = meta.get('paragraphs', []) or []
        para_nodes = []
        para_content_by_num = {}

        if isinstance(paragraphs, list) and paragraphs:
            for p in paragraphs:
                if not isinstance(p, dict):
                    continue
                pnum = norm_para(p.get('num', ''))
                pcontent = str(p.get('content', '') or '').strip()
                if not pnum:
                    continue
                nid = f"{article_key}:p{pnum}"
                para_nodes.append((pnum, nid))
                para_content_by_num[pnum] = pcontent
                preview = pcontent.replace('\n', ' ')[:220]
                nodes[nid] = {
                    'id': nid,
                    'label': f"{law_name}\n제{article_num}조{sub_txt} 제{pnum}항",
                    'title': f"[{nid}] {law_name} 제{article_num}조{sub_txt} 제{pnum}항 {title}\n{preview}",
                    'group': f"{law_name}-paragraph",
                }

            preview = content[:220].replace('\n', ' ')
            nodes[article_key] = {
                'id': article_key,
                'label': f"{law_name}\n제{article_num}조{sub_txt}",
                'title': f"[{article_key}] {law_name} 제{article_num}조{sub_txt} {title}\n{preview}",
                'group': law_name,
            }
            for _, nid in para_nodes:
                edges.append((article_key, nid, 'contains-paragraph'))
        else:
            preview = content[:220].replace('\n', ' ')
            nodes[article_key] = {
                'id': article_key,
                'label': f"{law_name}\n제{article_num}조{sub_txt}" if article_num else article_key,
                'title': f"[{article_key}] {law_name} 제{article_num}조{sub_txt} {title}\n{preview}",
                'group': law_name,
            }

        def source_node_for_ref(r: dict) -> str:
            # Pick the most specific source node: the explicit paragraph, else the
            # paragraph whose text contains the raw ref, else the first paragraph,
            # else the article itself.
            rp = norm_para(r.get('paragraph', ''))
            if rp and f"{article_key}:p{rp}" in nodes:
                return f"{article_key}:p{rp}"

            raw = str(r.get('raw', '') or '').strip()
            if raw and para_nodes:
                for pnum, nid in para_nodes:
                    ptxt = para_content_by_num.get(pnum, '')
                    if ptxt and raw in ptxt:
                        return nid

            if para_nodes:
                return para_nodes[0][1]
            return article_key

        def target_node_key_for_ref_law_article(ref_law_id: str, ref_article: str, ref_paragraph: str = '') -> str:
            # Chunk key of the referenced article, with a ":p<n>" suffix when a
            # paragraph is given.
            t_main, t_sub = parse_article_token(ref_article)
            base = make_chunk_key(ref_law_id, t_main or ref_article, t_sub)
            rp = norm_para(ref_paragraph)
            return f"{base}:p{rp}" if rp else base

        for r in meta.get('internal_refs', []) or []:
            if not isinstance(r, dict):
                continue
            tgt_article = str(r.get('article', '') or '')
            if not (tgt_article and law_id):
                continue
            src = source_node_for_ref(r)
            tgt = target_node_key_for_ref_law_article(law_id, tgt_article, str(r.get('paragraph', '') or ''))
            edges.append((src, tgt, 'internal'))

        for r in meta.get('external_refs', []) or []:
            if not isinstance(r, dict):
                continue
            lname = str(r.get('law_name', '') or '')
            art = str(r.get('article', '') or '')
            mapped = in_scope.get(lname)
            src = source_node_for_ref(r)
            if mapped and art:
                tgt = target_node_key_for_ref_law_article(mapped, art, str(r.get('paragraph', '') or ''))
                edges.append((src, tgt, 'external-in-scope'))
            else:
                tgt = f"external:{lname}:{art}"
                if tgt not in nodes:
                    nodes[tgt] = {
                        'id': tgt,
                        'label': f"외부\n{lname} 제{art}조",
                        'title': f"external ref: {lname} 제{art}",
                        'group': 'external',
                    }
                edges.append((src, tgt, 'external-out-of-scope'))

    # Add placeholder nodes for edge targets that never appeared in contexts.
    # Iterate over a copy because containment edges are appended on the way.
    for src, tgt, etype in list(edges):
        if tgt in nodes or tgt.startswith('external:'):
            continue
        if ':p' in tgt:
            base, ptag = tgt.rsplit(':p', 1)
            add_article_placeholder_if_missing(base)
            parts = base.split(':')
            lid = parts[0] if len(parts) > 0 else ''
            anum = parts[1] if len(parts) > 1 else '?'
            asub = parts[2] if len(parts) > 2 else '0'
            lname = law_id_to_name.get(lid, lid)
            sub_txt = f"의{asub}" if asub not in ['', '0'] else ''
            nodes[tgt] = {
                'id': tgt,
                'label': f"{lname}\n제{anum}조{sub_txt} 제{ptag}항\n(미조회)",
                'title': f"[{tgt}] {lname} 제{anum}조{sub_txt} 제{ptag}항 - 아직 contexts에 없음",
                'group': 'unfetched',
            }
            edges.append((base, tgt, 'contains-paragraph'))
        elif ':' in tgt:
            add_article_placeholder_if_missing(tgt)

    # Drop duplicate edges while keeping first-seen order.
    edge_seen = set()
    uniq_edges = []
    for e in edges:
        if e in edge_seen:
            continue
        edge_seen.add(e)
        uniq_edges.append(e)

    return nodes, uniq_edges


def render_reference_graph(contexts):
    """Draw the reference graph of ``contexts`` as an interactive pyvis network.

    Node/edge data comes from ``build_reference_graph_data``. The generated
    HTML is shown inline through an ``<iframe srcdoc=...>`` element.

    Args:
        contexts: Context chunks, forwarded unchanged to
            ``build_reference_graph_data``.

    Returns:
        The generated HTML document as a string, or ``None`` when the graph
        has no nodes.

    Raises:
        ImportError: If pyvis or IPython cannot be imported.
    """
    try:
        from pyvis.network import Network
        from IPython.display import HTML, display
    except Exception as exc:
        # Chain the original failure so the real import problem stays visible.
        raise ImportError('pyvis가 필요합니다. conda run -n natna pip install pyvis') from exc

    nodes, edges = build_reference_graph_data(contexts)
    print(f'graph data -> nodes: {len(nodes)}, edges: {len(edges)}')

    if not nodes:
        print('[warn] contexts가 비어 있어 그래프를 그릴 노드가 없습니다.')
        return None

    net = Network(height='760px', width='100%', directed=True, notebook=True, cdn_resources='in_line')
    net.force_atlas_2based(gravity=-50, central_gravity=0.01, spring_length=120, spring_strength=0.08)

    for n in nodes.values():
        net.add_node(n['id'], label=n['label'], title=n['title'], group=n['group'])

    color_by_type = {
        'internal': '#2E86DE',
        'external-in-scope': '#16A085',
        'external-out-of-scope': '#7F8C8D',
        'contains-paragraph': '#BDC3C7',
    }
    # Ids already registered on the network; grows as placeholder nodes are added.
    known_ids = set(nodes)
    for src, tgt, etype in edges:
        # pyvis rejects an edge whose endpoint is unknown, so register BOTH
        # endpoints (not only the target) as 'unknown' placeholders if needed.
        for endpoint in (src, tgt):
            if endpoint not in known_ids:
                net.add_node(endpoint, label=endpoint, title=endpoint, group='unknown')
                known_ids.add(endpoint)
        net.add_edge(
            src,
            tgt,
            title=etype,
            color=color_by_type.get(etype, '#999999'),
            # Out-of-scope external references are drawn dashed.
            dashes=(etype == 'external-out-of-scope'),
        )

    html_doc = net.generate_html(notebook=True)
    # The document is HTML-escaped because it is embedded inside an attribute value.
    iframe = f'<iframe style="width:100%;height:800px;border:1px solid #ddd;" srcdoc="{_html.escape(html_doc)}"></iframe>'
    display(HTML(iframe))
    return html_doc


In [None]:
# Visualization helper: per-stage snapshot (graph + contents + ref log)
def show_state_snapshot(state: dict, stage: str, context_limit: int = 12, show_graph: bool = True):
    """Print and display a debugging snapshot of the pipeline ``state``.

    Shows, in order: a one-line summary, ``target_retrieve_debug`` counts,
    the reference graph (optional), a table of the first ``context_limit``
    contexts, the first 10 pending refs, and the ``ref_batch_decisions``
    tables (all batches, plus checked/fetched details of the last batch).
    Tables are rendered with pandas when it can be imported and printed as
    plain Python objects otherwise.

    Args:
        state: Pipeline state dict; it is only read.
        stage: Label printed in the summary line.
        context_limit: Maximum number of contexts graphed and tabulated.
        show_graph: Whether to call ``render_reference_graph``.

    Returns:
        None.
    """
    from IPython.display import display
    try:
        import pandas as pd
    except Exception:
        pd = None

    contexts = list(state.get('contexts', []) or [])
    pending = list(state.get('pending_refs', []) or [])
    print(f'[{stage}] contexts={len(contexts)}, pending_refs={len(pending)}, hop_count={state.get("hop_count", 0)}')

    trd = state.get('target_retrieve_debug', {}) or {}
    if trd:
        print('target_retrieve_debug:')
        for k, v in trd.items():
            print(f"  - {k}: requested={v.get('requested_k')}, returned={v.get('returned_k')}")

    if show_graph and contexts:
        render_reference_graph(contexts[:context_limit])

    rows = []
    for c in contexts[:context_limit]:
        m = c.get('metadata', {}) or {}
        article_sub = str(m.get('article_sub', '0'))
        # Sub-articles ("제N조의M") get a suffix; '' and '0' mean "no sub-article".
        article_suffix = ('의' + article_sub) if article_sub not in ['', '0'] else ''
        _internal_keys = [
            f"{m.get('law_id')}:{str(r.get('article','')).strip()}"
            for r in (m.get('internal_refs', []) or []) if isinstance(r, dict) and str(r.get('article','')).strip()
        ]
        _external_keys = []
        for r in (m.get('external_refs', []) or []):
            if not isinstance(r, dict):
                continue
            _lname = str(r.get('law_name', '')).strip()
            _article = str(r.get('article', '')).strip()
            # Laws present in IN_SCOPE_LAW_NAME_TO_ID are keyed by the mapped id,
            # everything else is shown with an 'external:' prefix.
            _mapped = IN_SCOPE_LAW_NAME_TO_ID.get(_lname, '')
            if _mapped and _article:
                _external_keys.append(f"{_mapped}:{_article}")
            elif _lname or _article:
                _external_keys.append(f"external:{_lname}:{_article}")

        rows.append({
            'chunk_key': f"{m.get('law_id')}:{m.get('article_num')}:{m.get('article_sub', '0')}",
            'law_name': m.get('law_name'),
            'article': f"제{m.get('article_num')}조{article_suffix}",
            'title': m.get('article_title', ''),
            'content_preview': (c.get('content') or '').replace('\n', ' ')[:220],
            'internal_ref_cnt': len(_internal_keys),
            'internal_ref_keys': ', '.join(_internal_keys[:8]),
            'external_ref_cnt': len(_external_keys),
            'external_ref_keys': ', '.join(_external_keys[:8]),
        })

    if pd is not None and rows:
        display(pd.DataFrame(rows))
    else:
        print('context rows:', rows[:3])

    pref = []
    for r in pending[:10]:
        pref.append({
            'law_id': r.get('law_id'),
            'article': r.get('article'),
            'priority': r.get('priority'),
            'source': r.get('source'),
            'reason_preview': str(r.get('decision_reason', ''))[:120],
        })

    if pd is not None and pref:
        display(pd.DataFrame(pref))
    else:
        print('pending refs head:', pref)

    batch = list(state.get('ref_batch_decisions', []) or [])
    if batch:
        # Detail tables are shown for the most recent batch only.
        last = batch[-1]
        fetched = last.get('fetched_chunks', []) or []
        checked = last.get('checked_refs', []) or []
        if pd is not None:
            display(pd.DataFrame(batch))

            if checked:
                checked_rows = []
                for x in checked:
                    checked_rows.append({
                        'ref_key': x.get('ref_key'),
                        'follow': x.get('follow', False),
                        'priority': x.get('priority', 0),
                        'source': x.get('source', ''),
                        'check_reason': str(x.get('check_reason', ''))[:220],
                        'ref_preview': ' | '.join([(p.get('content_preview') or '')[:90] for p in (x.get('ref_docs_preview') or [])[:2]]),
                    })
                display(pd.DataFrame(checked_rows))

            if fetched:
                fetched_rows = []
                for x in fetched:
                    # `or []` (instead of a .get default) also covers keys that are
                    # present but None; str() keeps join() safe for non-string items.
                    fetched_keys = x.get('fetched_keys') or []
                    fetched_previews = x.get('fetched_previews') or []
                    fetched_rows.append({
                        'ref_key': x.get('ref_key'),
                        'fetched_count': x.get('fetched_count', 0),
                        'fetched_keys': ', '.join(str(k) for k in fetched_keys),
                        'fetched_preview': ' | '.join(str(p) for p in fetched_previews[:2]),
                        'source': x.get('source', ''),
                        'priority': x.get('priority', 0),
                    })
                display(pd.DataFrame(fetched_rows))
        else:
            print('batch decisions:', batch)
            print('checked refs:', checked)
            print('latest fetched:', fetched)




def build_ref_audit_rows(
    runtime,
    state: dict,
    target: str | None = None,
    hit_limit: int = 12,
    collection: str = 'building_law',
) -> list[dict]:
    """Flatten the expand decisions of one target into per-reference audit rows.

    Runs ``inspect_expand_decisions`` over the target's hits and emits one row
    per ``(source chunk, ref_key)`` pair; later duplicates of a pair are
    dropped. When a candidate has no preview text, the article is re-fetched
    with ``get_article`` as a best-effort fallback.

    Args:
        runtime: Runtime object; ``runtime['client']`` is used for the fallback.
        state: Pipeline state; reads ``targets``, ``target_hits``, ``user_query``.
        target: Target to audit. Defaults to the first entry of
            ``state['targets']`` (or ``'일반'`` when there is none).
        hit_limit: Passed as ``limit`` to ``inspect_expand_decisions``.
        collection: Collection name used by the fallback fetch.

    Returns:
        A list of row dicts in first-seen order.
    """
    tgt = target or ((state.get('targets') or ['일반'])[0])
    docs = (state.get('target_hits', {}) or {}).get(tgt, [])
    rows = inspect_expand_decisions(
        runtime,
        user_query=state.get('user_query', ''),
        target=tgt,
        docs=docs,
        limit=hit_limit,
    )

    out = []
    seen = set()
    for r in rows:
        from_chunk = r.get('chunk_key', '')
        proposed = {f"{x.get('law_id')}:{x.get('article')}": x for x in (r.get('proposed_refs') or [])}
        for c in (r.get('ref_candidates_preview') or []):
            key = f"{c.get('law_id')}:{c.get('article')}"

            # De-duplicate before any work so a repeated ref never triggers a
            # second fallback fetch; the first occurrence wins.
            dedup_key = (from_chunk, key)
            if dedup_key in seen:
                continue
            seen.add(dedup_key)

            p = proposed.get(key, {})
            selected = key in proposed

            previews = c.get('ref_docs_preview') or p.get('ref_docs_preview') or []
            preview_text = ' | '.join([(x.get('content_preview') or '')[:140] for x in previews[:2]])
            preview_keys = ', '.join([x.get('chunk_key', '') for x in previews[:2]])
            paragraph_match = any(bool(x.get('paragraph_match', False)) for x in previews)

            # Fallback: when the preview is empty, re-query the DB directly.
            if not preview_text.strip() and ':' in key:
                try:
                    _lid, _art = key.split(':', 1)
                    _docs = get_article(runtime['client'], collection, _lid, _art)
                    _pv = build_ref_docs_preview(
                        _docs,
                        paragraph=str(c.get('paragraph', '') or ''),
                        item=str(c.get('item', '') or ''),
                        ref_key=key,
                        max_docs=2,
                        max_chars=260,
                    )
                    if _pv:
                        preview_text = ' | '.join([(x.get('content_preview') or '')[:140] for x in _pv[:2]])
                        preview_keys = ', '.join([x.get('chunk_key', '') for x in _pv[:2]])
                        # Keep the flag consistent with the previews actually shown.
                        paragraph_match = any(bool(x.get('paragraph_match', False)) for x in _pv)
                except Exception as exc:
                    # Still best-effort, but report the failure: an unexplained
                    # empty preview is what this audit is meant to diagnose.
                    print(f'[warn] ref preview fallback failed for {key}: {exc}')

            out.append({
                'target': tgt,
                'from_chunk': from_chunk,
                'ref_key': key,
                'paragraph': c.get('paragraph', ''),
                'item': c.get('item', ''),
                'source': c.get('source', ''),
                'raw_ref': c.get('raw', ''),
                'selected': selected,
                'priority': p.get('priority', 0) if selected else 0,
                'llm_expand': r.get('llm_expand', None),
                'llm_priority': r.get('llm_priority', None),
                'reason': (p.get('decision_reason') or c.get('reason') or r.get('llm_reason') or '')[:220],
                'ref_preview_keys': preview_keys,
                'ref_preview': preview_text,
                'ref_preview_len': len(preview_text.strip()),
                'paragraph_match': paragraph_match,
            })

    return out




def display_ref_audit(runtime, state: dict, target: str | None = None, hit_limit: int = 12):
    """Render the reference audit rows as a styled table.

    Rows come from ``build_ref_audit_rows``. Selected refs are shown on a
    green background and the rest greyed out. If the styled rendering fails,
    the plain DataFrame is displayed instead.

    Args:
        runtime: Forwarded to ``build_ref_audit_rows``.
        state: Forwarded to ``build_ref_audit_rows``.
        target: Forwarded to ``build_ref_audit_rows``.
        hit_limit: Forwarded to ``build_ref_audit_rows``.

    Returns:
        The audit DataFrame (possibly empty), or the raw row list when pandas
        cannot be imported.
    """
    from IPython.display import display
    try:
        import pandas as pd
    except Exception:
        pd = None

    audit_rows = build_ref_audit_rows(runtime, state, target=target, hit_limit=hit_limit)
    if pd is None:
        # Without pandas only the row count is reported.
        print('rows:', len(audit_rows))
        return audit_rows

    frame = pd.DataFrame(audit_rows)
    if frame.empty:
        print('ref audit: empty')
        return frame

    # Fixed column order for display; columns not listed here are dropped.
    ordered_columns = [
        'target', 'from_chunk', 'ref_key', 'paragraph', 'item', 'source', 'raw_ref',
        'selected', 'priority', 'reason',
        'ref_preview_keys', 'ref_preview_len', 'paragraph_match', 'ref_preview',
    ]
    frame = frame[ordered_columns]

    selected_css = 'background-color: #e9f7ef'
    muted_css = 'background-color: #f0f0f0; color: #666'

    def _paint(row):
        css = selected_css if bool(row.get('selected')) else muted_css
        return [css] * len(row)

    try:
        display(frame.style.apply(_paint, axis=1))
    except Exception:
        display(frame)

    print('selected refs:', int(frame['selected'].sum()), '/', len(frame))
    return frame




def show_ref_fulltext(state: dict, ref_key: str, max_chars: int = 6000):
    """Print the text of every context chunk that matches ``ref_key``.

    ``ref_key`` is ``law_id:article`` or ``law_id:article:sub`` (for example
    ``'001823:58'`` or ``'001823:58:0'``). When a sub-number is given but no
    chunk carries it, every chunk of that article is shown instead.

    Args:
        state: Pipeline state; only ``state['contexts']`` is read.
        ref_key: Reference key to look up.
        max_chars: Maximum number of characters printed per chunk.

    Returns:
        None. All output goes to stdout.
    """
    pieces = str(ref_key).split(':')
    if len(pieces) < 2:
        print('invalid ref_key, expected law_id:article or law_id:article:sub')
        return

    law_id, article = pieces[0], pieces[1]
    sub = pieces[2] if len(pieces) > 2 else None

    def _meta(chunk):
        return chunk.get('metadata', {}) or {}

    # Every chunk of the requested article, regardless of sub-number.
    same_article = [
        chunk
        for chunk in (state.get('contexts') or [])
        if str(_meta(chunk).get('law_id', '')) == law_id
        and str(_meta(chunk).get('article_num', '')) == article
    ]

    if sub is None:
        matches = same_article
    else:
        exact = [
            chunk for chunk in same_article
            if str(_meta(chunk).get('article_sub', '0')) == str(sub)
        ]
        # Fall back to the key without the sub-number when nothing matched it.
        matches = exact or same_article

    if not matches:
        print(f'not found in contexts: {ref_key}')
        return

    for idx, chunk in enumerate(matches, 1):
        meta = _meta(chunk)
        chunk_key = f"{meta.get('law_id')}:{meta.get('article_num')}:{meta.get('article_sub', '0')}"
        print(f"\n[{idx}] {chunk_key} | {meta.get('law_name')} 제{meta.get('article_num')}조 | {meta.get('article_title','')}")
        print((chunk.get('content') or '')[:max_chars])



def debug_db_fetch_for_refs(runtime, ref_keys: list[str], collection: str = 'building_law', max_preview: int = 160):
    """Fetch each ref key from the DB and report what came back.

    Each key is split on ``':'`` into ``law_id`` and ``article`` and looked up
    with ``get_article``. Keys without a ``':'`` are reported as invalid and
    never fetched. The report is displayed as a DataFrame when pandas and
    IPython are importable, and printed otherwise.

    Args:
        runtime: Runtime object; ``runtime['client']`` is passed to ``get_article``.
        ref_keys: Keys of the form ``law_id:article[:...]``.
        collection: Collection name passed to ``get_article``.
        max_preview: Maximum preview length per document.

    Returns:
        One report dict per input key, in input order.
    """
    client = runtime['client']
    report = []

    for ref_key in ref_keys:
        pieces = str(ref_key).split(':')
        if len(pieces) < 2:
            report.append({'ref_key': ref_key, 'ok': False, 'count': 0, 'note': 'invalid ref_key'})
            continue

        docs = get_article(client, collection, pieces[0], pieces[1])

        text_previews = []
        metadata_key_heads = []
        # Only the first two documents are previewed.
        for doc in (docs or [])[:2]:
            if isinstance(doc, dict):
                meta = doc.get('metadata', {}) or {}
                # Text source priority: content -> page_content ->
                # metadata.content_original -> metadata.page_content
                body = (
                    doc.get('content')
                    or doc.get('page_content')
                    or meta.get('content_original', '')
                    or meta.get('page_content', '')
                    or ''
                )
            else:
                # Non-dict documents yield an empty preview and no metadata keys.
                meta = {}
                body = ''
            text_previews.append(str(body).replace('\n', ' ')[:max_preview])
            metadata_key_heads.append(sorted(list(meta.keys()))[:8])

        report.append({
            'ref_key': ref_key,
            'ok': bool(docs),
            'count': len(docs or []),
            'preview': ' | '.join(text_previews),
            'meta_keys_head': metadata_key_heads,
        })

    try:
        import pandas as pd
        from IPython.display import display
        display(pd.DataFrame(report))
    except Exception:
        print(report)
    return report



def precheck_refs_between_step1_step2(state: dict, runtime, k: int = 6, limit_per_target: int = 10):
    """Debug helper for between STEP 1 and STEP 2: pre-evaluate refs.

    Runs the initial multi-target retrieve, then ``inspect_expand_decisions``
    for every target, and collects each proposed ref once per
    ``(law_id, article)`` into a pending list sorted by descending priority.
    ``state`` is only read.

    Args:
        state: Pipeline state; reads ``user_query`` and ``targets``.
        runtime: Runtime object forwarded to the retrieve/inspect helpers.
        k: Passed as ``k`` to ``run_multi_target_agents``.
        limit_per_target: Passed as ``limit`` to ``inspect_expand_decisions``.

    Returns:
        A dict with ``target_hits``, ``target_decisions``, ``contexts``,
        ``pending_refs``, ``ref_reviews`` and ``targets``.
    """
    query = state.get('user_query', '')
    target_names = state.get('targets', []) or ['일반']

    retrieved = run_multi_target_agents(runtime, user_query=query, targets=target_names, k=k)

    reviews_by_target = {}
    queue = []
    visited = set()

    for name in target_names:
        review_rows = inspect_expand_decisions(
            runtime,
            user_query=query,
            target=name,
            docs=(retrieved.get('target_hits', {}) or {}).get(name, []),
            limit=limit_per_target,
        )
        reviews_by_target[name] = review_rows

        for review in review_rows:
            for ref in (review.get('proposed_refs') or []):
                # The first proposal for a (law_id, article) pair wins.
                ident = (str(ref.get('law_id', '')), str(ref.get('article', '')))
                if ident in visited:
                    continue
                visited.add(ident)

                reason = ref.get('decision_reason', '')
                queue.append({
                    'law_id': ref.get('law_id'),
                    'article': ref.get('article'),
                    'paragraph': ref.get('paragraph', ''),
                    'item': ref.get('item', ''),
                    'source': ref.get('source', ''),
                    'source_chunk_key': review.get('chunk_key', ''),
                    'priority': int(ref.get('priority', 0) or 0),
                    'decision_reason': reason,
                    'llm_reason': reason,
                    'ref_docs_preview': ref.get('ref_docs_preview', []),
                    'raw_ref': ref.get('raw_ref', ''),
                })

    # Highest priority first; ties keep discovery order (stable sort).
    queue = sorted(queue, key=lambda entry: int(entry.get('priority', 0)), reverse=True)

    return {
        'target_hits': retrieved.get('target_hits', {}),
        'target_decisions': retrieved.get('target_decisions', {}),
        'contexts': retrieved.get('contexts', []),
        'pending_refs': queue,
        'ref_reviews': reviews_by_target,
        'targets': target_names,
    }


def apply_precheck_to_state(state: dict, precheck: dict):
    """Copy a precheck result into ``state`` in place and return ``state``.

    ``hits``, ``contexts`` and ``pending_refs`` are stored as fresh list
    copies, ``seen_ref_keys`` is reset to an empty list, and ``targets`` keeps
    its current value when the precheck does not provide one.

    Args:
        state: Pipeline state to update (mutated).
        precheck: Result dict such as the one returned by
            ``precheck_refs_between_step1_step2``.

    Returns:
        The same ``state`` object.
    """
    precheck_contexts = precheck.get('contexts', [])
    state.update({
        'targets': precheck.get('targets', state.get('targets', [])),
        'target_hits': precheck.get('target_hits', {}),
        'target_decisions': precheck.get('target_decisions', {}),
        # Two independent copies so later edits to one do not leak into the other.
        'hits': list(precheck_contexts),
        'contexts': list(precheck_contexts),
        'pending_refs': list(precheck.get('pending_refs', [])),
        'seen_ref_keys': [],
        'precheck_ref_reviews': precheck.get('ref_reviews', {}),
    })
    return state



def prefetch_refs_between_step1_step2(state: dict, runtime, k: int = 6):
    """Run the initial retrieve and prefetch the documents behind each ref.

    For every retrieved hit, up to 12 ref candidates are taken from
    ``extract_ref_candidates``. Each distinct
    ``(law_id, article, paragraph, item, source chunk)`` combination is looked
    up once and stored with a short preview in ``state['prefetched_refs']``.

    Args:
        state: Pipeline state; reads ``user_query``/``targets`` and is updated
            in place.
        runtime: Runtime object forwarded to the retrieve/lookup helpers.
        k: Passed as ``k`` to ``run_multi_target_agents``.

    Returns:
        The same ``state`` object, with ``targets``, ``target_hits``,
        ``target_decisions``, ``hits``, ``contexts`` and ``prefetched_refs`` set.
    """
    query = state.get('user_query', '')
    target_names = state.get('targets', []) or ['일반']
    retrieved = run_multi_target_agents(runtime, user_query=query, targets=target_names, k=k)

    # "제N항" (paragraph) and "제N호" (item) markers inside a raw reference string.
    paragraph_pattern = re.compile(r'제\s*(\d+)\s*항')
    item_pattern = re.compile(r'제\s*(\d+)\s*호')

    def _captured(pattern, text: str) -> str:
        hit = pattern.search(text)
        return hit.group(1) if hit else ''

    collected = []
    visited = set()
    hits_by_target = retrieved.get('target_hits', {}) or {}

    for name in target_names:
        for doc in hits_by_target.get(name, []):
            meta = doc.get('metadata', {}) or {}
            origin_key = f"{meta.get('law_id')}:{meta.get('article_num')}:{meta.get('article_sub','0')}"

            for cand in extract_ref_candidates(meta)[:12]:
                law_id = str(cand.get('law_id','')).strip()
                article = str(cand.get('article','')).strip()
                paragraph = str(cand.get('paragraph','') or '').strip()
                item = str(cand.get('item','') or '').strip()
                raw_ref = str((cand.get('source_ref') or {}).get('raw','') or '')

                # Only when neither is known, recover both from the raw text.
                if raw_ref and not (paragraph or item):
                    paragraph = _captured(paragraph_pattern, raw_ref)
                    item = _captured(item_pattern, raw_ref)

                ident = (law_id, article, paragraph, item, origin_key)
                if ident in visited:
                    continue
                visited.add(ident)

                # A missing article is keyed with the '__law__' placeholder.
                ref_key = f"{law_id}:{article or '__law__'}"
                if law_id:
                    ref_docs = get_ref_docs_for_candidate(runtime, law_id, article, user_query=query, target=name)
                else:
                    ref_docs = []
                ref_preview = build_ref_docs_preview(ref_docs, paragraph=paragraph, item=item, ref_key=ref_key, max_docs=2, max_chars=260)

                collected.append({
                    'target': name,
                    'source_chunk_key': origin_key,
                    'law_id': law_id,
                    'article': article,
                    'paragraph': paragraph,
                    'item': item,
                    'source': cand.get('source',''),
                    'raw_ref': raw_ref,
                    'ref_docs_count': len(ref_docs or []),
                    'ref_docs_preview': ref_preview,
                })

    base_contexts = retrieved.get('contexts', [])
    state.update({
        'targets': target_names,
        'target_hits': retrieved.get('target_hits', {}),
        'target_decisions': retrieved.get('target_decisions', {}),
        'hits': list(base_contexts),
        'contexts': list(base_contexts),
        'prefetched_refs': collected,
    })
    return state



def llm_judge_prefetched_refs(state: dict, runtime):
    """Ask the LLM which prefetched refs should be followed.

    First ``llm_need_refs_for_state_context`` decides whether any ref is
    needed at all; if that call fails, refs are treated as needed. When refs
    are needed, each prefetched ref with a preview is judged by
    ``llm_should_follow_ref_by_content``. Refs without a preview, or whose
    judgement raises, are marked as not followed. Followed refs are queued
    once per ``(law_id, article)`` in descending priority order.

    Args:
        state: Pipeline state; reads ``user_query``, ``prefetched_refs`` and
            ``targets`` and is updated in place.
        runtime: Runtime object forwarded to the LLM helpers.

    Returns:
        The same ``state`` object with ``judged_refs``, ``pending_refs`` and
        ``seen_ref_keys`` (reset to ``[]``) set.
    """
    query = state.get('user_query', '')

    try:
        refs_needed, need_reason = llm_need_refs_for_state_context(runtime, state)
    except Exception as e:
        # A failing gate must not block ref expansion.
        refs_needed, need_reason = True, f'llm_error:{e}'

    candidates = state.get('prefetched_refs') or []

    if not refs_needed:
        state['judged_refs'] = [
            {**ref, 'follow': False, 'priority': 0, 'decision_reason': f'chunk_context_sufficient: {need_reason}'}
            for ref in candidates
        ]
        state['pending_refs'] = []
        state['seen_ref_keys'] = []
        return state

    verdicts = []
    queue = []
    queued_pairs = set()

    for ref in candidates:
        ref_key = f"{ref.get('law_id')}:{ref.get('article') or '__law__'}"
        preview = ref.get('ref_docs_preview', []) or []
        target = str(ref.get('target') or ((state.get('targets') or ['일반'])[0]))

        if preview:
            try:
                follow, pri, reason = llm_should_follow_ref_by_content(
                    get_runtime_llm(runtime, 'ref_expander'),
                    user_query=query,
                    target=target,
                    parent_chunk_preview='',
                    ref_key=ref_key,
                    ref_docs_preview=preview,
                )
            except Exception as e:
                follow, pri, reason = False, 0, f'llm_error:{e}'
        else:
            # Nothing was fetched for this ref, so there is nothing to judge.
            follow, pri, reason = False, 0, 'ref_docs_not_found'

        priority = int(pri or 0)
        reason_text = str(reason or '')
        verdicts.append({**ref, 'follow': bool(follow), 'priority': priority, 'decision_reason': reason_text})

        if not follow:
            continue
        pair = (str(ref.get('law_id','')), str(ref.get('article','')))
        if pair in queued_pairs:
            continue
        queued_pairs.add(pair)
        queue.append({
            'law_id': ref.get('law_id'),
            'article': ref.get('article'),
            'paragraph': ref.get('paragraph',''),
            'item': ref.get('item',''),
            'source': ref.get('source',''),
            'source_chunk_key': ref.get('source_chunk_key',''),
            'priority': priority,
            'decision_reason': reason_text,
            'llm_reason': reason_text,
            'ref_docs_preview': preview,
            'raw_ref': ref.get('raw_ref',''),
        })

    # Highest priority first; ties keep judgement order (stable sort).
    queue = sorted(queue, key=lambda entry: int(entry.get('priority', 0)), reverse=True)
    state['judged_refs'] = verdicts
    state['pending_refs'] = queue
    state['seen_ref_keys'] = []
    return state



In [None]:
# STEP 0) Initialize a single example query and the manual pipeline state
# NOTE(review): '제1종일반주고' in the query looks like a typo for '제1종일반주거' —
# confirm before changing; the string is stored verbatim as `user_query`.
example_query = '서울특별시 종로구 송현동 48-24번지, 49-4번지, 대지면적 9787m2, 연면적 25676m2, 층수 지하 2층 지상3층, 건폐율 60%이하, 용적률 150%이하, 면적표 25696m2, 높이 16m 이하, 도시지역, 제1종일반주고, 고도지구(16m) 건축선을 알려줘'
manual_state = {
    'user_query': example_query,
    'max_hops': 3,
}
manual_state


In [None]:
# STEP 1) Extract targets from the user query
manual_state = node_extract_targets(manual_state, runtime)
# Last expression of the cell: a compact summary of the state after extraction.
{
    'query': manual_state.get('user_query'),
    'targets': manual_state.get('targets', []),
    'hop_count': manual_state.get('hop_count'),
    'max_hops': manual_state.get('max_hops'),
}


In [None]:
# STEP 1.5-A) Prefetch refs (DB lookups only)
from pprint import pprint

manual_state = prefetch_refs_between_step1_step2(manual_state, runtime, k=5)
# A bare dict expression in the middle of a cell is evaluated and discarded
# (only a cell's last expression is shown), so print the summary explicitly.
pprint({
    'targets': manual_state.get('targets', []),
    'contexts_count': len(manual_state.get('contexts', [])),
    'prefetched_refs_count': len(manual_state.get('prefetched_refs', [])),
}, sort_dicts=False)

# Best-effort table of the prefetched refs plus a count of refs without preview.
try:
    import pandas as pd
    from IPython.display import display
    _pre = pd.DataFrame(manual_state.get('prefetched_refs', []))
    if not _pre.empty:
        cols = ['target','source_chunk_key','law_id','article','paragraph','item','source','raw_ref','ref_docs_count']
        display(_pre[cols])
        _pre['_pv_cnt'] = _pre['ref_docs_preview'].apply(lambda x: len(x or []))
        print('prefetch preview missing:', int((_pre['_pv_cnt']==0).sum()), '/', len(_pre))
except Exception as _e:
    print('prefetch render failed:', _e)


In [None]:
# Show the raw retrieved hits for the hard-coded '건축선' target.
# NOTE(review): plain indexing — presumably raises KeyError when that target was not extracted.
manual_state["target_hits"]["건축선"]

In [None]:
# STEP 1.5-B) LLM judgement over the prefetched refs
from pprint import pprint

manual_state = llm_judge_prefetched_refs(manual_state, runtime)
# A bare dict expression in the middle of a cell is evaluated and discarded
# (only a cell's last expression is shown), so print the summary explicitly.
pprint({
    'judged_refs_count': len(manual_state.get('judged_refs', [])),
    'pending_refs_count': len(manual_state.get('pending_refs', [])),
    'pending_refs_head': manual_state.get('pending_refs', [])[:5],
}, sort_dicts=False)

show_state_snapshot(manual_state, stage='STEP1.5B-after-judge', context_limit=10, show_graph=True)

inspect_target = (manual_state.get('targets') or ['일반'])[0]
# display_ref_audit renders the table itself, so no extra display is needed here.
ref_audit_df = display_ref_audit(runtime, manual_state, target=inspect_target, hit_limit=10)

# Diagnostic: how many audit rows ended up without any preview text.
try:
    _empty = int((ref_audit_df['ref_preview_len'] == 0).sum())
    _all = len(ref_audit_df)
    print('ref_preview_empty_ratio:', f'{_empty}/{_all}')
except Exception as _e:
    print('ref preview diag failed:', _e)

# Diagnostic: fetch the first 8 distinct ref keys straight from the DB.
try:
    _keys = list(ref_audit_df['ref_key'].dropna().astype(str).unique())[:8]
except Exception:
    _keys = []
print('db_fetch_check_keys:', _keys)
_ = debug_db_fetch_for_refs(runtime, _keys)


In [None]:
# Recompute the first 8 distinct ref keys and show them next to the raw ref_key column.
_keys = list(ref_audit_df['ref_key'].dropna().astype(str).unique())[:8]
_keys, ref_audit_df['ref_key']

In [None]:
# Re-run the DB fetch check for the current `_keys`; the returned rows are discarded.
_ = debug_db_fetch_for_refs(runtime, _keys)

In [None]:
# STEP 3) Inspect the initial hits of one target + read the full text of a ref
inspect_target = (manual_state.get('targets') or ['일반'])[0]
inspect_target_hits(manual_state.get('target_hits', {}), inspect_target, limit=8)

# Example: replace the key below with a ref_key value from ref_audit_df to read its full text
# show_ref_fulltext(manual_state, '001823:58')


In [None]:
# STEP 4) Detailed view of the expansion decisions (including the LLM reason)
rows = inspect_expand_decisions(
    runtime,
    user_query=manual_state.get('user_query', ''),
    target=inspect_target,
    docs=(manual_state.get('target_hits', {}) or {}).get(inspect_target, []),
    limit=5,
)
# Show the first row's detail, or a placeholder message when there are no rows.
show_expand_detail(rows, idx=0) if rows else {'msg': 'rows 없음'}


In [None]:
# Display the raw `rows` value (assigned from inspect_expand_decisions in the STEP 4 cell).
rows

In [None]:
# STEP 5) Run a single hop
from pprint import pprint

before_ctx = len(manual_state.get('contexts', []))
before_pending = len(manual_state.get('pending_refs', []))
manual_state = node_reference_tracker(manual_state, runtime)
# A bare dict expression in the middle of a cell is evaluated and discarded
# (only a cell's last expression is shown), so print the summary explicitly.
pprint({
    'hop_count': manual_state.get('hop_count', 0),
    'contexts_before': before_ctx,
    'contexts_after': len(manual_state.get('contexts', [])),
    'pending_before': before_pending,
    'pending_after': len(manual_state.get('pending_refs', [])),
    'ref_batch_decision_last': (manual_state.get('ref_batch_decisions', []) or [None])[-1],
}, sort_dicts=False)

show_state_snapshot(manual_state, stage='STEP5-after-hop1', context_limit=12, show_graph=True)


In [None]:
# STEP 6) Run the remaining hops automatically (adjust the hop limit if desired)
from pprint import pprint

# Keep hopping while refs are pending and the hop budget is not used up.
while manual_state.get('pending_refs') and manual_state.get('hop_count', 0) < manual_state.get('max_hops', 3):
    manual_state = node_reference_tracker(manual_state, runtime)
# A bare dict expression in the middle of a cell is evaluated and discarded
# (only a cell's last expression is shown), so print the summary explicitly.
pprint({
    'hop_count': manual_state.get('hop_count', 0),
    'contexts_count': len(manual_state.get('contexts', [])),
    'pending_refs_count': len(manual_state.get('pending_refs', [])),
    'ref_batch_decisions': manual_state.get('ref_batch_decisions', []),
}, sort_dicts=False)

show_state_snapshot(manual_state, stage='STEP6-after-loop', context_limit=14, show_graph=True)


In [None]:
# STEP 7) Appendix lookup + final answer generation
from pprint import pprint

manual_state = node_appendix(manual_state, runtime)
manual_state = node_generate(manual_state, runtime)
# A bare dict expression in the middle of a cell is evaluated and discarded
# (only a cell's last expression is shown), so print the summary explicitly.
pprint({
    'targets': manual_state.get('targets', []),
    'appendix_count': len(manual_state.get('appendix', [])),
    'answer_preview': manual_state.get('answer', '')[:1200],
}, sort_dicts=False)

show_state_snapshot(manual_state, stage='STEP7-final', context_limit=14, show_graph=False)


In [None]:
# Print the value stored under 'answer' in full (the STEP 7 summary only shows a 1200-char preview).
print(manual_state["answer"])

In [None]:
# STEP 8) (Optional) Visualize the final context graph
contexts = manual_state.get('contexts', [])
print('contexts:', len(contexts))
# Skip rendering when there is nothing to draw.
if contexts:
    render_reference_graph(contexts)
