# 05. Qdrant 인덱싱 (chunk별 abbr_map JSON 사용)

`03`에서 생성한 chunk별 abbr_map JSON(`abbr_maps_by_chunk.json`)을 읽어 인덱싱에 반영합니다.

## Qdrant Cloud 설정

클라우드 사용 시 `QDRANT_URL`, `QDRANT_API_KEY`를 설정하세요. 미설정이면 로컬 경로를 사용합니다.

In [None]:
import os

# 예시 (실제 값으로 변경)
# os.environ["QDRANT_URL"] = "https://<cluster>.cloud.qdrant.io:6333"
# os.environ["QDRANT_API_KEY"] = "<api-key>"



In [None]:
import json
import os
import re
import uuid
import hashlib
from dataclasses import asdict, dataclass, field
from pathlib import Path

import requests
from dotenv import load_dotenv

load_dotenv()


def normalize_to_list(v):
    """Coerce a JSON field that may be a dict, a list, or missing into a list.

    A list is returned unchanged (the same object), a single dict is wrapped
    in a one-element list, and every other value (including ``None``) yields
    an empty list.
    """
    if isinstance(v, list):
        return v
    return [v] if isinstance(v, dict) else []


def classify_law_type(name):
    """Classify a law by markers in its name.

    Returns '시행령' (enforcement decree) or '시행규칙' (enforcement rule) when
    the name contains that marker, checked in that order; otherwise '법률'.
    """
    for marker in ('시행령', '시행규칙'):
        if marker in name:
            return marker
    return '법률'


def parse_article_numbers(article: dict, header: str) -> tuple[str, str]:
    """Return ``(article_number, branch_number)`` for one article record.

    The numbers are taken from the '조문번호' / '조문가지번호' fields first. The
    header text (e.g. '제4조의2(...)') is only parsed as a fallback: when the
    main number is missing, or when the branch number is missing and the
    header contains '의'.

    Args:
        article: Article record; missing or falsy fields are treated as ''.
        header: Article header text. ``None`` or '' is tolerated and simply
            disables the header fallback.

    Returns:
        A tuple of stripped strings; either element may be ''.
    """
    # Normalise once so both the substring test and the regex accept None.
    header = header or ''
    main = str(article.get('조문번호', '') or '').strip()
    sub = str(article.get('조문가지번호', '') or '').strip()

    # Structured fields are complete (or the header cannot add a branch number).
    if main and (sub or '의' not in header):
        return main, sub

    m = re.search(r'제\s*(\d+)\s*조(?:\s*의\s*(\d+))?', header)
    if m:
        # Only fill in what the structured fields did not provide.
        main = main or m.group(1) or ''
        sub = sub or m.group(2) or ''

    return main, sub


def make_chunk_key(law_id: str, article_num: str, article_sub: str | int | None = '') -> str:
    lid = str(law_id).strip().zfill(6)
    sub = str(article_sub or '').strip()
    sub_num = sub if sub else '0'
    return f"{lid}:{str(article_num).strip()}:{sub_num}"


@dataclass
class Reference:
    """One citation of a law article, as stored on ``ArticleChunk`` ref lists."""

    # Name of the cited law; the annotation allows None.
    law_name: str | None
    # Article token of the cited article (a string such as '4' or '4의2').
    article: str
    # Paragraph (항) number of the citation, or None when not given.
    paragraph: str | None = None
    # Item (호) number of the citation, or None when not given.
    item: str | None = None
    # Original matched/recorded citation text.
    raw: str = ''


@dataclass
class ArticleChunk:
    """One law article treated as a single indexing unit (chunk)."""

    # Law name and id this article belongs to.
    law_name: str
    law_id: str
    # Category label, e.g. the value returned by classify_law_type or '조례'.
    law_type: str
    # Main article number and optional branch number ('' when there is none).
    article_num: str
    article_sub: str = ''
    article_title: str = ''
    # Full article text used as the document content when indexing.
    content: str = ''
    # Per-paragraph dicts; parse_law_data stores {'num': ..., 'content': ...}.
    paragraphs: list[dict] = field(default_factory=list)
    # Citations of articles in the same law / in other laws.
    internal_refs: list[Reference] = field(default_factory=list)
    external_refs: list[Reference] = field(default_factory=list)
    # Abbreviation map attached by apply_chunk_abbr_map.
    abbreviations: dict[str, str] = field(default_factory=dict)
    effective_date: str = ''
    change_type: str = ''

    def payload(self):
        """Return the chunk as a plain dict (used as document metadata by index_qdrant).

        The article text is exposed under the key 'content_original', and the
        Reference objects are converted to dicts with ``asdict``.
        """
        return {
            'law_name': self.law_name,
            'law_id': self.law_id,
            'law_type': self.law_type,
            'article_num': self.article_num,
            'article_sub': self.article_sub,
            'article_title': self.article_title,
            'content_original': self.content,
            'paragraphs': self.paragraphs,
            'internal_refs': [asdict(r) for r in self.internal_refs],
            'external_refs': [asdict(r) for r in self.external_refs],
            'abbreviations': self.abbreviations,
            'effective_date': self.effective_date,
            'change_type': self.change_type,
        }


def fetch_law_json(law_id):
    """Download one law as JSON from the law.go.kr DRF ``lawService.do`` endpoint.

    The API user id is read from the ``OC`` environment variable.

    Raises:
        ValueError: if ``OC`` is unset or empty.
        requests.HTTPError: if the response status is an error
            (via ``raise_for_status``).
    """
    api_user = os.getenv('OC', '')
    if not api_user:
        raise ValueError('Missing OC in .env')

    response = requests.get(
        'http://www.law.go.kr/DRF/lawService.do',
        params={'OC': api_user, 'target': 'eflaw', 'ID': law_id, 'type': 'JSON'},
        timeout=30,
    )
    response.raise_for_status()
    return response.json()


def parse_law_data(data):
    """Convert one law JSON document into a list of ``ArticleChunk`` objects.

    Only units whose '조문여부' equals '조문' are kept. Each chunk's content is
    the article header followed by its non-empty paragraph texts, joined by
    newlines; every paragraph (even an empty one) is recorded in
    ``paragraphs`` as ``{'num': ..., 'content': ...}``.
    """
    law = data['법령']
    basic_info = law['기본정보']
    law_name = basic_info['법령명_한글']
    law_id = str(basic_info['법령ID']).zfill(6)

    chunks = []
    for unit in normalize_to_list(law['조문'].get('조문단위')):
        if unit.get('조문여부') != '조문':
            continue

        header = str(unit.get('조문내용', '')).strip()
        article_num, article_sub = parse_article_numbers(unit, header)

        paragraphs = [
            {
                'num': str(para.get('항번호', '')).strip(),
                'content': str(para.get('항내용', '')).strip(),
            }
            for para in normalize_to_list(unit.get('항'))
        ]
        # Header first, then paragraph texts; blanks are dropped when joining.
        lines = [header, *(para['content'] for para in paragraphs)]

        chunks.append(ArticleChunk(
            law_name=law_name,
            law_id=law_id,
            law_type=classify_law_type(law_name),
            article_num=article_num,
            article_sub=article_sub,
            article_title=str(unit.get('조문제목', '')).strip(),
            content='\n'.join(line for line in lines if line),
            paragraphs=paragraphs,
            effective_date=str(unit.get('조문시행일자', '')),
            change_type=str(unit.get('조문제개정유형', '')),
        ))
    return chunks


# LEGACY fallback (not used by the current pipeline).
# INTERNAL_PATTERN groups: (1) article token, (2) paragraph number, (3) item number.
# EXTERNAL_PATTERN groups: (1) law name inside 「」, (2) article token,
# (3) paragraph number, (4) item number.
# NOTE(review): both patterns only accept a branch number before '조' ('제4의2조'),
# while parse_article_numbers in this file matches it after '조' ('제4조의2');
# confirm which form is intended before reviving this path.
INTERNAL_PATTERN = re.compile(r'제(\d+(?:의\d+)?)조(?:제(\d+)항)?(?:제(\d+)호)?')
EXTERNAL_PATTERN = re.compile(r'「([^」]+)」\s*제(\d+(?:의\d+)?)조(?:\s*제(\d+)항)?(?:\s*제(\d+)호)?')


def extract_refs(chunks):
    """Append regex-detected citations to each chunk's ref lists (in place).

    Matches of EXTERNAL_PATTERN that name a different law are appended to
    ``external_refs``. All EXTERNAL_PATTERN matches are then removed from the
    text, and every INTERNAL_PATTERN match in the remainder is appended to
    ``internal_refs`` under the chunk's own law name.
    """
    for chunk in chunks:
        text = chunk.content
        own_law = chunk.law_name

        chunk.external_refs.extend(
            Reference(hit.group(1), hit.group(2), hit.group(3), hit.group(4), hit.group(0))
            for hit in EXTERNAL_PATTERN.finditer(text)
            if hit.group(1) != own_law
        )

        # Strip quoted-law citations so they are not counted again as internal.
        remainder = EXTERNAL_PATTERN.sub('', text)
        chunk.internal_refs.extend(
            Reference(own_law, hit.group(1), hit.group(2), hit.group(3), hit.group(0))
            for hit in INTERNAL_PATTERN.finditer(remainder)
        )


def apply_chunk_abbr_map(chunks, chunk_abbr_maps):
    """Set ``abbreviations`` on every chunk from a key -> abbreviation-map dict.

    The ``law:article:sub`` key is tried first, then the older ``law:article``
    key. A missing or non-dict entry results in an empty dict.
    """
    for chunk in chunks:
        found = chunk_abbr_maps.get(
            make_chunk_key(chunk.law_id, chunk.article_num, chunk.article_sub)
        )
        if found is None:
            legacy_key = f"{chunk.law_id}:{chunk.article_num}"
            found = chunk_abbr_maps.get(legacy_key, {})
        chunk.abbreviations = found if isinstance(found, dict) else {}


def index_qdrant(
    chunks,
    collection_name='building_law',
    qdrant_path='./qdrant_data',
    qdrant_url=None,
    qdrant_api_key=None,
    prefer_grpc=False,
    batch_size=32,
    max_retries=6,
    base_sleep=1.0,
    failed_log_path='data/processed/qdrant_failed_chunks.json',
):
    """Embed chunks and upsert them into one Qdrant collection, skipping unchanged ones.

    Each chunk gets a deterministic point id (uuid5 of its chunk key) and a
    SHA-256 content hash. Hashes already stored in the collection are loaded
    first; a chunk is written only if its key is new ('created') or its hash
    differs ('updated'), otherwise it is 'skipped'.

    Args:
        chunks: Iterable of ArticleChunk objects.
        collection_name: Target collection; created (1024-dim, cosine) if absent.
        qdrant_path: Local storage path used when no URL is configured.
        qdrant_url: Cloud endpoint; falls back to the QDRANT_URL env variable.
        qdrant_api_key: API key; falls back to QDRANT_API_KEY. Required with a URL.
        prefer_grpc: Passed through to QdrantClient in URL mode.
        batch_size: Documents per add_documents call; must be >= 1.
        max_retries: Extra attempts after a rate-limit error (per batch / per point).
        base_sleep: Base of the exponential back-off in seconds (capped at 30s).
        failed_log_path: JSON file that receives the chunks that could not be written.

    Returns:
        dict with 'created', 'updated', 'skipped', 'written', 'failed',
        'total' counts and 'failed_log' (path string, '' when nothing failed).
        'created'/'updated' are counted before writing, so they include
        chunks that later end up in 'failed'.

    Raises:
        ValueError: placeholder URL, URL without API key, or batch_size < 1.
        RuntimeError: Qdrant answered 403 when checking the collection.
    """
    import math
    import time

    from langchain_core.documents import Document
    from langchain_naver import ClovaXEmbeddings
    from langchain_qdrant import QdrantVectorStore
    from qdrant_client import QdrantClient
    from qdrant_client.http.exceptions import UnexpectedResponse
    from qdrant_client.http.models import Distance, VectorParams

    if batch_size < 1:
        # Previously 0 raised ZeroDivisionError and negatives silently wrote nothing.
        raise ValueError('batch_size must be >= 1')

    def is_rate_limit_error(err: Exception) -> bool:
        msg = str(err)
        return ('429' in msg) or ('RateLimitError' in msg) or ('rate exceeded' in msg.lower())

    def backoff_seconds(attempt: int) -> float:
        return min(base_sleep * (2 ** attempt), 30.0)

    def compute_content_hash(chunk) -> str:
        raw = f"{chunk.law_id}|{chunk.article_num}|{chunk.article_sub}|{chunk.article_title}|{chunk.content}"
        return hashlib.sha256(raw.encode('utf-8')).hexdigest()

    def save_failed(failed_items):
        path = Path(failed_log_path)
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(json.dumps(failed_items, ensure_ascii=False, indent=2), encoding='utf-8')
        return str(path)

    def load_existing_hashes(qclient, metadata_key):
        """Scroll the collection and map source_key -> stored content_hash."""
        hashes = {}
        next_offset = None
        while True:
            points, next_offset = qclient.scroll(
                collection_name=collection_name,
                offset=next_offset,
                limit=1000,
                with_payload=True,
                with_vectors=False,
            )
            for pt in points:
                payload = pt.payload or {}
                # QdrantVectorStore nests document metadata under its metadata
                # payload key, so look there first; keep the top-level lookup
                # for points written in a flat layout.
                nested = payload.get(metadata_key)
                meta = nested if isinstance(nested, dict) else {}
                sk = meta.get('source_key') or payload.get('source_key')
                if sk:
                    hashes[str(sk)] = meta.get('content_hash') or payload.get('content_hash')
            if next_offset is None:
                break
        return hashes

    url = (qdrant_url or os.getenv('QDRANT_URL') or '').strip()
    api_key = (qdrant_api_key or os.getenv('QDRANT_API_KEY') or '').strip()

    if url:
        if '<' in url or 'your-' in url:
            raise ValueError('QDRANT_URL 예시값이 그대로 들어가 있습니다. 실제 클러스터 URL로 바꾸세요.')
        if not api_key:
            raise ValueError('QDRANT_URL이 설정된 경우 QDRANT_API_KEY도 필요합니다.')
        client = QdrantClient(url=url, api_key=api_key, prefer_grpc=prefer_grpc)
    else:
        client = QdrantClient(path=qdrant_path)

    try:
        try:
            exists = client.collection_exists(collection_name)
        except UnexpectedResponse as e:
            if '403' in str(e):
                raise RuntimeError(
                    'Qdrant 403 Forbidden: API Key 권한 또는 URL이 잘못됐습니다. '
                    'Cloud 콘솔 API Key(Write)와 endpoint(:6333) 확인하세요.'
                ) from e
            raise

        if not exists:
            client.create_collection(
                collection_name=collection_name,
                vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
            )

        embeddings = ClovaXEmbeddings(model='bge-m3')
        store = QdrantVectorStore(client=client, collection_name=collection_name, embedding=embeddings)

        metadata_key = getattr(store, 'metadata_payload_key', 'metadata')
        existing_hash_by_source = load_existing_hashes(client, metadata_key)

        candidates = []
        for c in chunks:
            source_key = make_chunk_key(c.law_id, c.article_num, c.article_sub)
            # Deterministic id: re-indexing the same chunk overwrites its point.
            point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, source_key))
            content_hash = compute_content_hash(c)

            payload = c.payload()
            payload['content_hash'] = content_hash
            payload['source_key'] = source_key
            doc = Document(page_content=c.content, metadata=payload)
            candidates.append((source_key, point_id, content_hash, doc))

        total = len(candidates)
        if total == 0:
            return {'created': 0, 'updated': 0, 'skipped': 0, 'written': 0, 'failed': 0, 'total': 0, 'failed_log': ''}

        created = 0
        updated = 0
        skipped = 0
        failed_items = []
        to_upsert = []

        for source_key, point_id, h, doc in candidates:
            old_hash = existing_hash_by_source.get(source_key)
            if old_hash is None:
                created += 1
                to_upsert.append((source_key, point_id, h, doc))
            elif old_hash != h:
                updated += 1
                to_upsert.append((source_key, point_id, h, doc))
            else:
                skipped += 1

        total_upsert = len(to_upsert)
        if total_upsert == 0:
            print(f'no changes: skipped={skipped}, total={total}')
            return {
                'created': created,
                'updated': updated,
                'skipped': skipped,
                'written': 0,
                'failed': 0,
                'total': total,
                'failed_log': '',
            }

        num_batches = math.ceil(total_upsert / batch_size)
        written = 0

        def upsert_one(source_key, point_id, doc):
            """Write a single document; returns (ok, error_message)."""
            for attempt in range(max_retries + 1):
                try:
                    store.add_documents(documents=[doc], ids=[point_id])
                    return True, ''
                except Exception as err:
                    if is_rate_limit_error(err) and attempt < max_retries:
                        time.sleep(backoff_seconds(attempt))
                        continue
                    return False, str(err)
            # Only reachable when max_retries < 0 (no attempt was made).
            return False, 'no attempt made (max_retries < 0)'

        for bi in range(num_batches):
            start = bi * batch_size
            stop = min((bi + 1) * batch_size, total_upsert)
            batch = to_upsert[start:stop]

            d_batch = [x[3] for x in batch]
            i_batch = [x[1] for x in batch]

            batch_done = False
            for attempt in range(max_retries + 1):
                try:
                    store.add_documents(documents=d_batch, ids=i_batch)
                    written += len(d_batch)
                    print(f"batch {bi+1}/{num_batches} ok: +{len(d_batch)} (written={written})")
                    # Small pause between batches to stay under the rate limit.
                    time.sleep(0.25)
                    batch_done = True
                    break
                except Exception as err:
                    if is_rate_limit_error(err) and attempt < max_retries:
                        wait = backoff_seconds(attempt)
                        print(f"rate limited on batch {bi+1}, retry {attempt+1}/{max_retries}, sleep={wait:.1f}s")
                        time.sleep(wait)
                        continue
                    print(f"batch {bi+1} failed, fallback to per-point retry: {err}")
                    break

            if batch_done:
                continue

            # Batch failed: retry point by point so one bad chunk does not
            # block the rest of the batch.
            for source_key, point_id, _h, doc in batch:
                ok, msg = upsert_one(source_key, point_id, doc)
                if ok:
                    written += 1
                else:
                    failed_items.append({
                        'source_key': source_key,
                        'point_id': point_id,
                        'error': msg[:500],
                    })

        failed_log = ''
        if failed_items:
            failed_log = save_failed(failed_items)
            print(f"failed chunks: {len(failed_items)} -> {failed_log}")

        return {
            'created': created,
            'updated': updated,
            'skipped': skipped,
            'written': written,
            'failed': len(failed_items),
            'total': total,
            'failed_log': failed_log,
        }
    finally:
        # Release the connection (and, in local mode, the storage lock) so a
        # later call in the same kernel can open the same path again.
        client.close()



def sanitize_filename_component(name: str) -> str:
    """Return ``name`` as a string with every space replaced by an underscore."""
    text = str(name)
    return '_'.join(text.split(' '))


def _normalize_ref_list(refs):
    """Convert a list of reference dicts into ``Reference`` objects.

    Non-dict entries are ignored; ``None`` is treated as an empty list.
    'paragraph' and 'item' become ``None`` when missing or '', otherwise a
    stripped string.
    """
    def optional_text(value):
        # None and '' both mean "not specified".
        return None if value in [None, ''] else str(value).strip()

    return [
        Reference(
            law_name=entry.get('law_name'),
            article=str(entry.get('article', '')).strip(),
            paragraph=optional_text(entry.get('paragraph')),
            item=optional_text(entry.get('item')),
            raw=str(entry.get('raw', '')).strip(),
        )
        for entry in (refs or [])
        if isinstance(entry, dict)
    ]


def apply_ref_map_to_chunks(chunks, ref_map_by_law):
    """Overwrite each chunk's ref lists from a per-law reference map.

    ``ref_map_by_law`` maps a zero-padded law id to ``{chunk_key: row}``,
    where ``row`` holds 'internal_refs' / 'external_refs' lists. Chunks with
    no matching dict row are left untouched.

    Returns:
        Number of chunks whose refs were replaced.
    """
    applied = 0
    for chunk in chunks:
        padded_id = str(chunk.law_id).strip().zfill(6)
        per_law = ref_map_by_law.get(padded_id, {})
        if isinstance(per_law, dict):
            entry = per_law.get(
                make_chunk_key(chunk.law_id, chunk.article_num, chunk.article_sub)
            )
            if entry is None:
                # Backward compatibility with the old two-part key format.
                entry = per_law.get(f"{chunk.law_id}:{chunk.article_num}")

            if isinstance(entry, dict):
                chunk.internal_refs = _normalize_ref_list(entry.get('internal_refs', []))
                chunk.external_refs = _normalize_ref_list(entry.get('external_refs', []))
                applied += 1

    return applied




def _split_article_token(token: str) -> tuple[str, str]:
    t = str(token or '').strip()
    if not t:
        return '', ''
    m = re.fullmatch(r'(\d+)(?:의(\d+))?', t)
    if m:
        return m.group(1) or '', m.group(2) or ''
    return t, ''


def _normalize_chunk_ref_map_keys(raw_map: dict, target_law_id: str) -> dict:
    """Re-key a raw chunk ref map onto ``make_chunk_key(target_law_id, ...)`` keys.

    Accepted key shapes are ``law:article:sub``, ``law:article`` and a bare
    ``article`` (where the article part may be '4' or '4의2'). The law part of
    the original key is ignored in favour of ``target_law_id``. Entries whose
    value is not a dict, or whose article number ends up empty, are dropped.
    """
    if not isinstance(raw_map, dict):
        return {}

    normalized = {}
    for raw_key, entry in raw_map.items():
        if not isinstance(entry, dict):
            continue

        pieces = str(raw_key).split(':')
        if len(pieces) >= 3:
            # law:article:sub
            num, sub = str(pieces[1]).strip(), str(pieces[2]).strip()
        else:
            # law:article or article only; the last piece is the article token.
            num, sub = _split_article_token(pieces[-1])

        if num:
            normalized[make_chunk_key(target_law_id, num, sub)] = entry

    return normalized


def load_golden_ref_map_by_law(base_dir='data/processed/chunks_golden/chunk_ref_map'):
    """Load every ``*.json`` golden ref map in a directory, keyed by law id.

    ``base_dir`` is tried as given and then under 'notebooks/research_mvp';
    the first existing directory wins. The law id comes from the file name
    (stem, zero-padded to six characters). Files that fail to load are
    reported with a warning and skipped.

    Returns:
        ``{law_id: normalized_ref_map}``; ``{}`` when no directory exists.
    """
    candidates = [
        Path(base_dir),
        Path('notebooks/research_mvp') / base_dir,
    ]
    base = None
    for candidate in candidates:
        if candidate.exists():
            base = candidate
            break
    if base is None:
        print('[warn] golden ref map dir not found:', candidates)
        return {}

    ref_maps = {}
    for json_path in sorted(base.glob('*.json')):
        # The file name is the source of truth for the law id: 001823 / 002118.
        law_id = json_path.stem.strip().zfill(6)
        try:
            raw = json.loads(json_path.read_text(encoding='utf-8'))
            ref_maps[law_id] = _normalize_chunk_ref_map_keys(raw, law_id)
        except Exception as e:
            print('[warn] failed to load golden ref map:', json_path, e)
    return ref_maps




def load_ordin_chunks_from_export(chunks_ordin_dir: str | None = None):
    candidates = [
        Path(chunks_ordin_dir) if chunks_ordin_dir else None,
        Path('data/processed/chunks_ordin'),
        Path('data/processed/chunks_ordin'),
        Path('notebooks/research_mvp/data/processed/chunks_ordin'),
    ]
    candidates = [p for p in candidates if p is not None]

    src = next((p for p in candidates if p.exists()), None)
    if src is None:
        return [], None

    all_file = src / 'all_ordin_chunks.json'
    if not all_file.exists():
        return [], src

    rows = json.loads(all_file.read_text(encoding='utf-8'))
    out = []
    for r in rows:
        try:
            out.append(ArticleChunk(
                law_name=str(r.get('law_name', '')).strip(),
                law_id=str(r.get('law_id', '')).strip(),
                law_type='조례',
                article_num=str(r.get('article_num', '')).strip(),
                article_sub=str(r.get('article_sub', '') or '').strip(),
                article_title=str(r.get('article_title', '')).strip(),
                content=str(r.get('content', '')).strip(),
                paragraphs=r.get('paragraphs', []) if isinstance(r.get('paragraphs', []), list) else [],
                effective_date=str(r.get('effective_date', '')).strip(),
                change_type=str(r.get('change_type', '')).strip(),
            ))
        except Exception:
            continue

    return out, src


In [None]:
# STEP 0: pre-flight check of the Qdrant connection
import os
import requests

# Same environment variables that index_qdrant falls back to.
url = (os.getenv('QDRANT_URL') or '').strip()
api_key = (os.getenv('QDRANT_API_KEY') or '').strip()

if not url:
    # No URL configured: nothing to probe.
    print('Mode: LOCAL (QDRANT_URL 미설정)')
else:
    print('Mode: CLOUD')
    print('API key set =', bool(api_key))

    headers = {'api-key': api_key} if api_key else {}
    try:
        # Listing collections is used as a cheap connectivity/auth probe.
        r = requests.get(f"{url}/collections", headers=headers, timeout=15)
        print('GET /collections status =', r.status_code)
        if r.status_code == 403:
            print('-> 403: API key 권한 또는 URL이 잘못됐습니다.')
            print('   1) Cloud 콘솔 API Key(Write 권한) 확인')
            print('   2) endpoint가 API endpoint인지 확인 (:6333 포함 권장)')
        elif r.status_code >= 400:
            print('-> error body:', r.text[:300])
        else:
            data = r.json()
            # Show at most the first 10 collection names.
            print('collections:', [c.get('name') for c in data.get('result', {}).get('collections', [])][:10])
    except Exception as e:
        # Diagnostic cell: report the failure instead of raising.
        print('request failed:', e)


In [None]:
# STEP 1: collect/load raw data (statutes + ordinances)
all_chunks = []

# 1) Statutes (Building Act / Enforcement Decree): raw JSON
raw_dir_candidates = [
    Path('data/processed/raw'),
    Path('notebooks/research_mvp/data/processed/raw'),
]
raw_dir = next((p for p in raw_dir_candidates if p.exists()), None)

if raw_dir is not None:
    # Prefer local raw JSON files when a raw directory exists.
    files = sorted(raw_dir.glob('*.json'))
    print('raw source: local', raw_dir)
    for fp in files:
        data = json.loads(fp.read_text(encoding='utf-8'))
        all_chunks.extend(parse_law_data(data))
else:
    # Otherwise download the two laws through fetch_law_json.
    print('raw source: law api')
    for law_id in ('1823', '2118'):
        data = fetch_law_json(law_id)
        all_chunks.extend(parse_law_data(data))

# 2) Local ordinances: parsed export
ordin_chunks, ordin_src = load_ordin_chunks_from_export()
if ordin_src is not None:
    print('ordin source:', ordin_src)
print('ordin chunks loaded:', len(ordin_chunks))
all_chunks.extend(ordin_chunks)

# Summary of what was loaded.
print('loaded chunks total:', len(all_chunks))
print('law ids sample:', sorted({str(c.law_id) for c in all_chunks})[:10])
print('law_type counts:')
from collections import Counter
print(dict(Counter([str(c.law_type) for c in all_chunks])))


In [None]:
# Show the loaded chunks as the cell output.
all_chunks

In [None]:
# STEP 2: load the per-chunk abbr_map JSON produced by notebook 03
# Candidate paths are checked in order; the first existing file is used.
abbr_candidates = [
    Path('data/processed/abbr_maps_by_chunk.json'),
    Path('notebooks/research_mvp/data/processed/abbr_maps_by_chunk.json'),
    Path('data/processed/abbr_maps_by_chunk.agent.json'),
    Path('notebooks/research_mvp/data/processed/abbr_maps_by_chunk.agent.json'),
]
abbr_json_path = next((p for p in abbr_candidates if p.exists()), None)

if abbr_json_path is None:
    # Stop here: the later steps need chunk_abbr_maps.
    raise FileNotFoundError('먼저 03_refs_and_abbr.ipynb를 실행해서 chunk별 abbr_map JSON을 생성하세요.')

chunk_abbr_maps = json.loads(abbr_json_path.read_text(encoding='utf-8'))
print('abbr source:', abbr_json_path)
print('chunk abbr count:', len(chunk_abbr_maps))


In [None]:
# STEP 3: apply the golden ref map, then the per-chunk abbreviation maps
# Regex-based ref extraction is not used here (only the golden ref map is trusted).
ref_map_by_law = load_golden_ref_map_by_law('data/processed/chunks_golden/chunk_ref_map')
applied = apply_ref_map_to_chunks(all_chunks, ref_map_by_law)
print('golden refs applied chunks:', applied)

# Per-law coverage counters plus the chunk keys that have no golden entry.
cov = {}
unmatched_by_law = {}
for c in all_chunks:
    lid = str(c.law_id).strip().zfill(6)
    k = make_chunk_key(c.law_id, c.article_num, c.article_sub)

    cov.setdefault(lid, {'total': 0, 'with_ref': 0, 'without_ref': 0, 'mapped_key': 0})
    cov[lid]['total'] += 1

    if c.internal_refs or c.external_refs:
        cov[lid]['with_ref'] += 1
    else:
        cov[lid]['without_ref'] += 1

    law_map = ref_map_by_law.get(lid, {})
    if isinstance(law_map, dict) and k in law_map:
        cov[lid]['mapped_key'] += 1
    else:
        unmatched_by_law.setdefault(lid, []).append(k)

print('coverage by law:', cov)
for lid, keys in unmatched_by_law.items():
    print(f'unmatched sample [{lid}] ({len(keys)}):', keys[:10])

apply_chunk_abbr_map(all_chunks, chunk_abbr_maps)

# Spot check: prefer an Enforcement Decree chunk, but fall back to the first
# chunk so this cell does not raise StopIteration when that law is not loaded.
sample = next((c for c in all_chunks if c.law_name == '건축법 시행령'), None)
if sample is None and all_chunks:
    sample = all_chunks[0]

if sample is None:
    print('no chunks loaded; skipping sample preview')
else:
    print('sample law:', sample.law_name)
    print('chunk key:', make_chunk_key(sample.law_id, sample.article_num, sample.article_sub))
    print('article num/sub:', sample.article_num, sample.article_sub or '0')
    print('refs internal/external:', len(sample.internal_refs), len(sample.external_refs))
    print('abbr size:', len(sample.abbreviations))
    print('content head:', sample.content[:180])


In [None]:
# STEP 3-1: chunk 품질 점검 (abbr / refs)
from collections import Counter
import re

def chunk_key(c):
    """Return the ``make_chunk_key`` key for a chunk object."""
    law_id, number, branch = c.law_id, c.article_num, c.article_sub
    return make_chunk_key(law_id, number, branch)

def abbr_size(c):
    """Count abbreviation entries on a chunk.

    If ``abbreviations`` holds a nested dict under '약어', that inner dict is
    counted instead; a non-dict ``abbreviations`` counts as 0.
    """
    mapping = c.abbreviations
    if not isinstance(mapping, dict):
        return 0
    nested = mapping.get('약어')
    return len(nested) if isinstance(nested, dict) else len(mapping)

# Overall counts: chunks with a non-empty abbreviation map / ref lists.
n_total = len(all_chunks)
n_abbr_nonempty = sum(1 for c in all_chunks if abbr_size(c) > 0)
n_internal = sum(1 for c in all_chunks if len(c.internal_refs) > 0)
n_external = sum(1 for c in all_chunks if len(c.external_refs) > 0)

print('total chunks:', n_total)
print('abbr non-empty:', n_abbr_nonempty)
print('internal refs non-empty:', n_internal)
print('external refs non-empty:', n_external)

# Heuristic: the text contains '이하' but no abbreviation was mapped.
suspect_abbr_miss = [
    c for c in all_chunks
    if ('이하' in c.content and abbr_size(c) == 0)
]
print('suspect abbr missing:', len(suspect_abbr_miss))

# Heuristic: the text contains an article citation but both ref lists are empty.
article_ref_pat = re.compile(r'제\d+(?:의\d+)?조')
suspect_ref_miss = [
    c for c in all_chunks
    if article_ref_pat.search(c.content) and len(c.internal_refs) == 0 and len(c.external_refs) == 0
]
print('suspect ref missing:', len(suspect_ref_miss))

cnt_by_law = Counter(c.law_name for c in all_chunks)
print('chunks by law:', dict(cnt_by_law))


In [None]:
# STEP 3-2: 의심 청크 샘플 확인
def preview_chunk(c):
    """Print a short summary of one chunk: key, law/article, abbr and ref counts, text head."""
    rows = [
        ('---',),
        ('key:', chunk_key(c)),
        ('law:', c.law_name, '| article:', c.article_num, c.article_sub or '0', c.article_title),
        ('abbr size:', abbr_size(c)),
        ('internal/external:', len(c.internal_refs), len(c.external_refs)),
        # First 260 characters, with newlines flattened for one-line display.
        ('head:', (c.content or '')[:260].replace('\n', ' / ')),
    ]
    for row in rows:
        print(*row)

# Preview up to five chunks from each suspect list built in STEP 3-1.
print('[suspect_abbr_miss sample]')
for c in suspect_abbr_miss[:5]:
    preview_chunk(c)

print('[suspect_ref_miss sample]')
for c in suspect_ref_miss[:5]:
    preview_chunk(c)


In [None]:
# STEP 3-3: detailed inspection of a single chunk (set inspect_key to the key you want)
inspect_key = chunk_key(all_chunks[4])  # e.g. '001823:4:2'
# First chunk whose key matches, or None.
target = next((c for c in all_chunks if chunk_key(c) == inspect_key), None)

if not target:
    print('not found:', inspect_key)
else:
    print('key:', inspect_key)
    print('law/article:', target.law_name, target.article_num, target.article_sub or '0', target.article_title)
    print('abbr:', target.abbreviations)
    print('internal_refs:', [asdict(r) for r in target.internal_refs])
    print('external_refs:', [asdict(r) for r in target.external_refs])
    # Only the first 1200 characters of the content are shown.
    print('content\n', target.content[:1200])


In [None]:
# STEP 3-4: save the chunks enriched with refs/abbr
from dataclasses import asdict

# Output directory for the enriched chunk files (created if missing).
out_dir = Path('data/processed/chunks_enriched')
out_dir.mkdir(parents=True, exist_ok=True)

def chunk_to_enriched_dict(c):
    """Serialize a chunk to a JSON-ready dict that includes its chunk key, abbreviations and refs."""
    row = {'chunk_key': make_chunk_key(c.law_id, c.article_num, c.article_sub)}

    # Plain attributes are copied as-is, in the order they appear in the output.
    for attr in (
        'law_name',
        'law_id',
        'law_type',
        'article_num',
        'article_sub',
        'article_title',
        'content',
        'abbreviations',
    ):
        row[attr] = getattr(c, attr)

    # Reference objects are converted to dicts.
    row['internal_refs'] = [asdict(ref) for ref in c.internal_refs]
    row['external_refs'] = [asdict(ref) for ref in c.external_refs]
    row['effective_date'] = c.effective_date
    row['change_type'] = c.change_type
    return row

enriched = [chunk_to_enriched_dict(c) for c in all_chunks]

# 1) One file with every enriched chunk.
all_path = out_dir / 'all_chunks_enriched.json'
all_path.write_text(json.dumps(enriched, ensure_ascii=False, indent=2), encoding='utf-8')

# 2) One file per law, grouped by law_id.
by_law = {}
for row in enriched:
    by_law.setdefault(row['law_id'], []).append(row)

for law_id, rows in by_law.items():
    # The law name of the first row is used for the file name.
    law_name = rows[0]['law_name']
    p_law = out_dir / f"{law_id}_{sanitize_filename_component(law_name)}_chunks_enriched.json"
    p_law.write_text(json.dumps(rows, ensure_ascii=False, indent=2), encoding='utf-8')

# 3) chunk_key -> refs map.
ref_map = {
    row['chunk_key']: {
        'internal_refs': row['internal_refs'],
        'external_refs': row['external_refs'],
    }
    for row in enriched
}
ref_path = out_dir / 'chunk_ref_map.json'
ref_path.write_text(json.dumps(ref_map, ensure_ascii=False, indent=2), encoding='utf-8')

print('saved all:', all_path)
print('saved ref map:', ref_path)
print('law files:', len(by_law))

# Cell output: the first row that has any refs, else the first row (or {} when empty).
next((r for r in enriched if r['internal_refs'] or r['external_refs']), enriched[0] if enriched else {})


In [None]:
# Show the last 20 chunks as the cell output.
all_chunks[-20:]

In [None]:
# Total number of loaded chunks.
len(all_chunks)

In [None]:
# STEP 4: index into a single Qdrant collection (Cloud/Local auto-select + rate-limit handling + hash-incremental)
# NOTE(review): only all_chunks[476:] is indexed here, presumably a manual resume
# point from an earlier partial run; confirm before running against a fresh collection.
stats = index_qdrant(
    all_chunks[476:],
    batch_size=8,
    max_retries=3,
    base_sleep=20.0,
)
print('index stats:', stats)
if stats.get('failed'):
    print('failed log file:', stats.get('failed_log'))


## Rate Limit 대응 팁

- 429가 계속 나면 `batch_size`를 더 낮추세요(예: 24 -> 12 -> 8).
- `base_sleep`를 1.0 -> 2.0으로 올리면 재시도 간격이 늘어납니다.
- 재실행 시 이미 저장된 ID는 덮어쓰기(upsert) 동작입니다.

## Hash Incremental Upsert

- `content_hash`가 같으면 임베딩/업로드를 건너뜁니다(`skipped`).
- 신규는 `created`, 내용 변경은 `updated`로 집계됩니다.

In [None]:
# STEP 5: retry only the chunks that failed
# Reads the failure log at index_qdrant's default failed_log_path.
failed_path = Path('data/processed/qdrant_failed_chunks.json')
if not failed_path.exists():
    print('no failed log found')
else:
    failed_items = json.loads(failed_path.read_text(encoding='utf-8'))
    failed_keys = {x['source_key'] for x in failed_items}
    # Select the in-memory chunks whose key appears in the failure log.
    retry_chunks = [c for c in all_chunks if make_chunk_key(c.law_id, c.article_num, c.article_sub) in failed_keys]
    print('retry chunk count:', len(retry_chunks))
    retry_stats = index_qdrant(
        retry_chunks,
        batch_size=8,
        max_retries=5,
        base_sleep=2.0,
    )
    print('retry stats:', retry_stats)
