# RAG ETL Pipeline –¥–ª—è Yandex Handbook

**–ê—Ä—Ö–∏—Ç–µ–∫—Ç—É—Ä–∞:**
- **Child chunks** (450 —Ç–æ–∫–µ–Ω–æ–≤) ‚Üí Qdrant –¥–ª—è —Ç–æ—á–Ω–æ–≥–æ –ø–æ–∏—Å–∫–∞ (HYBRID: dense + sparse)
- **Parent chunks** (1500 —Ç–æ–∫–µ–Ω–æ–≤) ‚Üí Redis –¥–ª—è –∫–æ–Ω—Ç–µ–∫—Å—Ç–∞
- **ParentDocumentRetriever** —Å–≤—è–∑—ã–≤–∞–µ—Ç chunks

**–£—Å—Ç–∞–Ω–æ–≤–∫–∞ –ø–∞–∫–µ—Ç–æ–≤:**
```bash
uv add langchain-core langchain-community langchain-openai langchain-text-splitters langchain-qdrant
uv add qdrant-client python-dotenv tiktoken tqdm
```

## 1. –ü–µ—Ä–µ–º–µ–Ω–Ω—ã–µ –æ–∫—Ä—É–∂–µ–Ω–∏—è

In [1]:
from dotenv import load_dotenv
import os

load_dotenv()

# OpenAI –Ω–∞—Å—Ç—Ä–æ–π–∫–∏
openai_api_key = os.getenv('OPENAI_API_KEY')
openai_api_base = os.getenv('OPENAI_API_BASE')
embedding_model_name = os.getenv('OPENAI_EVBEDDING_MODEL_NAME', 'text-embedding-3-large')

# Qdrant –Ω–∞—Å—Ç—Ä–æ–π–∫–∏
qdrant_url = os.getenv('QDRANT_URL', 'http://localhost:6333')

# Redis –Ω–∞—Å—Ç—Ä–æ–π–∫–∏
redis_url = os.getenv('REDIS_URL', 'redis://localhost:6379')

print(" –ü–µ—Ä–µ–º–µ–Ω–Ω—ã–µ –æ–∫—Ä—É–∂–µ–Ω–∏—è –∑–∞–≥—Ä—É–∂–µ–Ω—ã")
print(f"   ‚Ä¢ OpenAI API: {'‚úì' if openai_api_key else '‚úó'}")
print(f"   ‚Ä¢ Embedding Model: {embedding_model_name}")
print(f"   ‚Ä¢ Qdrant URL: {qdrant_url}")
print(f"   ‚Ä¢ Redis URL: {redis_url}")

 –ü–µ—Ä–µ–º–µ–Ω–Ω—ã–µ –æ–∫—Ä—É–∂–µ–Ω–∏—è –∑–∞–≥—Ä—É–∂–µ–Ω—ã
   ‚Ä¢ OpenAI API: ‚úì
   ‚Ä¢ Embedding Model: text-embedding-3-large
   ‚Ä¢ Qdrant URL: http://localhost:6333
   ‚Ä¢ Redis URL: redis://localhost:6379


## 2. –ò–º–ø–æ—Ä—Ç—ã

In [2]:
import tiktoken
import re
import hashlib
import json
from typing import Dict, Any
from pathlib import Path
from uuid import uuid4

# LangChain
from langchain_core.documents import Document
from langchain_core.load import dumpd, loads
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore, RetrievalMode
from langchain_qdrant.fastembed_sparse import FastEmbedSparse
from langchain_community.storage import RedisStore

# Qdrant
from qdrant_client import QdrantClient, models
from qdrant_client.http.models import Distance, VectorParams, SparseVectorParams, SparseIndexParams

# Progress bar
from tqdm.auto import tqdm

print(" –ò–º–ø–æ—Ä—Ç—ã –∑–∞–≥—Ä—É–∂–µ–Ω—ã")

  from .autonotebook import tqdm as notebook_tqdm


 –ò–º–ø–æ—Ä—Ç—ã –∑–∞–≥—Ä—É–∂–µ–Ω—ã


## 3. –§—É–Ω–∫—Ü–∏–∏ –æ—á–∏—Å—Ç–∫–∏ —Ç–µ–∫—Å—Ç–∞

In [3]:
def clean_document_text(text: str) -> str:
    """–û—á–∏—Å—Ç–∫–∞ Markdown-—Ç–µ–∫—Å—Ç–∞ –æ—Ç —Å—Å—ã–ª–æ–∫, —Ñ–∞–π–ª–æ–≤, —Ñ–æ—Ä–º–∞—Ç–∏—Ä–æ–≤–∞–Ω–∏—è"""
    if not text:
        return ""
    
    # –£–¥–∞–ª–µ–Ω–∏–µ —Å—Å—ã–ª–æ–∫ [—Ç–µ–∫—Å—Ç](url) ‚Üí —Ç–µ–∫—Å—Ç
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
    
    # –£–¥–∞–ª–µ–Ω–∏–µ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏–π ![alt](src)
    text = re.sub(r'!\[[^\]]*\]\([^)]+\)', '', text)
    
    # –£–¥–∞–ª–µ–Ω–∏–µ URL
    text = re.sub(r'https?://\S+', '', text)
    
    # –£–¥–∞–ª–µ–Ω–∏–µ —Ñ–æ—Ä–º–∞—Ç–∏—Ä–æ–≤–∞–Ω–∏—è **—Ç–µ–∫—Å—Ç** ‚Üí —Ç–µ–∫—Å—Ç
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    text = re.sub(r'\*(.*?)\*', r'\1', text)
    
    # –£–¥–∞–ª–µ–Ω–∏–µ HTML —Ç–µ–≥–æ–≤
    text = re.sub(r'<[^>]+>', '', text)
    
    # –£–¥–∞–ª–µ–Ω–∏–µ –ª–∏—à–Ω–∏—Ö –ø—Ä–æ–±–µ–ª–æ–≤
    text = re.sub(r'\s{2,}', ' ', text)
    text = text.strip()
    
    # Lowercase –¥–ª—è –ª—É—á—à–µ–≥–æ –ø–æ–∏—Å–∫–∞
    return text.lower()


## 4. –§—É–Ω–∫—Ü–∏–∏ –¥–ª—è –º–µ—Ç–∞–¥–∞–Ω–Ω—ã—Ö

In [4]:
def stable_parent_id(meta: Dict[str, Any], text: str) -> str:
    """–î–µ—Ç–µ—Ä–º–∏–Ω–∏—Ä–æ–≤–∞–Ω–Ω—ã–π parent_id"""
    base = (
        str(meta.get("source", "")) + "|" +
        str(meta.get("H1", "")) + "|" + 
        str(meta.get("H2", "")) + "|" + 
        str(meta.get("H3", "")) + "|" +
        (text[:128])
    ).encode("utf-8")
    return hashlib.md5(base).hexdigest()

## 5. –ó–∞–≥—Ä—É–∑–∫–∞ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤ (–ë–ï–ó –æ—á–∏—Å—Ç–∫–∏)

In [5]:
# –ù–∞—Å—Ç—Ä–æ–π–∫–∞ —Ç–æ–∫–µ–Ω–∏–∑–∞—Ç–æ—Ä–∞
encoder = tiktoken.get_encoding("cl100k_base")
tiktoken_len = lambda text: len(encoder.encode(text))

# –ü—É—Ç—å –∫ –¥–∞–Ω–Ω—ã–º
DATA_PATH = "/home/llm-dev/project/lifelong_learning_assistant/data/yandex-handbook-downloaded"

# –ó–∞–≥—Ä—É–∂–∞–µ–º –¥–æ–∫—É–º–µ–Ω—Ç—ã –ë–ï–ó –æ—á–∏—Å—Ç–∫–∏ (–Ω—É–∂–Ω—ã –∑–∞–≥–æ–ª–æ–≤–∫–∏ ##, ###)
print(f" –ó–∞–≥—Ä—É–∂–∞–µ–º –¥–æ–∫—É–º–µ–Ω—Ç—ã –∏–∑: {DATA_PATH}")
md_files = list(Path(DATA_PATH).glob("*.md"))
print(f" –ù–∞–π–¥–µ–Ω–æ {len(md_files)} .md —Ñ–∞–π–ª–æ–≤")

documents = []
for file_path in md_files:
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    
    doc = Document(
        page_content=content,
        metadata={
            'source': str(file_path),
            'filename': file_path.name,
            'total_tokens': tiktoken_len(content)
        }
    )
    documents.append(doc)

print(f"–ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(documents)} –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤")

# –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞
if documents:
    token_counts = [doc.metadata['total_tokens'] for doc in documents]
    print(f"\n –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞:")
    print(f"   ‚Ä¢ –°—Ä–µ–¥–Ω–∏–π —Ä–∞–∑–º–µ—Ä: {sum(token_counts)/len(token_counts):.0f} —Ç–æ–∫–µ–Ω–æ–≤")
    print(f"   ‚Ä¢ –ú–∏–Ω–∏–º—É–º: {min(token_counts)} —Ç–æ–∫–µ–Ω–æ–≤")
    print(f"   ‚Ä¢ –ú–∞–∫—Å–∏–º—É–º: {max(token_counts)} —Ç–æ–∫–µ–Ω–æ–≤")

 –ó–∞–≥—Ä—É–∂–∞–µ–º –¥–æ–∫—É–º–µ–Ω—Ç—ã –∏–∑: /home/llm-dev/project/lifelong_learning_assistant/data/yandex-handbook-downloaded
 –ù–∞–π–¥–µ–Ω–æ 68 .md —Ñ–∞–π–ª–æ–≤
–ó–∞–≥—Ä—É–∂–µ–Ω–æ 68 –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤

 –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞:
   ‚Ä¢ –°—Ä–µ–¥–Ω–∏–π —Ä–∞–∑–º–µ—Ä: 9532 —Ç–æ–∫–µ–Ω–æ–≤
   ‚Ä¢ –ú–∏–Ω–∏–º—É–º: 873 —Ç–æ–∫–µ–Ω–æ–≤
   ‚Ä¢ –ú–∞–∫—Å–∏–º—É–º: 27611 —Ç–æ–∫–µ–Ω–æ–≤


## 6. –ó–∞–≥—Ä—É–∑–∫–∞ index.json –¥–ª—è –Ω–∞–∑–≤–∞–Ω–∏–π –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤

In [6]:
# –ó–∞–≥—Ä—É–∂–∞–µ–º index.json
INDEX_PATH = "/home/llm-dev/project/lifelong_learning_assistant/data/yandex-handbook-downloaded/index.json"

with open(INDEX_PATH, 'r', encoding='utf-8') as f:
    index_data = json.load(f)

# –°–æ–∑–¥–∞–µ–º –º–∞–ø–ø–∏–Ω–≥ filename -> title
filename_to_title = {}
for doc_id, doc_info in index_data.items():
    filename = doc_id + '.md'
    title = doc_info.get('title', '')
    filename_to_title[filename] = title

print(f" –ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(filename_to_title)} –Ω–∞–∑–≤–∞–Ω–∏–π –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤")
print(f"\n –ü—Ä–∏–º–µ—Ä—ã:")
for i, (fn, title) in enumerate(list(filename_to_title.items())[:3]):
    print(f"   ‚Ä¢ {fn} ‚Üí {title}")

 –ó–∞–≥—Ä—É–∂–µ–Ω–æ 68 –Ω–∞–∑–≤–∞–Ω–∏–π –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤

 –ü—Ä–∏–º–µ—Ä—ã:
   ‚Ä¢ about.md ‚Üí –ù–∞—á–∞—Ç—å —É—á–∏—Ç—å—Å—è
   ‚Ä¢ pervie-shagi.md ‚Üí –ü–µ—Ä–≤—ã–µ —à–∞–≥–∏
   ‚Ä¢ mashinnoye-obucheniye.md ‚Üí –ú–∞—à–∏–Ω–Ω–æ–µ –æ–±—É—á–µ–Ω–∏–µ


## 7. –ù–∞—Å—Ç—Ä–æ–π–∫–∞ —Å–ø–ª–∏—Ç—Ç–µ—Ä–æ–≤


In [7]:
# Markdown Header Splitter –¥–ª—è parent chunks (–∏—Å–ø–æ–ª—å–∑—É–µ–º H1, H2, H3 –∫–∞–∫ –≤ clean_wiki_docs)
headers_to_split_on = [
    ("#", "H1"),
    ("##", "H2"),
    ("###", "H3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False
)

# Recursive splitter –¥–ª—è –±–æ–ª—å—à–∏—Ö parent chunks
parent_recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=300,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""],
    keep_separator=True
)

# Child splitter
child_splitter = RecursiveCharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=70,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""],
    keep_separator=True
)


## 8. –°–æ–∑–¥–∞–Ω–∏–µ parent chunks

In [8]:
parent_docs = []
total_before_clean = 0
total_after_clean = 0

for doc in documents:
    # –ü–æ–ª—É—á–∞–µ–º –Ω–∞–∑–≤–∞–Ω–∏–µ –¥–æ–∫—É–º–µ–Ω—Ç–∞ –∏–∑ index.json
    doc_filename = doc.metadata.get('filename', '')
    doc_title = filename_to_title.get(doc_filename, '')
    
    # 1. –†–∞–∑–±–∏–≤–∞–µ–º –ø–æ –∑–∞–≥–æ–ª–æ–≤–∫–∞–º (–î–û –æ—á–∏—Å—Ç–∫–∏!)
    md_chunks = markdown_splitter.split_text(doc.page_content or "")
    
    for chunk in md_chunks:
        # –û–±—ä–µ–¥–∏–Ω—è–µ–º –º–µ—Ç–∞–¥–∞–Ω–Ω—ã–µ
        merged_meta = {**(doc.metadata or {}), **(chunk.metadata or {})}
        
        # –î–æ–±–∞–≤–ª—è–µ–º –Ω–∞–∑–≤–∞–Ω–∏–µ –¥–æ–∫—É–º–µ–Ω—Ç–∞
        merged_meta["doc_title"] = doc_title
        
        # –°–æ–∑–¥–∞–µ–º breadcrumbs –∏ path (–∫–∞–∫ –≤ clean_wiki_docs)
        headers = [merged_meta.get(k, "") for k in ("H1", "H2", "H3") if k in merged_meta]
        merged_meta["breadcrumbs"] = " / ".join(headers) if headers else ""
        
        # Path: doc_title / H1 / H2 / H3
        path_parts = [doc_title] if doc_title else []
        path_parts.extend(headers)
        merged_meta["path"] = " / ".join(path_parts) if path_parts else ""
        
        # Level: –æ–ø—Ä–µ–¥–µ–ª—è–µ–º –ø–æ –Ω–∞–ª–∏—á–∏—é –∑–∞–≥–æ–ª–æ–≤–∫–æ–≤
        merged_meta["level"] = 3 if "H3" in merged_meta else 2 if "H2" in merged_meta else 1 if "H1" in merged_meta else 0
        
        # –°–æ—Ö—Ä–∞–Ω—è–µ–º —Ä–∞–∑–º–µ—Ä –¥–æ –æ—á–∏—Å—Ç–∫–∏
        total_before_clean += tiktoken_len(chunk.page_content)
        
        # 2. –¢–ï–ü–ï–†–¨ –æ—á–∏—â–∞–µ–º —Ç–µ–∫—Å—Ç chunk (lowercase –∏ —É–¥–∞–ª–µ–Ω–∏–µ —à—É–º–∞)
        chunk.page_content = clean_document_text(chunk.page_content)
        total_after_clean += tiktoken_len(chunk.page_content)
        
        # 3. –°–æ–∑–¥–∞–µ–º parent_id
        merged_meta["parent_id"] = stable_parent_id(merged_meta, chunk.page_content)
        chunk.metadata = merged_meta
        
        # –ï—Å–ª–∏ —á–∞–Ω–∫ —Å–ª–∏—à–∫–æ–º –±–æ–ª—å—à–æ–π - —Ä–∞–∑–±–∏–≤–∞–µ–º
        if tiktoken_len(chunk.page_content) > 1500:
            sub_chunks = parent_recursive_splitter.split_documents([chunk])
            for sub in sub_chunks:
                sub.metadata = merged_meta.copy()
                sub.metadata["parent_id"] = stable_parent_id(sub.metadata, sub.page_content)
            parent_docs.extend(sub_chunks)
        else:
            parent_docs.append(chunk)

print(f" –°–æ–∑–¥–∞–Ω–æ {len(parent_docs)} parent chunks")

# –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞
parent_tokens = [tiktoken_len(doc.page_content) for doc in parent_docs]
levels = [doc.metadata.get('level', 0) for doc in parent_docs]

print(f"\n –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ parent chunks:")
print(f"   ‚Ä¢ –°—Ä–µ–¥–Ω–µ–µ: {sum(parent_tokens)/len(parent_tokens):.0f} —Ç–æ–∫–µ–Ω–æ–≤")
print(f"   ‚Ä¢ –ú–∏–Ω–∏–º—É–º: {min(parent_tokens)} —Ç–æ–∫–µ–Ω–æ–≤")
print(f"   ‚Ä¢ –ú–∞–∫—Å–∏–º—É–º: {max(parent_tokens)} —Ç–æ–∫–µ–Ω–æ–≤")
print(f"   ‚Ä¢ –ú–µ–¥–∏–∞–Ω–∞: {sorted(parent_tokens)[len(parent_tokens)//2]} —Ç–æ–∫–µ–Ω–æ–≤")

print(f"\n –≠—Ñ—Ñ–µ–∫—Ç –æ—á–∏—Å—Ç–∫–∏:")
print(f"   ‚Ä¢ –î–æ –æ—á–∏—Å—Ç–∫–∏: {total_before_clean:,} —Ç–æ–∫–µ–Ω–æ–≤")
print(f"   ‚Ä¢ –ü–æ—Å–ª–µ –æ—á–∏—Å—Ç–∫–∏: {total_after_clean:,} —Ç–æ–∫–µ–Ω–æ–≤")
print(f"   ‚Ä¢ –°–æ–∫—Ä–∞—â–µ–Ω–∏–µ: {(1 - total_after_clean/total_before_clean)*100:.1f}%")

print(f"\n –†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ —É—Ä–æ–≤–Ω—è–º –∑–∞–≥–æ–ª–æ–≤–∫–æ–≤:")
for i in range(4):
    count = levels.count(i)
    if count > 0:
        level_name = ["–ë–µ–∑ –∑–∞–≥–æ–ª–æ–≤–∫–∞", "H1", "H2", "H3"][i]
        print(f"   ‚Ä¢ {level_name}: {count} chunks ({count/len(parent_docs)*100:.1f}%)")

# –ü—Ä–æ–≤–µ—Ä—è–µ–º –ø—Ä–∏–º–µ—Ä—ã –º–µ—Ç–∞–¥–∞–Ω–Ω—ã—Ö
print(f"\n –ü—Ä–∏–º–µ—Ä –º–µ—Ç–∞–¥–∞–Ω–Ω—ã—Ö:")
if parent_docs:
    sample = parent_docs[10]  # –ë–µ—Ä–µ–º 10-–π –¥–ª—è —Ä–∞–∑–Ω–æ–æ–±—Ä–∞–∑–∏—è
    print(f"   ‚Ä¢ doc_title: {sample.metadata.get('doc_title', '–Ω–µ—Ç')}")
    print(f"   ‚Ä¢ path: {sample.metadata.get('path', '–Ω–µ—Ç')}")
    print(f"   ‚Ä¢ breadcrumbs: {sample.metadata.get('breadcrumbs', '–Ω–µ—Ç')}")
    print(f"   ‚Ä¢ level: {sample.metadata.get('level', 0)}")

 –°–æ–∑–¥–∞–Ω–æ 923 parent chunks

 –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ parent chunks:
   ‚Ä¢ –°—Ä–µ–¥–Ω–µ–µ: 668 —Ç–æ–∫–µ–Ω–æ–≤
   ‚Ä¢ –ú–∏–Ω–∏–º—É–º: 21 —Ç–æ–∫–µ–Ω–æ–≤
   ‚Ä¢ –ú–∞–∫—Å–∏–º—É–º: 1500 —Ç–æ–∫–µ–Ω–æ–≤
   ‚Ä¢ –ú–µ–¥–∏–∞–Ω–∞: 584 —Ç–æ–∫–µ–Ω–æ–≤

 –≠—Ñ—Ñ–µ–∫—Ç –æ—á–∏—Å—Ç–∫–∏:
   ‚Ä¢ –î–æ –æ—á–∏—Å—Ç–∫–∏: 653,624 —Ç–æ–∫–µ–Ω–æ–≤
   ‚Ä¢ –ü–æ—Å–ª–µ –æ—á–∏—Å—Ç–∫–∏: 595,848 —Ç–æ–∫–µ–Ω–æ–≤
   ‚Ä¢ –°–æ–∫—Ä–∞—â–µ–Ω–∏–µ: 8.8%

 –†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–æ —É—Ä–æ–≤–Ω—è–º –∑–∞–≥–æ–ª–æ–≤–∫–æ–≤:
   ‚Ä¢ –ë–µ–∑ –∑–∞–≥–æ–ª–æ–≤–∫–∞: 50 chunks (5.4%)
   ‚Ä¢ H2: 401 chunks (43.4%)
   ‚Ä¢ H3: 472 chunks (51.1%)

 –ü—Ä–∏–º–µ—Ä –º–µ—Ç–∞–¥–∞–Ω–Ω—ã—Ö:
   ‚Ä¢ doc_title: –ë–∞–π–µ—Å–æ–≤—Å–∫–∏–π –ø–æ–¥—Ö–æ–¥ –∫ –æ—Ü–µ–Ω–∏–≤–∞–Ω–∏—é
   ‚Ä¢ path: –ë–∞–π–µ—Å–æ–≤—Å–∫–∏–π –ø–æ–¥—Ö–æ–¥ –∫ –æ—Ü–µ–Ω–∏–≤–∞–Ω–∏—é / –°–æ–ø—Ä—è–∂—ë–Ω–Ω—ã–µ —Ä–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏—è
   ‚Ä¢ breadcrumbs: –°–æ–ø—Ä—è–∂—ë–Ω–Ω—ã–µ —Ä–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏—è
   ‚Ä¢ level: 2


In [9]:
parent_docs[0].metadata

{'source': '/home/llm-dev/project/lifelong_learning_assistant/data/yandex-handbook-downloaded/veroyatnostnyj-podhod-v-ml.md',
 'filename': 'veroyatnostnyj-podhod-v-ml.md',
 'total_tokens': 6115,
 'doc_title': '–í–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–Ω—ã–π –ø–æ–¥—Ö–æ–¥ –≤ ML',
 'breadcrumbs': '',
 'path': '–í–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–Ω—ã–π –ø–æ–¥—Ö–æ–¥ –≤ ML',
 'level': 0,
 'parent_id': '43a09260959c8673a66e2810d418de54'}

## 9. –ù–∞—Å—Ç—Ä–æ–π–∫–∞ Qdrant –∏ Redis

In [10]:
# Dense embeddings (OpenAI)
embeddings = OpenAIEmbeddings(
    model=embedding_model_name,
    openai_api_key=openai_api_key,
    openai_api_base=openai_api_base,
    chunk_size=1000
)

# Sparse embeddings (BM25 –¥–ª—è –≥–∏–±—Ä–∏–¥–Ω–æ–≥–æ –ø–æ–∏—Å–∫–∞)
sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")

print(" –≠–º–±–µ–¥–¥–∏–Ω–≥–∏ –Ω–∞—Å—Ç—Ä–æ–µ–Ω—ã (dense + sparse)")

# Qdrant
qdrant_client = QdrantClient(host="localhost", port=6333)
collection_name = "yandex_handbook_child_chunks"

try:
    qdrant_client.delete_collection(collection_name)
    print(f"üóëÔ∏è –£–¥–∞–ª–µ–Ω–∞ —Å—Ç–∞—Ä–∞—è –∫–æ–ª–ª–µ–∫—Ü–∏—è")
except:
    pass

# –°–æ–∑–¥–∞–µ–º –∫–æ–ª–ª–µ–∫—Ü–∏—é —Å –ø–æ–¥–¥–µ—Ä–∂–∫–æ–π –≥–∏–±—Ä–∏–¥–Ω–æ–≥–æ –ø–æ–∏—Å–∫–∞
qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config={
        "dense": VectorParams(size=3072, distance=Distance.COSINE)
    },
    sparse_vectors_config={
        "sparse": SparseVectorParams(
            index=SparseIndexParams(on_disk=False),
            modifier=models.Modifier.IDF,
        )
    },
)

# –í–µ–∫—Ç–æ—Ä–Ω–æ–µ —Ö—Ä–∞–Ω–∏–ª–∏—â–µ –≤ –≥–∏–±—Ä–∏–¥–Ω–æ–º —Ä–µ–∂–∏–º–µ
vectorstore = QdrantVectorStore(
    client=qdrant_client,
    collection_name=collection_name,
    embedding=embeddings,
    sparse_embedding=sparse_embeddings,
    retrieval_mode=RetrievalMode.HYBRID,
    vector_name="dense",
    sparse_vector_name="sparse",
)

print(f" Qdrant –Ω–∞—Å—Ç—Ä–æ–µ–Ω (HYBRID —Ä–µ–∂–∏–º): {collection_name}")

# Redis
parent_store = RedisStore(
    redis_url=redis_url,
    namespace="rag:parents"
)

print(f" Redis –Ω–∞—Å—Ç—Ä–æ–µ–Ω: {redis_url}")

Fetching 18 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18/18 [00:00<00:00, 18.51it/s]


 –≠–º–±–µ–¥–¥–∏–Ω–≥–∏ –Ω–∞—Å—Ç—Ä–æ–µ–Ω—ã (dense + sparse)
üóëÔ∏è –£–¥–∞–ª–µ–Ω–∞ —Å—Ç–∞—Ä–∞—è –∫–æ–ª–ª–µ–∫—Ü–∏—è
 Qdrant –Ω–∞—Å—Ç—Ä–æ–µ–Ω (HYBRID —Ä–µ–∂–∏–º): yandex_handbook_child_chunks
 Redis –Ω–∞—Å—Ç—Ä–æ–µ–Ω: redis://localhost:6379


## 10. –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö –≤ Qdrant –∏ Redis

In [11]:
print(" –ó–∞–≥—Ä—É–∂–∞–µ–º –¥–∞–Ω–Ω—ã–µ...")

# –§—É–Ω–∫—Ü–∏–∏ –¥–ª—è Redis
def doc_to_bytes(doc: Document) -> bytes:
    as_dict = dumpd(doc)
    return json.dumps(as_dict, ensure_ascii=False).encode("utf-8")

# 1. –ó–∞–≥—Ä—É–∂–∞–µ–º parent chunks –≤ Redis
parent_ids = []
batch = []
for doc in parent_docs:
    pid = doc.metadata["parent_id"]
    batch.append((pid, doc_to_bytes(doc)))
    parent_ids.append(pid)
    
    if len(batch) >= 1000:
        parent_store.mset(batch)
        batch.clear()

if batch:
    parent_store.mset(batch)

print(f" –ó–∞–≥—Ä—É–∂–µ–Ω–æ {len(parent_ids)} parent chunks –≤ Redis")

 –ó–∞–≥—Ä—É–∂–∞–µ–º –¥–∞–Ω–Ω—ã–µ...
 –ó–∞–≥—Ä—É–∂–µ–Ω–æ 923 parent chunks –≤ Redis


In [12]:
# 2. –°–æ–∑–¥–∞–µ–º child chunks –∏ –∑–∞–≥—Ä—É–∂–∞–µ–º –≤ Qdrant
child_docs = []
for parent in parent_docs:
    parent_id = parent.metadata["parent_id"]
    children = child_splitter.split_documents([parent])
    
    for child in children:
        # –û–±–Ω–æ–≤–ª—è–µ–º –º–µ—Ç–∞–¥–∞–Ω–Ω—ã–µ
        child.metadata.update(parent.metadata)
        child.metadata["parent_id"] = parent_id
        child.metadata["child_id"] = str(uuid4())
        child.metadata["is_child_chunk"] = True
        
        # –î–æ–±–∞–≤–ª—è–µ–º path –≤ –Ω–∞—á–∞–ª–æ —Ç–µ–∫—Å—Ç–∞ –¥–ª—è –ª—É—á—à–µ–≥–æ –∫–æ–Ω—Ç–µ–∫—Å—Ç–∞
        path = child.metadata.get("path", "")
        if path:
            child.page_content = f"{path}\n\n{child.page_content}"
        
        child_docs.append(child)

print(f" –°–æ–∑–¥–∞–Ω–æ {len(child_docs)} child chunks")
print(f"   ‚Ä¢ Path –¥–æ–±–∞–≤–ª–µ–Ω –≤ –Ω–∞—á–∞–ª–æ –∫–∞–∂–¥–æ–≥–æ child chunk –¥–ª—è –∫–æ–Ω—Ç–µ–∫—Å—Ç–∞")

# –ó–∞–≥—Ä—É–∂–∞–µ–º –≤ Qdrant —Å –ø—Ä–æ–≥—Ä–µ—Å—Å-–±–∞—Ä–æ–º
if child_docs:
    batch_size = 100
    print(f"\n –ó–∞–≥—Ä—É–∂–∞–µ–º {len(child_docs)} child chunks –≤ Qdrant...")
    
    for i in tqdm(range(0, len(child_docs), batch_size), desc="–ó–∞–≥—Ä—É–∑–∫–∞ –≤ Qdrant"):
        batch = child_docs[i:i+batch_size]
        vectorstore.add_documents(batch)
    
    print(f" Child chunks –∑–∞–≥—Ä—É–∂–µ–Ω—ã –≤ Qdrant")

# –ò—Ç–æ–≥–æ–≤–∞—è —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞
print(f"\n –ò—Ç–æ–≥–æ–≤–∞—è —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞:")
print(f"   ‚Ä¢ –ò—Å—Ö–æ–¥–Ω—ã—Ö –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤: {len(documents)}")
print(f"   ‚Ä¢ Parent chunks: {len(parent_docs)}")
print(f"   ‚Ä¢ Child chunks: {len(child_docs)}")
print(f"   ‚Ä¢ –°–æ–æ—Ç–Ω–æ—à–µ–Ω–∏–µ child/parent: {len(child_docs)/len(parent_docs):.1f}")

collection_info = qdrant_client.get_collection(collection_name)
print(f"\n Qdrant: {collection_info.points_count:,} –≤–µ–∫—Ç–æ—Ä–æ–≤")
print(f" Redis: {len(parent_ids):,} parent –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤")

# –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ –ø–æ child chunks
child_tokens = [tiktoken_len(doc.page_content) for doc in child_docs]
print(f"\n –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ child chunks:")
print(f"   ‚Ä¢ –°—Ä–µ–¥–Ω–µ–µ: {sum(child_tokens)/len(child_tokens):.0f} —Ç–æ–∫–µ–Ω–æ–≤")
print(f"   ‚Ä¢ –ú–∏–Ω–∏–º—É–º: {min(child_tokens)} —Ç–æ–∫–µ–Ω–æ–≤")
print(f"   ‚Ä¢ –ú–∞–∫—Å–∏–º—É–º: {max(child_tokens)} —Ç–æ–∫–µ–Ω–æ–≤")
print(f"   ‚Ä¢ –ú–µ–¥–∏–∞–Ω–∞: {sorted(child_tokens)[len(child_tokens)//2]} —Ç–æ–∫–µ–Ω–æ–≤")

# –ü—Ä–∏–º–µ—Ä child chunk —Å path
print(f"\n –ü—Ä–∏–º–µ—Ä child chunk:")
if child_docs:
    sample = child_docs[5]
    preview = sample.page_content[:300].replace('\n', ' ')
    print(f"   ‚Ä¢ –ü—Ä–µ–≤—å—é: {preview}...")

 –°–æ–∑–¥–∞–Ω–æ 2071 child chunks
   ‚Ä¢ Path –¥–æ–±–∞–≤–ª–µ–Ω –≤ –Ω–∞—á–∞–ª–æ –∫–∞–∂–¥–æ–≥–æ child chunk –¥–ª—è –∫–æ–Ω—Ç–µ–∫—Å—Ç–∞

 –ó–∞–≥—Ä—É–∂–∞–µ–º 2071 child chunks –≤ Qdrant...


–ó–∞–≥—Ä—É–∑–∫–∞ –≤ Qdrant: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 21/21 [01:44<00:00,  4.98s/it]


 Child chunks –∑–∞–≥—Ä—É–∂–µ–Ω—ã –≤ Qdrant

 –ò—Ç–æ–≥–æ–≤–∞—è —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞:
   ‚Ä¢ –ò—Å—Ö–æ–¥–Ω—ã—Ö –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤: 68
   ‚Ä¢ Parent chunks: 923
   ‚Ä¢ Child chunks: 2071
   ‚Ä¢ –°–æ–æ—Ç–Ω–æ—à–µ–Ω–∏–µ child/parent: 2.2

 Qdrant: 2,071 –≤–µ–∫—Ç–æ—Ä–æ–≤
 Redis: 923 parent –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤

 –°—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ child chunks:
   ‚Ä¢ –°—Ä–µ–¥–Ω–µ–µ: 355 —Ç–æ–∫–µ–Ω–æ–≤
   ‚Ä¢ –ú–∏–Ω–∏–º—É–º: 20 —Ç–æ–∫–µ–Ω–æ–≤
   ‚Ä¢ –ú–∞–∫—Å–∏–º—É–º: 515 —Ç–æ–∫–µ–Ω–æ–≤
   ‚Ä¢ –ú–µ–¥–∏–∞–Ω–∞: 396 —Ç–æ–∫–µ–Ω–æ–≤

 –ü—Ä–∏–º–µ—Ä child chunk:
   ‚Ä¢ –ü—Ä–µ–≤—å—é: –í–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç–Ω—ã–π –ø–æ–¥—Ö–æ–¥ –≤ ML / –°–ª—É—á–∞–π–Ω–æ—Å—Ç—å –∫–∞–∫ –∏—Å—Ç–æ—á–Ω–∏–∫ –Ω–µ—Å–æ–≤–µ—Ä—à–µ–Ω—Å—Ç–≤–∞ –º–æ–¥–µ–ª–∏  —Å–≥–µ–Ω–µ—Ä–∏—Ä–æ–≤–∞–Ω–Ω—ã–µ –∏–∑ –º–æ–¥–µ–ª–µ–π —Å –∫–∞–∂–¥—ã–º –∏–∑ —Ç–∏–ø–æ–≤ —à—É–º–∞: –Ω–æ—Ä–º–∞–ª—å–Ω—ã–º, –ª–∞–ø–ª–∞—Å–æ–≤—Å–∫–∏–º –∏ –∫–æ—à–∏. !9 –∫–∞–∫ –≤—ã –º–æ–≥–ª–∏ –∑–∞–º–µ—Ç–∏—Ç—å, –≤ –∫–∞–∂–¥–æ–º –∏–∑ –ø–æ–¥—Ö–æ–¥–æ–≤ –ø–æ—Å–ª–µ —Ç–æ–≥–æ, –∫–∞–∫ –º—ã –∑–∞—Ñ–∏–∫—Å–∏—Ä–æ–≤–∞–ª–∏ –ø—Ä–∏

## 11. –¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ –ø–æ–∏—Å–∫–∞

In [13]:
def bytes_to_doc(b: bytes) -> Document:
    return loads(b.decode("utf-8"))

def search(query: str, k: int = 10):
    """–ü–æ–∏—Å–∫ —Å Parent-Child Retrieval"""
    # –ù–∞—Ö–æ–¥–∏–º child chunks
    child_results = vectorstore.similarity_search(query, k=k)
    
    # –ü–æ–ª—É—á–∞–µ–º parent chunks –∏–∑ Redis
    parent_ids = [doc.metadata.get("parent_id") for doc in child_results]
    parent_bytes = parent_store.mget(parent_ids)
    
    # –í–æ—Å—Å—Ç–∞–Ω–∞–≤–ª–∏–≤–∞–µ–º parent –¥–æ–∫—É–º–µ–Ω—Ç—ã
    results = []
    for pb in parent_bytes:
        if pb:
            parent_doc = bytes_to_doc(pb)
            parent_doc.metadata["chunk_type"] = "parent"
            results.append(parent_doc)
    
    return results

# –¢–µ—Å—Ç–æ–≤—ã–µ –∑–∞–ø—Ä–æ—Å—ã
test_queries = [
    "–±–∞–π–µ—Å–æ–≤—Å–∫–∏–π –ø–æ–¥—Ö–æ–¥ –≤ –º–∞—à–∏–Ω–Ω–æ–º –æ–±—É—á–µ–Ω–∏–∏",
    "—Ä–µ–≥—É–ª—è—Ä–∏–∑–∞—Ü–∏—è –≤ –ª–∏–Ω–µ–π–Ω–æ–π —Ä–µ–≥—Ä–µ—Å—Å–∏–∏",
    "–∞–ø–æ—Å—Ç–µ—Ä–∏–æ—Ä–Ω–æ–µ —Ä–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤"
]

print("üîç –¢–µ—Å—Ç–∏—Ä—É–µ–º –ø–æ–∏—Å–∫:\n" + "="*60)

for i, query in enumerate(test_queries, 1):
    print(f"\n{i}. –ó–∞–ø—Ä–æ—Å: '{query}'")
    results = search(query)
    
    if results:
        best = results[0]
        print(f"    –ù–∞–π–¥–µ–Ω–æ: {len(results)} parent chunks")
        print(f"    –õ—É—á—à–∏–π —Ä–µ–∑—É–ª—å—Ç–∞—Ç:")
        print(f"      ‚Ä¢ –§–∞–π–ª: {best.metadata.get('filename', 'unknown')}")
        print(f"      ‚Ä¢ Breadcrumbs: {best.metadata.get('breadcrumbs', '–Ω–µ—Ç')}")
        print(f"      ‚Ä¢ –†–∞–∑–º–µ—Ä: {tiktoken_len(best.page_content)} —Ç–æ–∫–µ–Ω–æ–≤")
        preview = best.page_content[:200].replace('\n', ' ')
        print(f"      ‚Ä¢ –ü—Ä–µ–≤—å—é: {preview}...")


üîç –¢–µ—Å—Ç–∏—Ä—É–µ–º –ø–æ–∏—Å–∫:

1. –ó–∞–ø—Ä–æ—Å: '–±–∞–π–µ—Å–æ–≤—Å–∫–∏–π –ø–æ–¥—Ö–æ–¥ –≤ –º–∞—à–∏–Ω–Ω–æ–º –æ–±—É—á–µ–Ω–∏–∏'


  return loads(b.decode("utf-8"))


    –ù–∞–π–¥–µ–Ω–æ: 10 parent chunks
    –õ—É—á—à–∏–π —Ä–µ–∑—É–ª—å—Ç–∞—Ç:
      ‚Ä¢ –§–∞–π–ª: bajesovskij-podhod-k-ocenivaniyu.md
      ‚Ä¢ Breadcrumbs: –ë–∞–π–µ—Å–æ–≤—Å–∫–∏–π –ø–æ–¥—Ö–æ–¥ –∏ –¥–æ–æ–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–µ–π
      ‚Ä¢ –†–∞–∑–º–µ—Ä: 326 —Ç–æ–∫–µ–Ω–æ–≤
      ‚Ä¢ –ü—Ä–µ–≤—å—é: ## –±–∞–π–µ—Å–æ–≤—Å–∫–∏–π –ø–æ–¥—Ö–æ–¥ –∏ –¥–æ–æ–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–µ–π –¥–æ —Å–∏—Ö –ø–æ—Ä –º—ã –≤ –æ—Å–Ω–æ–≤–Ω–æ–º —Ä–∞—Å—Å—É–∂–¥–∞–ª–∏ –æ –º–æ–¥–µ–ª—è—Ö –º–∞—à–∏–Ω–Ω–æ–≥–æ –æ–±—É—á–µ–Ω–∏—è –∫–∞–∫ –æ —á—ë–º-—Ç–æ, —á—Ç–æ –æ–¥–∏–Ω —Ä–∞–∑ –æ–±—É—á–∞–µ—Ç—Å—è –∏ –¥–∞–ª—å—à–µ –Ω–∞–≤—Å–µ–≥–¥–∞ –∑–∞—Å—Ç—ã–≤–∞–µ—Ç –≤ —Ç–∞–∫–æ–º –≤–∏–¥–µ, –Ω–æ –≤ –∂–∏–∑–Ω–∏ —Ç–∞...

2. –ó–∞–ø—Ä–æ—Å: '—Ä–µ–≥—É–ª—è—Ä–∏–∑–∞—Ü–∏—è –≤ –ª–∏–Ω–µ–π–Ω–æ–π —Ä–µ–≥—Ä–µ—Å—Å–∏–∏'
    –ù–∞–π–¥–µ–Ω–æ: 10 parent chunks
    –õ—É—á—à–∏–π —Ä–µ–∑—É–ª—å—Ç–∞—Ç:
      ‚Ä¢ –§–∞–π–ª: tonkosti-obucheniya.md
      ‚Ä¢ Breadcrumbs: –†–µ–≥—É–ª—è—Ä–∏–∑–∞—Ü–∏—è –Ω–µ–π—Ä–æ–Ω–Ω—ã—Ö —Å–µ—Ç–µ–π
      ‚Ä¢ –†–∞–∑–º–µ—Ä: 417 —Ç–æ–∫–µ–Ω–æ–≤
      ‚Ä¢ –ü—Ä