In [1]:
from pathlib import Path
from datasets import Dataset
import os

from transformers import AutoTokenizer
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter

In [2]:
# --- КОНФИГ ---
MODEL_ID = "Qwen/Qwen3-Embedding-0.6B"
INPUT_FILE = Path('../data/processed/parquet/processed_data.parquet')
OUTPUT_PARQUET = Path('../data/processed/all_chunks.parquet')

CHUNK_SIZE_TOKENS = 512 
CHUNK_OVERLAP_TOKENS = 64

BATCH_SIZE = 100

In [3]:
ds = Dataset.from_parquet(str(INPUT_FILE))
print(f"Dataset загружен. Количество строк: {len(ds)}")

Dataset загружен. Количество строк: 14594


In [4]:
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Embedding-0.6B")

md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[
    ("#", "Header_1"), ("##", "Header_2"), ("###", "Header_3")
])

text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer=tokenizer,
    chunk_size=512,
    chunk_overlap=50
)

In [5]:
def process_batch(batch, tokenizer, md_splitter, text_splitter):
    chunk_texts, chunk_doc_ids, chunk_sources, chunk_metadata = [], [], [], []

    for doc_id, md_content, source_path in zip(batch['doc_id'], batch['md'], batch['source_path']):
        # Защита от None или пустых строк
        if md_content is None or not isinstance(md_content, str) or len(md_content.strip()) == 0:
            continue
            
        try:
            # Сплитим Markdown
            sections = md_splitter.split_text(md_content)
            # Сплитим на чанки
            chunks = text_splitter.split_documents(sections)
            
            for chunk in chunks:
                headers = [chunk.metadata.get(f"Header_{i}", "") for i in range(1, 4)]
                header_context = " > ".join([h for h in headers if h])
                full_text = f"{header_context}: {chunk.page_content}" if header_context else chunk.page_content
                
                # Дополнительная проверка на длину итогового чанка
                if len(full_text.strip()) < 10:
                    continue
                    
                chunk_texts.append(full_text)
                chunk_doc_ids.append(doc_id)
                chunk_sources.append(source_path)
                chunk_metadata.append(chunk.metadata)
                
        except Exception as e:
            print(f"\n[ERROR] Ошибка при обработке doc_id {doc_id} ({source_path}): {e}")
            continue

    return {
        "text": chunk_texts, 
        "doc_id": chunk_doc_ids, 
        "source_path": chunk_sources, 
        "metadata": chunk_metadata
    }

In [6]:
chunked_ds = ds.map(
    process_batch, 
    batched=True,
    batch_size=BATCH_SIZE,
    remove_columns=ds.column_names,
    num_proc=os.cpu_count(),
    fn_kwargs={'tokenizer': tokenizer, 'md_splitter': md_splitter, 'text_splitter': text_splitter}
)

Map (num_proc=12):   0%|          | 0/14594 [00:00<?, ? examples/s]

In [7]:
# 4. Сохранение итогового результата
OUTPUT_PARQUET.parent.mkdir(parents=True, exist_ok=True)
chunked_ds.to_parquet(str(OUTPUT_PARQUET))

print(f"Готово. Итоговое количество чанков: {len(chunked_ds)}")

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Готово. Итоговое количество чанков: 767801
