In [None]:
import json
import os
from pathlib import Path
from tqdm.auto import tqdm
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter

# --- CONFIG ---
# Where the chunks are written (JSONL) and where the Markdown inputs are read from.
OUTPUT_FILE = Path('../data/processed/chunks.jsonl')
INPUT_DIR = Path('../data/processed/md')
# Chunk size and overlap between neighbouring chunks, both in characters.
CHUNK_SIZE, CHUNK_OVERLAP = 2_000, 200

In [None]:
# Markdown heading prefix -> metadata key recorded for text under that heading.
# Expanded into the list of (prefix, key) tuples handed to the splitter.
headers_to_split_on = list({
    "#": "Title",
    "##": "Section",
    "###": "Subsection",
}.items())

# First-stage splitter: cuts a document along its Markdown headings.
md_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
)

In [None]:
# Second-stage splitter: breaks text into pieces of at most CHUNK_SIZE
# characters with CHUNK_OVERLAP characters of overlap, trying the separators
# in order (paragraph break, line break, sentence end, space, any character).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    # Raw string: "\." in a normal string literal is an invalid escape sequence
    # and triggers a warning. The lookbehind matches the position right after
    # ". " so the sentence-ending period stays with its sentence.
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
    # Separators are escaped and matched literally unless this flag is set,
    # in which case the lookbehind above would never match real text.
    is_separator_regex=True,
)

In [None]:
def _arxiv_id_from_stem(stem: str) -> str:
    """Return ``stem`` with a trailing ``v<digits>`` version suffix removed.

    Only a suffix at the very end is stripped, so a ``v`` elsewhere in the
    name (e.g. ``solv-int9901001v2``) does not truncate the identifier.
    Stems without such a suffix are returned unchanged.
    """
    base, sep, version = stem.rpartition('v')
    if sep and base and version.isdigit():
        return base
    return stem


def run_chunking(min_chunk_chars: int = 50):
    """Split every ``*.md`` file in ``INPUT_DIR`` into chunks and write them to ``OUTPUT_FILE``.

    Each file is first split with ``md_splitter`` and the result is split again
    with ``text_splitter``. Every resulting chunk becomes one JSON line with the
    keys ``id``, ``text``, ``metadata`` and ``source``. ``OUTPUT_FILE`` is opened
    in write mode, so an existing file is overwritten.

    Args:
        min_chunk_chars: Chunks whose text is shorter than this many characters
            are skipped. Defaults to 50.

    Returns:
        None. Progress and a final summary are printed.

    Errors raised while processing a single file are caught and printed, and
    processing continues with the next file.
    """
    # Sorted so that the order of records in the output is reproducible;
    # glob() gives no ordering guarantee.
    files = sorted(INPUT_DIR.glob('*.md'))
    print(f"Начинаем нарезку {len(files)} файлов...")

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

    count = 0
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f_out:
        for file_path in tqdm(files, desc="Processing"):
            try:
                text = file_path.read_text(encoding='utf-8')

                header_splits = md_splitter.split_text(text)
                final_chunks = text_splitter.split_documents(header_splits)

                # File stems are presumably arXiv ids with a version suffix
                # (e.g. "2301.12345v2") - verify against the producer of INPUT_DIR.
                arxiv_id = _arxiv_id_from_stem(file_path.stem)

                for chunk in final_chunks:
                    # Drop very short fragments.
                    if len(chunk.page_content) < min_chunk_chars:
                        continue

                    record = {
                        "id": arxiv_id,
                        "text": chunk.page_content,
                        # Metadata carried by the chunk (presumably the
                        # heading keys assigned by md_splitter).
                        "metadata": chunk.metadata,
                        "source": file_path.name,
                    }

                    # ensure_ascii=False keeps non-ASCII text readable in the file.
                    f_out.write(json.dumps(record, ensure_ascii=False) + '\n')
                    count += 1

            except Exception as e:
                # Deliberately best-effort: one bad file must not abort the run.
                print(f"Ошибка в файле {file_path.name}: {e}")

    print(f"Готово. Создано {count} чанков.")
    print(f"Файл сохранен: {OUTPUT_FILE}")

In [None]:
# Run the chunking step; this rewrites OUTPUT_FILE from scratch.
run_chunking()