In [1]:
from datasets import load_dataset
from pathlib import Path
import os

In [2]:
# 1. Load.
# glob() yields files in filesystem-dependent order; sorting the paths makes
# the row order (and therefore the "first N" sample below) reproducible.
data_files = sorted(str(f) for f in Path('../data/raw/parquet/').glob('*.parquet'))
if not data_files:
    raise FileNotFoundError("No *.parquet files found in ../data/raw/parquet/")
ds = load_dataset("parquet", data_files=data_files, split="train")

# 2. Filter by size.
# len() counts characters of the HTML string (not encoded bytes); rows whose
# 'html' is null are dropped instead of raising TypeError inside len().
ds = ds.filter(lambda x: x['html'] is not None and len(x['html']) >= 50 * 1024)

# 3. Take a sample of (at most) N articles.
# Clamp to the dataset size: select() raises on out-of-range indices when
# fewer than N rows survive the filter.
N = 100
ds = ds.select(range(min(N, len(ds))))

In [3]:
def process_example(example):
    """Convert one article's HTML into Markdown, preserving formulas as LaTeX.

    The article body is taken from the first ``<article>`` element or, failing
    that, the first element with class ``ltx_page_content``. Bibliography,
    author, footnote-text and error nodes are removed, as is the section headed
    "Acknowledg(e)ment(s)". Every ``ltx_Math`` element that has a non-empty
    ``alttext`` attribute is rendered as ``$<alttext>$``.

    Args:
        example: Mapping with an ``'html'`` key holding the page source.

    Returns:
        ``{"markdown": <str>}`` on success, or ``{"markdown": None}`` when the
        HTML is empty/missing, contains the converter's fatal-error marker, or
        has no recognizable article container.
    """
    html_text = example['html']

    # Cheap rejections first, so that unusable rows never pay for the imports
    # and the HTML parse below.
    if not html_text or "Conversion to HTML had a Fatal error" in html_text:
        return {"markdown": None}

    # Imports stay INSIDE the function so it works under multiprocessing on Windows.
    from bs4 import BeautifulSoup
    from markdownify import markdownify as md
    import re

    soup = BeautifulSoup(html_text, 'html.parser')
    article = soup.find('article') or soup.find(class_='ltx_page_content')

    if article is None:
        return {"markdown": None}

    # Strip boilerplate. Only the article subtree is serialized later, so the
    # search is scoped to it; this also guarantees the article node itself is
    # never one of the decomposed matches.
    for selector in ['.ltx_bibliography', '.ltx_authors', '.ltx_role_footnotetext', '.ltx_ERROR']:
        for el in article.select(selector):
            el.decompose()

    # Drop the acknowledgements section. The pattern accepts both spellings
    # ("Acknowledgements" and "Acknowledgments"), singular or plural.
    header = article.find(['h2', 'h3'], string=re.compile(r'^\s*Acknowledge?ments?\s*$', re.IGNORECASE))
    if header is not None:
        section = header.find_parent('section')
        # Only remove a section that lies strictly inside the article; removing
        # an enclosing section (or the article itself) would destroy the very
        # content that is about to be converted.
        if (
            section is not None
            and section is not article
            and any(parent is article for parent in section.parents)
        ):
            section.decompose()

    # Protect LaTeX from Markdown escaping: swap each formula for a purely
    # alphanumeric placeholder, convert, then swap the LaTeX back in.
    math_registry = {}
    for i, math in enumerate(article.find_all(class_='ltx_Math')):
        latex = math.get('alttext', '')
        if latex:
            placeholder = f"MATHITEM{i}END"
            math_registry[placeholder] = f"${latex}$"
            math.replace_with(f" {placeholder} ")

    markdown_text = md(str(article), heading_style="ATX")

    # Restore all formulas in a single pass over the text. A function is used
    # as the replacement so backslashes in the LaTeX are inserted literally
    # rather than being interpreted as regex escapes; unknown tokens are left
    # untouched.
    markdown_text = re.sub(
        r'MATHITEM\d+END',
        lambda match: math_registry.get(match.group(0), match.group(0)),
        markdown_text,
    )

    return {"markdown": markdown_text}

In [4]:
# Scalable processing: run the HTML -> Markdown conversion over the sample,
# using one worker process per CPU reported by the OS.
# Only the old HTML column is removed; the remaining columns (per the original
# note: doc_id and source_path) are kept automatically.
processed_ds = ds.map(
    function=process_example,
    remove_columns=["html"],
    num_proc=os.cpu_count(),
)

Map (num_proc=12):   0%|          | 0/100 [00:00<?, ? examples/s]

In [5]:
# 4. Final cleanup and save.
# Rows for which the conversion produced no Markdown (markdown is None) are dropped.
processed_ds = processed_ds.filter(lambda x: x['markdown'] is not None)

# Make sure the output directory exists: writing the parquet file fails on a
# fresh checkout where ../data/processed/ has not been created yet.
output_path = Path('../data/processed/texts.parquet')
output_path.parent.mkdir(parents=True, exist_ok=True)

# Kept as the last expression so the cell still displays to_parquet()'s return value.
processed_ds.to_parquet(str(output_path))

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

6097342