In [1]:
from datasets import load_dataset
from pathlib import Path
import os
from tqdm.notebook import tqdm

In [2]:
# Path configuration.
# Both locations are relative paths, so they resolve against the current
# working directory of the kernel.
RAW_DATA_PATH = Path('../data/raw/parquet/')
PROCESSED_DATA_PATH = Path('../data/processed/parquet/')

# Create the output directory (and any missing parents) up front;
# an already existing directory is not an error.
PROCESSED_DATA_PATH.mkdir(parents=True, exist_ok=True)

In [3]:
def process_batch(batch):
    """Convert the HTML documents of a batch into cleaned Markdown.

    For every entry of ``batch['html']`` the function:

    1. rejects empty documents and documents that contain the marker
       ``"Conversion to HTML had a Fatal error"``;
    2. picks the ``<article>`` element (or, failing that, the first element
       with class ``ltx_page_content``) as the content root;
    3. removes, inside that root, every bibliography section,
       acknowledgements section, authors block, footnote-text span and
       error span (together with its enclosing ``div`` when that ``div``
       lies inside the root);
    4. swaps each ``ltx_Math`` element that has an ``alttext`` attribute for
       a plain-text placeholder, converts the root to Markdown, and then
       substitutes ``$<alttext>$`` for the placeholders.

    Args:
        batch: Mapping with an ``'html'`` key holding a sequence of HTML
            strings (entries may be ``None`` or empty).

    Returns:
        The same ``batch`` object, with ``batch['html']`` replaced by a list
        of Markdown strings; entries that were rejected or had no content
        root are ``None``.
    """
    # NOTE(review): imports are kept function-local, presumably so the
    # function stays self-contained when it is shipped to the worker
    # processes of a multi-process map -- confirm before hoisting them.
    from bs4 import BeautifulSoup
    import re
    from markdownify import markdownify as md

    ack_title = re.compile(r'^\s*Acknowledgements?\s*$', re.IGNORECASE)
    placeholder_re = re.compile(r'MATHITEM\d+END')

    def _inside(node, root):
        """Return True when ``node`` is a strict descendant of ``root``."""
        # Identity comparison: bs4 tags compare equal by content, which is
        # both slow and not what is meant here.
        return any(parent is root for parent in node.parents)

    def _drop(nodes):
        """Decompose each node once, skipping nodes nested in another target."""
        unique = list({id(node): node for node in nodes if node is not None}.values())
        marked = {id(node) for node in unique}
        # Decide what to remove BEFORE removing anything: decomposing an
        # outer node also destroys the inner ones, which must not be
        # touched afterwards.
        outermost = [
            node for node in unique
            if not any(id(parent) in marked for parent in node.parents)
        ]
        for node in outermost:
            node.decompose()

    processed_htmls = []

    for html_text in batch['html']:
        # 1. Input validation
        if not html_text or "Conversion to HTML had a Fatal error" in html_text:
            processed_htmls.append(None)
            continue

        soup = BeautifulSoup(html_text, 'html.parser')
        article = soup.find('article') or soup.find(class_='ltx_page_content')

        if not article:
            processed_htmls.append(None)
            continue

        # 2. Cleanup. Everything is searched inside `article` (only the
        # article is converted) and ALL matches are removed, not just the
        # first one.

        # References
        _drop(article.find_all('section', class_='ltx_bibliography'))

        # Acknowledgements: remove the section that owns the heading, but
        # never the article itself or one of its ancestors.
        ack_sections = []
        for header in article.find_all(['h2', 'h3'], string=ack_title):
            section = header.find_parent('section')
            if section is not None and _inside(section, article):
                ack_sections.append(section)
        _drop(ack_sections)

        # Authors
        _drop(article.find_all('div', class_='ltx_authors'))

        # Footnote text
        _drop(article.find_all('span', class_='ltx_role_footnotetext'))

        # Conversion errors: remove the enclosing div when it lies inside
        # the article; otherwise remove only the error span so the article
        # (or its wrapper) is not destroyed.
        error_targets = []
        for error_span in article.find_all('span', class_='ltx_ERROR'):
            container = error_span.find_parent('div')
            if container is not None and _inside(container, article):
                error_targets.append(container)
            else:
                error_targets.append(error_span)
        _drop(error_targets)

        # 3. Math handling: protect LaTeX from the Markdown converter by
        # replacing each formula with an alphanumeric placeholder.
        math_registry = {}
        for i, math in enumerate(article.find_all(class_='ltx_Math')):
            latex = math.get('alttext', '')
            if latex:
                placeholder = f"MATHITEM{i}END"
                math_registry[placeholder] = f"${latex}$"
                math.replace_with(f" {placeholder} ")

        # 4. Conversion to Markdown
        markdown_text = md(str(article), heading_style="ATX")

        # 5. Put the math back in a single pass over the text; unknown
        # placeholder-looking tokens are left untouched.
        markdown_text = placeholder_re.sub(
            lambda match: math_registry.get(match.group(0), match.group(0)),
            markdown_text,
        )

        processed_htmls.append(markdown_text)

    batch['html'] = processed_htmls

    return batch

In [4]:
# Load every raw parquet shard found in the input directory.
raw_glob = str(RAW_DATA_PATH / "*.parquet")
ds = load_dataset("parquet", data_files=raw_glob)

# Convert HTML to Markdown in small batches across all available cores,
# then rename the column to reflect its new content.
map_options = dict(batched=True, batch_size=10, num_proc=os.cpu_count())
ds_processed = ds.map(process_batch, **map_options).rename_column('html', 'md')

# Write the 'train' split to a single parquet file in the output directory.
output_file = PROCESSED_DATA_PATH / "processed_data.parquet"
ds_processed['train'].to_parquet(output_file)

Map (num_proc=12):   0%|          | 0/14594 [00:00<?, ? examples/s]

AttributeError: 'DatasetDict' object has no attribute 'to_parquet'