In [None]:
from pathlib import Path
from bs4 import BeautifulSoup
import re
from markdownify import markdownify as md
from tqdm.notebook import tqdm

In [None]:
# Matches an acknowledgements heading in either spelling ("Acknowledgements" /
# "Acknowledgments", singular or plural), optionally preceded by a section
# number such as "7" or "7.1.".
_ACK_HEADING_RE = re.compile(
    r'^(?:\d+(?:\.\d+)*\.?\s+)?Acknowledge?ments?$', re.IGNORECASE
)

# Matches the placeholders that temporarily stand in for math elements.
_MATH_PLACEHOLDER_RE = re.compile(r'MATHITEM\d+END')


def _is_inside(node, root):
    """Return True when `root` is a proper ancestor of `node` (identity check)."""
    return any(parent is root for parent in node.parents)


def _remove_all(root, *args, **kwargs):
    """Decompose every descendant of `root` matched by `root.find(*args, **kwargs)`.

    The tree is re-queried after each removal, so a match nested inside an
    already destroyed element is never touched.
    """
    node = root.find(*args, **kwargs)
    while node is not None:
        node.decompose()
        node = root.find(*args, **kwargs)


def _remove_acknowledgements(article):
    """Remove the section that holds the first acknowledgements heading, if any."""
    for heading in article.find_all(['h2', 'h3']):
        # get_text() also covers headings whose text is split across child tags
        # (e.g. a separate span holding the section number).
        if _ACK_HEADING_RE.match(heading.get_text(' ', strip=True)):
            section = heading.find_parent('section')
            # Never remove a section that encloses the article itself.
            if section is not None and _is_inside(section, article):
                section.decompose()
            return


def _remove_error_blocks(article):
    """Remove every conversion-error span together with its enclosing <div>.

    When the nearest <div> is not strictly inside the article (it is the article
    itself, one of its ancestors, or missing), only the error span is removed so
    that the rest of the article survives.
    """
    error = article.find('span', class_='ltx_ERROR')
    while error is not None:
        container = error.find_parent('div')
        if container is not None and _is_inside(container, article):
            container.decompose()
        else:
            error.decompose()
        error = article.find('span', class_='ltx_ERROR')


def preprocess_arxiv_html(html_text):
    """Convert the HTML of an arXiv article to Markdown, keeping formulas as LaTeX.

    Before conversion the bibliography, the acknowledgements section, the author
    block, footnote-text spans and conversion-error blocks are stripped from the
    article. Every element with class ``ltx_Math`` that has a non-empty
    ``alttext`` attribute is rendered as ``$<alttext>$``.

    Args:
        html_text: Full HTML document as a string.

    Returns:
        The Markdown text, or ``None`` when the document contains the
        "Conversion to HTML had a Fatal error" marker or has neither an
        ``<article>`` element nor an element with class ``ltx_page_content``.
    """
    if "Conversion to HTML had a Fatal error" in html_text:
        return None

    soup = BeautifulSoup(html_text, 'html.parser')
    article = soup.find('article') or soup.find(class_='ltx_page_content')

    if not article:
        return None

    # Only `article` is converted below, so all clean-up is scoped to it.
    # Remove references
    _remove_all(article, 'section', class_='ltx_bibliography')
    # Remove acknowledgements
    _remove_acknowledgements(article)
    # Remove authors
    _remove_all(article, 'div', class_='ltx_authors')
    # Remove footnotetext
    _remove_all(article, 'span', class_='ltx_role_footnotetext')
    # Remove conversion errors
    _remove_error_blocks(article)

    # Swap math elements for plain-text placeholders before the Markdown
    # conversion and put the LaTeX source back afterwards.
    math_registry = {}
    for i, math in enumerate(article.find_all(class_='ltx_Math')):
        latex = math.get('alttext', '')
        if latex:
            # `i` counts every math element, so placeholders stay unique even
            # when some elements are skipped for lacking alttext.
            placeholder = f"MATHITEM{i}END"
            math_registry[placeholder] = f"${latex}$"
            math.replace_with(f" {placeholder} ")

    markdown_text = md(str(article), heading_style="ATX")

    # Restore all formulas in one pass. A callable replacement is used so that
    # backslashes in the LaTeX are inserted literally instead of being
    # interpreted as regex escapes.
    return _MATH_PLACEHOLDER_RE.sub(
        lambda match: math_registry.get(match.group(0), match.group(0)),
        markdown_text,
    )

In [None]:
# Where the source .html files are read from and where the .md files are written.
DATA_DIR = Path('../data/raw/html/')
PROCESSED_DATA_DIR = Path('../data/processed/md/')

# Create the output directory (and any missing parents); no error if it already exists.
PROCESSED_DATA_DIR.mkdir(exist_ok=True, parents=True)

In [None]:
# Files smaller than 50 KB are not processed.
MIN_HTML_SIZE_BYTES = 50 * 1024
# CHANGE THE NUMBER OF ARTICLES FOR TEST RUNS HERE (None = process every file).
MAX_ARTICLES = 100

# Path.glob yields entries in an unspecified order; sorting makes the selected
# subset the same on every run.
html_candidates = sorted(DATA_DIR.glob('*.html'))
data = [f for f in html_candidates if f.stat().st_size >= MIN_HTML_SIZE_BYTES][:MAX_ARTICLES]

In [None]:
# Convert every selected HTML file to Markdown. A failure on one file is
# reported and does not stop the rest of the batch.
for html_file in tqdm(data, total=len(data)):
    try:
        html_content = html_file.read_text(encoding='utf-8')
        md_content = preprocess_arxiv_html(html_content)

        # preprocess_arxiv_html returns None for documents it cannot convert;
        # skip them before any output file is created, so no empty .md is left behind.
        if md_content is None:
            print(f'{html_file.name}: skipped (nothing to convert)')
            continue

        # Append ".md" to the stem explicitly: stems may themselves contain dots,
        # so Path.with_suffix would cut part of the name off.
        output_path = PROCESSED_DATA_DIR / f'{html_file.stem}.md'
        output_path.write_text(md_content, encoding='utf-8')
    except Exception as e:
        # Best-effort batch: say which file failed and why, then carry on.
        print(f'{html_file.name}: {type(e).__name__}: {e}')

In [None]:
# Collect the generated Markdown files. Path.glob yields entries in an
# unspecified order, so sort to keep the listing stable between runs.
data_processed = sorted(PROCESSED_DATA_DIR.glob('*.md'))

In [None]:
# Print a numbered preview (file stem plus the first 50 characters) of every
# processed file, with a ruler line after each entry.
separator = f'\n{"="*70}\n'
for i, file in enumerate(data_processed):
    content = file.read_text(encoding='utf-8')
    preview = content[:50]
    print(f'{i+1}.|{file.stem}|\n {preview}', end=separator)