In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

# Load the first 50% of the train split of the "web-0.50-to-1.00" configuration
# of the AutoMathText dataset.
ds = load_dataset(
    path="math-ai/AutoMathText",
    name="web-0.50-to-1.00",
    split="train[:50%]",
)
# Drop the columns that the rest of the notebook does not use.
ds = ds.remove_columns(column_names=['url', 'date', 'meta'])

In [None]:
import re
import os
from datasets import Dataset

class LaTeX_placeholder_swap:
    """
    Functor-style helper that swaps LaTeX math in a string for placeholders and back.

    ``latex_to_placeholder`` replaces every ``$$...$$`` and ``$...$`` span with a
    numbered placeholder token and remembers the original spans;
    ``placeholder_to_latex`` puts the remembered spans back.

    The remembered spans live in class-level state that is reset on every
    ``latex_to_placeholder`` call, so the two methods must be used as a strict
    pair: mask one string, process it, restore it, and only then mask the next.
    """
    _INLINE_MATH = re.compile(r'\$(.+?)\$')
    # DOTALL lets a display formula that spans several lines be masked as one span.
    _DISPLAY_MATH = re.compile(r'\$\$(.+?)\$\$', re.DOTALL)
    _PLACEHOLDER_BASE = "___LATEX_PLACEHOLDER___"
    # Closing delimiter after the index. Without it, placeholder 1 is a prefix of
    # placeholder 10, and a digit that directly follows a formula in the text
    # would be read as part of the index.
    _PLACEHOLDER_END = "___"
    _PLACEHOLDER = re.compile(
        re.escape(_PLACEHOLDER_BASE) + r"(\d+)" + re.escape(_PLACEHOLDER_END)
    )

    # Defaults so that restoring before any masking is a harmless no-op.
    # `_reset` rebinds both attributes, so the list is never mutated while shared.
    _counter = 0
    _exprs = []

    @classmethod
    def _reset(cls):
        """Forget all spans remembered by the previous masking call."""
        cls._counter = 0
        cls._exprs = []

    @classmethod
    def _replace_with_placeholder(cls, match) -> str:
        """
        ``re.sub`` callback: remember the matched span and return its placeholder.

        Args:
            match: match object whose full text (delimiters included) is stored.

        Returns:
            The placeholder token that stands in for the matched span.
        """
        placeholder = f"{cls._PLACEHOLDER_BASE}{cls._counter}{cls._PLACEHOLDER_END}"
        cls._exprs.append(match.group(0))
        cls._counter += 1
        return placeholder

    @classmethod
    def latex_to_placeholder(cls, text: str) -> str:
        """
        Replace every LaTeX math span in ``text`` with a placeholder token.

        Previously remembered spans are discarded first.

        Args:
            text: string that may contain ``$...$`` and ``$$...$$`` spans.

        Returns:
            ``text`` with each math span replaced by a numbered placeholder.
        """
        cls._reset()
        # Display math goes first; otherwise the inline pattern would pair up the
        # dollar signs of a `$$...$$` span incorrectly.
        text = cls._DISPLAY_MATH.sub(cls._replace_with_placeholder, text)
        text = cls._INLINE_MATH.sub(cls._replace_with_placeholder, text)
        return text

    @classmethod
    def placeholder_to_latex(cls, text: str) -> str:
        """
        Put back the spans remembered by the last ``latex_to_placeholder`` call.

        Args:
            text: string containing placeholder tokens.

        Returns:
            ``text`` with every known placeholder replaced by its original span.
            Tokens whose index was never issued are left untouched.
        """
        exprs = cls._exprs

        def restore(match) -> str:
            index = int(match.group(1))
            return exprs[index] if index < len(exprs) else match.group(0)

        # A single pass with a callback: every token is resolved by its exact
        # index, and backslashes in the restored LaTeX are inserted literally.
        return cls._PLACEHOLDER.sub(restore, text)


    
class LaTeXify:
    """
    Functor that rewrites plain-text math in a dataset's ``text`` column as inline LaTeX.

    For every text, existing ``$...$`` / ``$$...$$`` spans are first masked with
    ``LaTeX_placeholder_swap``, then a fixed sequence of regex rewrites wraps the
    recognised plain-text constructs in ``$...$``, and finally the masked spans
    are restored.

    NOTE(review): the rewrites run one after another on the same string, so a
    later rule can still match inside a snippet produced by an earlier rule (or
    around a masked span) and produce nested dollar signs. The rules are
    heuristics and will also fire on ordinary prose such as "log in".
    """

    # Function name followed by whitespace or "(" and a single alphanumeric argument.
    FUNC_EXPR    = re.compile(r"\b(exp|sin|cos|tan|ln|log)(?:\s+|\()(?P<arg>[A-Za-z0-9]+)\)?", re.IGNORECASE)
    # Spelled-out Greek letter names (matched case-insensitively).
    GREEK        = re.compile(r"\b(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)\b", re.IGNORECASE)
    # |expr| with a single token between the bars.
    ABSOLUTE     = re.compile(r"\|\s*([A-Za-z0-9_{}^\\]+)\s*\|")
    # "lim_{...}" / "lim{...}" (group 1) or "lim_token" (group 2). The word
    # boundary and the mandatory "{" or "_" keep words such as "limit" from
    # matching; the unbraced form stops at whitespace and drops trailing punctuation.
    LIMIT        = re.compile(r"\blim_?\{([^{}]+)\}|\blim_([^\s{}$,;]*[^\s{}$,;.:!?])")
    # "(a, b)", "[a, b]" and the half-open variants.
    INTERVAL     = re.compile(r"([\[\(])\s*([\-]?[0-9\.A-Za-z_]+)\s*,\s*([\-]?[0-9\.A-Za-z_]+)\s*([\]\)])")
    ARROW        = re.compile(r"->|→")
    # SUBSCRIPT, SUPERSCRIPT and IMPLICIT_MUL are defined but not applied by
    # `_process_batch`.
    SUBSCRIPT    = re.compile(r"\b([A-Za-z])_([0-9]+)\b")
    SUPERSCRIPT  = re.compile(r"\b([A-Za-z])\^([0-9]+)\b")
    # "R+" when not followed by a word character, otherwise a stand-alone "R".
    # ("\bR\+?\b" could never match "R+" in front of whitespace or punctuation,
    # because there is no word boundary after "+".)
    SETS         = re.compile(r"\bR(?:\+(?!\w)|\b)")
    IMPLICIT_MUL = re.compile(r"\b(\d+)([A-Za-z])\b")
    # Binary operation between two operands, each a single letter or an integer.
    EXPR         = re.compile(r"\b([A-Za-z]|[0-9]+)\s*([/\^*+\-xX⋅])\s*([A-Za-z]|[0-9]+)\b")

    def __call__(self, ds: "Dataset") -> "Dataset":
        """
        Apply the rewrite to ``ds`` in batches and return the mapped dataset.

        Args:
            ds: dataset with a ``text`` column.

        Returns:
            The result of ``ds.map`` over ``_process_batch``.
        """
        return ds.map(
            self._process_batch,
            batched=True,
            batch_size=2048,
            num_proc=os.cpu_count()
        )

    @staticmethod
    def _binop(m) -> str:
        """
        ``re.sub`` callback for ``EXPR``: render ``a <op> b`` as inline LaTeX.

        ``/`` becomes ``\\frac``, ``^`` a superscript, ``x``/``X``/``⋅``/``*`` become
        ``\\cdot``; any other operator is kept as written.
        """
        a, op, b = m.group(1), m.group(2), m.group(3)
        if op == '/':
            return f"$\\frac{{{a}}}{{{b}}}$"
        if op == '^':
            return f"${a}^{{{b}}}$"
        if op in ('x', 'X', '⋅', '*'):
            return f"${a} \\cdot {b}$"
        return f"${a} {op} {b}$"

    def _limit(self, m) -> str:
        """``re.sub`` callback for ``LIMIT``: render the match as ``$\\lim_{...}$``."""
        arg = m.group(1) or m.group(2)
        # Convert arrows inside the subscript here, so the later ARROW pass cannot
        # insert a nested "$\to$" into the snippet; then normalise the spacing.
        arg = " ".join(self.ARROW.sub(lambda _: " \\to ", arg).split())
        return f"$\\lim_{{{arg}}}$"

    def _convert_text(self, text: str) -> str:
        """
        Run the regex rewrites on one string.

        Args:
            text: string whose existing LaTeX spans have already been masked.

        Returns:
            ``text`` with the recognised constructs wrapped in ``$...$``.
        """
        text = self.FUNC_EXPR.sub(lambda m: f"$\\{m.group(1).lower()}({m.group('arg')})$", text)
        text = self.GREEK.sub(lambda m: f"$\\{m.group(1).lower()}$", text)
        # Backslashes must be doubled: in a non-raw string "\r" is a carriage return.
        text = self.ABSOLUTE.sub(lambda m: f"${{\\lvert {m.group(1)} \\rvert}}$", text)
        text = self.LIMIT.sub(self._limit, text)
        text = self.INTERVAL.sub(lambda m: f"${m.group(1)}{m.group(2)}, {m.group(3)}{m.group(4)}$", text)
        text = self.ARROW.sub(lambda _: "$\\to$", text)
        text = self.SETS.sub(lambda m: "$R^+$" if '+' in m.group(0) else "$R$", text)
        text = self.EXPR.sub(self._binop, text)
        return text

    def _process_batch(self, batch: dict) -> dict:
        """
        Convert every string in ``batch['text']``.

        Args:
            batch: mapping with a ``text`` key holding a list of strings.

        Returns:
            ``{'text': [...]}`` with the converted strings in the same order.
        """
        out = []
        for text in batch['text']:
            # Mask, convert and restore one text at a time: the placeholder helper
            # keeps the spans of only the most recently masked string.
            masked = LaTeX_placeholder_swap.latex_to_placeholder(text)
            converted = self._convert_text(masked)
            out.append(LaTeX_placeholder_swap.placeholder_to_latex(converted))
        return {'text': out}


In [None]:
class Translate:
    """
    Functor intended to translate the dataset's rows from English to Russian.

    Not implemented yet: calling an instance does no work and returns ``None``.
    """

    def __call__(self, ds: Dataset) -> None:
        # Placeholder body; the translation logic has not been written.
        return None

In [None]:
"""
Тут должно появиться применение всех функторов к ds. Возможно что-то еще. 
"""