In [71]:
# Wikipedia: https://en.wikipedia.org/wiki/Unicode_subscripts_and_superscripts
import string

# Lookup table: LaTeX fragment (for example "^{2}" or "_{x}") -> Unicode replacement text.
COMMANDS = {}

# Digits. The index of a glyph inside its string is the digit it represents.
superscript_numbers = "⁰¹²³⁴⁵⁶⁷⁸⁹"
subscript_numbers = "₀₁₂₃₄₅₆₇₈₉"
for marker, glyphs in (("^", superscript_numbers), ("_", subscript_numbers)):
    for digit, glyph in enumerate(glyphs):
        COMMANDS[f"{marker}{{{digit}}}"] = glyph

# Latin letters, aligned position by position with string.ascii_letters
# (lowercase first, then uppercase).  A space means "no glyph for this letter"
# and is skipped.  The subscript row is repeated so that an uppercase letter
# maps to the same glyph as its lowercase counterpart.
superscript_lowercase = "ᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖ𐞥ʳˢᵗᵘᵛʷˣʸᶻ"
superscript_uppercase = "ᴬᴮꟲᴰᴱꟳᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾꟴᴿ ᵀᵁⱽᵂ   "
subscript_lowercase = "ₐ   ₑ  ₕᵢⱼₖₗₘₙₒₚ ᵣₛₜᵤᵥ ₓ  "
for marker, glyphs in (
    ("^", superscript_lowercase + superscript_uppercase),
    ("_", subscript_lowercase * 2),
):
    for letter, glyph in zip(string.ascii_letters, glyphs):
        if glyph == " ":
            continue
        COMMANDS[f"{marker}{{{letter}}}"] = glyph

# Greek lowercase letters as LaTeX commands.  The plain "o" entry has no glyph
# in either row below (presumably it holds omicron's slot -- confirm).
greek_lowercase = (
    r"\alpha", r"\beta", r"\gamma", r"\delta", r"\epsilon", r"\zeta",
    r"\eta", r"\theta", r"\iota", r"\kappa", r"\lambda", r"\mu",
    r"\nu", r"\xi", "o", r"\pi", r"\rho", r"\sigma",
    r"\tau", r"\upsilon", r"\phi", r"\chi", r"\psi", r"\omega",
)

# Glyph rows aligned with greek_lowercase; a space again means "not available".
superscript_lowercase_greek = " ᵝᵞᵟᵋ  ᶿᶥ          ᶹᵠᵡ  "
subscript_lowercase_greek = " ᵦᵧ             ᵨ   ᵩᵪ  "
for name, raised, lowered in zip(
    greek_lowercase, superscript_lowercase_greek, subscript_lowercase_greek
):
    for marker, glyph in (("^", raised), ("_", lowered)):
        if glyph != " ":
            COMMANDS[f"{marker}{{{name}}}"] = glyph


In [72]:
# Symbols extracted from http://milde.users.sourceforge.net/LUCR/Math/data/unimathsymbols.txt, copyright 2011 by Günter Milde and licensed under the LaTeX Project Public License (LPPL).

from pathlib import Path


def match(comments):
    """Return the ASCII character named in a character description.

    ``comments`` is searched for the keywords below in order (PLUS, MINUS,
    EQUALS, LEFT PARENTHESIS, RIGHT PARENTHESIS); the character paired with
    the first keyword found as a substring is returned.

    Raises
    ------
    AssertionError
        If none of the keywords occurs in ``comments``.
    """
    matches = [
        ("PLUS", "+"),
        ("MINUS", "-"),
        ("EQUALS", "="),
        ("LEFT PARENTHESIS", "("),
        ("RIGHT PARENTHESIS", ")"),
    ]
    for keyword, latex in matches:
        if keyword in comments:
            return latex
    # Raised explicitly (not via `assert False`) so the check survives
    # `python -O`, where assert statements are removed and the function would
    # otherwise fall through and return None.
    raise AssertionError(f"unmatched: {comments}")


# Fill COMMANDS from the unimathsymbols table expected in the home directory.
# The table contains the Unicode symbols themselves, so it is decoded as UTF-8
# explicitly instead of with the platform's locale-dependent default encoding.
with open(Path.home() / "unimathsymbols.txt", encoding="utf-8") as f:
    for line in f:
        if line.startswith("#"):
            continue  # comment line
        # Every data line must split into exactly eight "^"-separated fields.
        items = line.split("^")
        _, ch, latex, latex2, clas, category, requirements, comments = items
        # Drop only the line terminator; a final line without one keeps its
        # last character (slicing with [:-1] would have cut it off).
        comments = comments.rstrip("\n")
        if latex:
            # A multi-character symbol field keeps only its second character
            # (presumably a placeholder base followed by a combining mark --
            # confirm against the data file).
            COMMANDS[latex] = ch[1] if len(ch) > 1 else ch
        elif latex2:
            COMMANDS[latex2] = ch
        elif comments.startswith("SUPERSCRIPT"):
            # No LaTeX name given: derive "^{...}" from the description.
            COMMANDS[f"^{{{match(comments)}}}"] = ch
        elif comments.startswith("SUBSCRIPT"):
            COMMANDS[f"_{{{match(comments)}}}"] = ch
        # Any other entry without a LaTeX name is ignored.

In [73]:
# enhancements: aliases and overrides layered on top of the generated table.
# Each row is (key, value, is_alias).  An alias copies the entry currently
# stored under `value`; any other row stores `value` itself.  Rows are applied
# in the order listed.
for key, value, is_alias in (
    (r"\to", r"\rightarrow", True),
    (r"^{\ast}", "*", False),
    (r"\hbar", r"\hslash", True),
    ("h", "ℎ", False),
):
    COMMANDS[key] = COMMANDS[value] if is_alias else value

In [74]:
from lark import Lark
from lark import Tree
from lark.visitors import Transformer, Visitor, Discard, v_args, Interpreter

# LALR grammar for the small LaTeX subset handled by this notebook.
#
# * start      - any sequence of items and `$...$` math spans.
# * math       - items between two `$`; math spans cannot contain other math spans.
# * group      - items between `{` and `}`; groups may nest through `item`.
# * item       - an atom, a run of whitespace, or a group.
# * atom       - a CHARACTER or a COMMAND.
# * CHARACTER  - one character from the regex class below, or an ESCAPED sequence.
# * COMMAND    - a backslash followed by a WORD and any trailing whitespace,
#                or a bare `_` / `^`.
#
# Rules prefixed with `?` (atom, item) are inlined by Lark when they have a
# single child, so they do not show up as their own tree nodes in that case.
# NOTE(review): characters outside the CHARACTER class (for example `=` or `:`)
# are not covered by any terminal, so presumably input containing them fails
# to lex -- confirm.
parser = Lark(r"""
start: (item | math)*

?atom: CHARACTER
    | COMMAND

?item: atom
    | WS+
    | group

CHARACTER: /[a-zA-Z0-9,\.\?!\-+\(\)\*\/]/ | ESCAPED
ESCAPED: "\\\\" | "\\#" | "\\%" | "\\&"  | "\\{" | "\\}" | "\\_"
group: "{" item* "}"
math: "$" item* "$"
SUBSCRIPT: "_"
SUPERSCRIPT: "^"
COMMAND: (("\\" WORD WS*) | SUBSCRIPT | SUPERSCRIPT)

%import common.WS
%import common.WORD
""", parser="lalr")

In [75]:
# Commands that take an argument: when one of these is encountered it is
# pushed on the pending-command stack and applied to the item(s) that follow
# instead of being translated on its own.
# (The original literal listed r"\vec" twice; the redundant entry is removed.)
HAS_ARG = {
    r"_",
    r"^",
    r"\grave",
    r"\acute",
    r"\hat",
    r"\tilde",
    r"\bar",
    r"\overline",
    r"\breve",
    r"\dot",
    r"\ddot",
    r"\mathring",
    r"\check",
    r"\utilde",
    r"\underbar",
    r"\underline",
    r"\not",
    r"\lvec",
    r"\vec",
    r"\LVec",
    r"\dddot",
    r"\ddddot",
    r"\overleftrightarrow",
    r"\underleftarrow",
    r"\underrightarrow",
    r"\mathbf",
    r"\text",
    r"\mathrm",
    r"\left",
    r"\right",
    r"\big",
    r"\Big",
    r"\Bigg",
    r"\sqrt",
}

# Commands that are dropped silently in math mode when no translation exists
# for "<command>{<argument>}": the argument is emitted as-is rather than being
# wrapped back into LaTeX source.
IGNORE_AS_FALLBACK = {
    r"\text", r"\mathbf", r"\mathrm",
    r"\left", r"\right",
    r"\big", r"\Big", r"\Bigg",
}

# Escape sequences accepted by the grammar's ESCAPED terminal, mapped to the
# literal character each one stands for.  All seven sequences the grammar
# accepts are listed; previously only the first three were, so "\#", "\%",
# "\&" and "\_" passed through text mode with their backslash intact.
ESCAPED = {
    r"\}": "}",
    r"\{": "{",
    "\\\\": "\\",
    r"\#": "#",
    r"\%": "%",
    r"\&": "&",
    r"\_": "_",
}


def handle_cmd(state, x):
    r"""Apply all pending commands in ``state["command"]`` to the atom ``x``.

    Parameters
    ----------
    state : dict
        Conversion state with keys ``"math"`` (truthy inside a math span) and
        ``"command"`` (list used as a stack of pending commands).  The stack
        is emptied by this call.
    x : str
        The character or command text to convert.

    Returns
    -------
    str
        The converted text.  Commands are applied innermost (most recently
        pushed) first.  Outside math mode each command is re-wrapped around
        the text as ``cmd{x}``; with no pending command the text is returned
        unchanged outside math mode and looked up in ``COMMANDS`` inside it.
    """
    cmd_stack = state["command"]
    if not cmd_stack:
        # Nothing pending: translate the bare atom only when in math mode.
        if state["math"]:
            x = COMMANDS.get(x, x)
        return x
    while cmd_stack:
        cmd = cmd_stack.pop()
        if state["math"]:
            if cmd:
                # Raw strings on both names: "\mathrm" without the r prefix
                # is an invalid escape sequence.
                if cmd in (r"\text", r"\mathrm"):
                    # Leave the atom untranslated under these commands.
                    pass
                elif cmd in COMMANDS:
                    # some unicode modifier, e.g. \dot, \vec
                    x = COMMANDS.get(x, x)
                    x += COMMANDS[cmd]
                else:
                    # Try the combined form, e.g. "^{2}" or "\mathbf{x}".
                    latex = f"{cmd}{{{x}}}"
                    if latex in COMMANDS:
                        x = COMMANDS.get(latex)
                    elif cmd not in IGNORE_AS_FALLBACK:
                        # No translation known: keep the LaTeX source.
                        x = latex
            else:
                x = COMMANDS.get(x, x)
        elif cmd:
            x = f"{cmd}{{{x}}}"
    return x


def transform(ch, state=None):
    r"""Recursively render a parse tree node or token as converted text.

    Parameters
    ----------
    ch : Tree or token
        A ``Tree`` node, or a leaf token exposing ``type`` and ``value``.
    state : dict, optional
        Shared conversion state with keys ``"math"`` (True while inside a
        math node) and ``"command"`` (stack of pending commands).  A fresh
        state is created when omitted; it is mutated during the walk.

    Returns
    -------
    str
        The converted text for ``ch`` and everything below it.

    Raises
    ------
    AssertionError
        If a token has a type other than CHARACTER, WS or COMMAND.
    """
    if state is None:
        state = {
            "math": False,
            "command": [],
        }
    if isinstance(ch, Tree):
        r = []
        saved = None
        if ch.data == "math":
            state["math"] = True
        elif ch.data == "group":
            # Commands pending before the group apply to every item in it.
            saved = state["command"].copy()
        for x in ch.children:
            r.append(transform(x, state))
            if saved:
                # handle_cmd empties the stack; refill it for the next child.
                state["command"] = saved.copy()
        if ch.data == "math":
            state["math"] = False
        elif ch.data == "group" and state["command"]:
            # Pending commands do not outlive the group.
            state["command"] = []
        return "".join(r)
    if ch.type == "CHARACTER":
        x = ESCAPED.get(ch.value, ch.value)
        return handle_cmd(state, x)
    if ch.type == "WS":
        # Whitespace is dropped in math mode and collapsed to one space elsewhere.
        return "" if state["math"] else " "
    if ch.type == "COMMAND":
        # The COMMAND token may include trailing whitespace.
        x = ch.value.strip()
        if x in HAS_ARG:
            if x == r"\sqrt":
                # Emit the \sqrt symbol now and queue \overline for the argument.
                state["command"].append(r"\overline")
                return COMMANDS[r"\sqrt"]
            state["command"].append(x)
            return ""
        return handle_cmd(state, x)
    # never arrive here; raised explicitly so the check survives `python -O`
    raise AssertionError(f"unknown token {ch}")


def parse(s):
    """Convert the LaTeX-flavoured string ``s`` to text.

    ``s`` is parsed with the module-level ``parser`` and the resulting tree
    is rendered by ``transform`` starting from a fresh state.
    """
    return transform(parser.parse(s))

# Smoke test mixing plain text, escape sequences and several math spans.
# The raw input is printed first; the converted string is the cell's result.
sinput = r"foo?!-1+2. \}  \\ $\left(\mathbf{\alpha + 1}^2_x y\right)$ bar $\beta^{12}$ $\bar p {}^foo$"
print(sinput)
parse(sinput)

foo?!-1+2. \}  \\ $\left(\mathbf{\alpha + 1}^2_x y\right)$ bar $\beta^{12}$ $\bar p {}^foo$


'foo?!-1+2. } \\ (𝛂+𝟏²ₓ𝑦) bar 𝛽¹² 𝑝̄ᶠ𝑜𝑜'

In [76]:
# Exercises the "^{\ast}", "\to" and "h" entries added in the enhancements cell.
parse(r"$D^{\ast\ast} \to hhee$")

'𝐷**→ℎℎ𝑒𝑒'

In [77]:
# \mathbf applied to a whole group: each item is converted, whitespace in math is dropped.
parse(r"$\mathbf{xyz + 1}$")

'𝐱𝐲𝐳+𝟏'

In [78]:
# \sqrt separated from its braced argument by a space; every item in the group gets an overline.
parse(r"$\sqrt {1Aas\alpha}$")

'√1̅𝐴̅𝑎̅𝑠̅𝛼̅'

In [85]:
# Accents (\vec, \overline) with and without braces, stacked super/subscripts, and \text in math.
parse(r"$\vec{x} b^2 \vec\alpha\overline\alpha K^0_S p_\text{T} \text T$")

'𝑥⃗𝑏²𝛼⃗𝛼̅𝐾⁰ₛ𝑝ₜT'

In [80]:
# \sqrt with the braced argument attached directly.
parse(r"$\sqrt{abcd}$")

'√𝑎̅𝑏̅𝑐̅𝑑̅'

In [82]:
# Subscript, a \text group kept upright, and a multi-character superscript group.
parse(r"$p_T / \text{GeV}c^{-1}$")

'𝑝ₜ/GeV𝑐⁻¹'