Skip to content

Commit

Permalink
feat: Support Unicode (sub|super)script characters (#3633)
Browse files Browse the repository at this point in the history
* feat: Support Unicode (sub|super)script characters

* Acquire tokens via repeated fetch()

* Match more Unicode (sub|super)script characters

* Update docs with new characters

* Add Greek characters to RegEx

* Pick up review comments

Co-authored-by: Erik Demaine <edemaine@mit.edu>
  • Loading branch information
ronkok and edemaine committed May 20, 2022
1 parent c31256f commit d8fc35e
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 0 deletions.
3 changes: 3 additions & 0 deletions docs/supported.md
Expand Up @@ -190,6 +190,9 @@ $\allowbreak α β γ δ ϵ ζ η θ ι κ λ μ ν ξ o π \allowbreak ρ σ τ

Direct Input: $∂ ∇ ℑ Ⅎ ℵ ℶ ℷ ℸ ⅁ ℏ ð − ∗$
ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖÙÚÛÜÝÞßàáâãäåçèéêëìíîïðñòóôöùúûüýþÿ
₊₋₌₍₎₀₁₂₃₄₅₆₇₈₉ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓᵦᵧᵨᵩᵪ⁺⁻⁼⁽⁾⁰¹²³⁴⁵⁶⁷⁸⁹ᵃᵇᶜᵈᵉᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘʷˣʸᶻᵛᵝᵞᵟᵠᵡ

Math-mode Unicode (sub|super)script characters will render as if you had written regular characters in a subscript or superscript. For instance, `A²⁺³` will render the same as `A^{2+3}`.

</div>
<div class="katex-cards" id="math-alpha">
Expand Down
24 changes: 24 additions & 0 deletions src/Parser.js
Expand Up @@ -9,6 +9,7 @@ import ParseError from "./ParseError";
import {combiningDiacriticalMarksEndRegex} from "./Lexer";
import Settings from "./Settings";
import SourceLocation from "./SourceLocation";
import {uSubsAndSups, unicodeSubRegEx} from "./unicodeSupOrSub";
import {Token} from "./Token";

// Pre-evaluate both modules as unicodeSymbols require String.normalize()
Expand Down Expand Up @@ -399,6 +400,29 @@ export default class Parser {
}
// Put everything into an ordgroup as the superscript
superscript = {type: "ordgroup", mode: this.mode, body: primes};
} else if (uSubsAndSups[lex.text]) {
// A Unicode subscript or superscript character.
// We treat these similarly to the unicode-math package.
// So we render a string of Unicode (sub|super)scripts the
// same as a (sub|super)script of regular characters.
// `str` accumulates the plain-character equivalents of the whole run.
let str = uSubsAndSups[lex.text];
// The first character of the run decides whether it is a subscript
// or a superscript.
const isSub = unicodeSubRegEx.test(lex.text);
this.consume();
// Continue fetching tokens to fill out the string.
while (true) {
const token = this.fetch().text;
// Stop at the first token that has no entry in the mapping table.
if (!(uSubsAndSups[token])) { break; }
// Stop when the run switches between subscript and superscript.
// The token that ends the run is not consumed here.
if (unicodeSubRegEx.test(token) !== isSub) { break; }
this.consume();
str += uSubsAndSups[token];
}
// Now create a (sub|super)script.
// The collected string is parsed by a separate Parser constructed with
// the same settings object.
// NOTE(review): presumably state held by this parser (e.g. macros defined
// earlier in the input) is not visible to the new Parser -- confirm.
const body = (new Parser(str, this.settings)).parse();
// The resulting group is always tagged mode "math", not this.mode.
// NOTE(review): the assignment below does not check whether a
// subscript/superscript was already set -- confirm whether a
// double-script error is expected for this path.
if (isSub) {
subscript = {type: "ordgroup", mode: "math", body};
} else {
superscript = {type: "ordgroup", mode: "math", body};
}
} else {
// If it wasn't ^, _, or ', stop parsing super/subscripts
break;
Expand Down
108 changes: 108 additions & 0 deletions src/unicodeSupOrSub.js
@@ -0,0 +1,108 @@
// Helpers for Parser.js handling of Unicode (sub|super)script characters.

/**
 * Tests whether a string starts with one of the Unicode *subscript*
 * characters supported here (signs, parentheses, digits, Latin and Greek
 * subscript letters). The pattern is anchored only at the start and has no
 * `g`/`y` flag, so repeated `.test()` calls carry no `lastIndex` state.
 */
export const unicodeSubRegEx = /^[₊₋₌₍₎₀₁₂₃₄₅₆₇₈₉ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓᵦᵧᵨᵩᵪ]/;

/**
 * Lookup table from a Unicode subscript or superscript character to the
 * regular character it stands for (for example '²' -> '2', '₊' -> '+').
 *
 * The table is built on a null-prototype object so that a lookup such as
 * `uSubsAndSups[text]` only ever hits the characters listed below. With a
 * plain object literal, keys like "constructor", "toString" or "__proto__"
 * would resolve to inherited members of Object.prototype and be treated as
 * truthy matches. The table is frozen, so it cannot be modified at runtime.
 */
export const uSubsAndSups = Object.freeze(Object.assign(Object.create(null), {
    // Subscript signs, parentheses and digits.
    '₊': '+',
    '₋': '-',
    '₌': '=',
    '₍': '(',
    '₎': ')',
    '₀': '0',
    '₁': '1',
    '₂': '2',
    '₃': '3',
    '₄': '4',
    '₅': '5',
    '₆': '6',
    '₇': '7',
    '₈': '8',
    '₉': '9',
    // Subscript Latin letters.
    '\u2090': 'a',
    '\u2091': 'e',
    '\u2095': 'h',
    '\u1D62': 'i',
    '\u2C7C': 'j',
    '\u2096': 'k',
    '\u2097': 'l',
    '\u2098': 'm',
    '\u2099': 'n',
    '\u2092': 'o',
    '\u209A': 'p',
    '\u1D63': 'r',
    '\u209B': 's',
    '\u209C': 't',
    '\u1D64': 'u',
    '\u1D65': 'v',
    '\u2093': 'x',
    // Subscript Greek letters.
    '\u1D66': 'β',
    '\u1D67': 'γ',
    '\u1D68': 'ρ',
    '\u1D69': '\u03d5',
    '\u1D6A': 'χ',
    // Superscript signs, parentheses and digits.
    '⁺': '+',
    '⁻': '-',
    '⁼': '=',
    '⁽': '(',
    '⁾': ')',
    '⁰': '0',
    '¹': '1',
    '²': '2',
    '³': '3',
    '⁴': '4',
    '⁵': '5',
    '⁶': '6',
    '⁷': '7',
    '⁸': '8',
    '⁹': '9',
    // Superscript uppercase Latin letters.
    '\u1D2C': 'A',
    '\u1D2E': 'B',
    '\u1D30': 'D',
    '\u1D31': 'E',
    '\u1D33': 'G',
    '\u1D34': 'H',
    '\u1D35': 'I',
    '\u1D36': 'J',
    '\u1D37': 'K',
    '\u1D38': 'L',
    '\u1D39': 'M',
    '\u1D3A': 'N',
    '\u1D3C': 'O',
    '\u1D3E': 'P',
    '\u1D3F': 'R',
    '\u1D40': 'T',
    '\u1D41': 'U',
    '\u2C7D': 'V',
    '\u1D42': 'W',
    // Superscript lowercase Latin letters.
    '\u1D43': 'a',
    '\u1D47': 'b',
    '\u1D9C': 'c',
    '\u1D48': 'd',
    '\u1D49': 'e',
    '\u1DA0': 'f',
    '\u1D4D': 'g',
    '\u02B0': 'h',
    '\u2071': 'i',
    '\u02B2': 'j',
    '\u1D4F': 'k',
    '\u02E1': 'l',
    '\u1D50': 'm',
    '\u207F': 'n',
    '\u1D52': 'o',
    '\u1D56': 'p',
    '\u02B3': 'r',
    '\u02E2': 's',
    '\u1D57': 't',
    '\u1D58': 'u',
    '\u1D5B': 'v',
    '\u02B7': 'w',
    '\u02E3': 'x',
    '\u02B8': 'y',
    '\u1DBB': 'z',
    // Superscript Greek letters.
    '\u1D5D': 'β',
    '\u1D5E': 'γ',
    '\u1D5F': 'δ',
    '\u1D60': '\u03d5',
    '\u1D61': 'χ',
    '\u1DBF': 'θ',
}));
4 changes: 4 additions & 0 deletions test/katex-spec.js
Expand Up @@ -275,6 +275,10 @@ describe("A subscript and superscript parser", function() {
expect`x_{x^x}`.toParse();
expect`x_{x_x}`.toParse();
});

// Each Unicode (sub|super)script run must parse like the explicit
// ^{...} / _{...} form given in the expected string.
it("should work with Unicode (sub|super)script characters", function() {
// Cases covered by the input: a single superscript, a multi-character
// superscript run, a superscript run with no preceding base, a subscript
// directly followed by a superscript, and a multi-character subscript run.
expect`A² + B²⁺³ + ¹²C + E₂³ + F₂₊₃`.toParseLike("A^{2} + B^{2+3} + ^{12}C + E_{2}^{3} + F_{2+3}");
});
});

describe("A subscript and superscript tree-builder", function() {
Expand Down

0 comments on commit d8fc35e

Please sign in to comment.