Skip to content

Commit

Permalink
Even less duplication
Browse files (browse the repository at this point in the history)
  • Loading branch information
Kijewski committed Apr 8, 2024
1 parent 40f5b92 commit 395b2eb
Show file tree
Hide file tree
Showing 3 changed files with 1,046 additions and 1,510 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Changelog

**1.6.7 (2024-03-01)**
**1.6.7 (unreleased)**

* Update to Unicode 15.1.0

Expand Down
29 changes: 15 additions & 14 deletions scripts/make_unicode_categories.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def main(input_file, output_file):
"nd": IdentifierPart,
}

planes = defaultdict(lambda: [0] * 0x1000)
planes = defaultdict(lambda: [0] * 0x100)

for input_line in input_file:
m = match(r"^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+([A-Z][a-z])", input_line)
Expand All @@ -43,19 +43,19 @@ def main(input_file, output_file):
end = int(end or start, 16)
start = int(start, 16)
for i in range(start, end + 1):
planes[i // 0x1000][i % 0x1000] = idx
planes[i // 0x100][i % 0x100] = idx

# per: https://spec.json5.org/#white-space
for i in (0x9, 0xA, 0xB, 0xC, 0xD, 0x20, 0xA0, 0x2028, 0x2028, 0x2029, 0xFEFF):
planes[i // 0x1000][i % 0x1000] = WhiteSpace
planes[i // 0x100][i % 0x100] = WhiteSpace

# per: https://www.ecma-international.org/ecma-262/5.1/#sec-7.6
for i in (ord("$"), ord("_"), ord("\\")):
planes[i // 0x1000][i % 0x1000] = IdentifierStart
planes[i // 0x100][i % 0x100] = IdentifierStart

# per: https://www.ecma-international.org/ecma-262/5.1/#sec-7.6
for i in (0x200C, 0x200D):
planes[i // 0x1000][i % 0x1000] = IdentifierPart
planes[i // 0x100][i % 0x100] = IdentifierPart

print("#ifndef JSON5EncoderCpp_unicode_cat_of", file=output_file)
print("#define JSON5EncoderCpp_unicode_cat_of", file=output_file)
Expand All @@ -71,8 +71,8 @@ def main(input_file, output_file):
print("static unsigned unicode_cat_of(std::uint32_t codepoint) {", file=output_file)

demiplane_to_idx = OrderedDict() # demiplane_idx → data_idx
data_to_idx = [None] * 272 # demiplane data → data_idx
for i in range(272):
data_to_idx = [None] * 17 * 0x100 # demiplane data → data_idx
for i in range(17 * 0x100):
plane_data = ""
plane = planes[i]
while plane and plane[-1] == 0:
Expand All @@ -91,7 +91,7 @@ def main(input_file, output_file):
demiplane_to_idx[plane_data] = produced_idx

print(
" static std::uint8_t data{:03d}[0x1000 / 4] = {{".format(
" static const std::uint8_t data{:03d}[0x100 / 4] __attribute__((__aligned__(64))) = {{".format(
produced_idx
),
file=output_file,
Expand All @@ -102,9 +102,10 @@ def main(input_file, output_file):
data_to_idx[i] = produced_idx
print(file=output_file)

print(" // A 'demiplane' is a 1/16th of a Unicode plane.", file=output_file)
print(" static std::uint8_t *demiplanes[272] = {", end="", file=output_file)
for i in range(272):
print(" // A 'demiplane' is a 1/256th of a Unicode plane.", file=output_file)
print(" // This way a 'demiplane' fits nicely into a cache line.", file=output_file)
print(" static const std::uint8_t *demiplanes[17 * 0x100] __attribute__((__aligned__(64))) = {", end="", file=output_file)
for i in range(17 * 0x100):
if i % 8 == 0:
print("\n ", end="", file=output_file)
print(" data{:03d},".format(data_to_idx[i]), end="", file=output_file)
Expand All @@ -113,15 +114,15 @@ def main(input_file, output_file):
print(file=output_file)

print(
" std::uint16_t demiplane_idx = std::uint16_t(codepoint / 0x1000);",
" std::uint32_t demiplane_idx = codepoint / 0x100;",
file=output_file,
)
print(
" if (JSON5EncoderCpp_expect(demiplane_idx >= 272, false)) return 1;",
" if (JSON5EncoderCpp_expect(demiplane_idx >= 17 * 0x100, false)) return 1;",
file=output_file,
)
print(
" std::uint16_t datum_idx = std::uint16_t(codepoint & 0x0fff);",
" std::uint32_t datum_idx = codepoint & (0x100 - 1);",
file=output_file,
)
print(
Expand Down

0 comments on commit 395b2eb

Please sign in to comment.