Skip to content

Commit

Permalink
Update char_classes for Unicode 11 and test the result
Browse files Browse the repository at this point in the history
  • Loading branch information
Rob Speer committed Sep 14, 2018
1 parent 360f39b commit 896822f
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 0 deletions.
Binary file modified ftfy/char_classes.dat
Binary file not shown.
19 changes: 19 additions & 0 deletions tests/test_futuristic_codepoints.py
Expand Up @@ -40,3 +40,22 @@ def test_unicode_10():
# all versions for consistency.
thalim = "\U00011A1A\U00011A2C\U00011A01\U00011A38"
assert sequence_weirdness(thalim) == 0


def test_unicode_11():
    # Unicode 11 added the mtavruli letters of the Georgian script. These
    # letters play a role similar to capitals: they can emphasize text or
    # set a headline.
    #
    # From Python 3.7.0 onward, calling .upper() on Georgian text produces
    # mtavruli letters. The output of that conversion should be accepted as
    # reasonable text regardless of which Python version is running.
    #
    # The sample below is "ქართული ენა" ("Georgian language") written in its
    # mtavruli form.
    expected = 'ᲥᲐᲠᲗᲣᲚᲘ ᲔᲜᲐ'

    # The clean text itself must not be scored as weird.
    assert sequence_weirdness(expected) == 0

    # Simulate mojibake: take the UTF-8 bytes and misread them as
    # windows-1252, then check that the original text is recovered.
    utf8_bytes = expected.encode('utf-8')
    garbled = utf8_bytes.decode('sloppy-windows-1252')
    assert fix_encoding(garbled) == expected

0 comments on commit 896822f

Please sign in to comment.