Skip to content

Commit

Permalink
Fix toned characters where there is a mismatch between double and sin…
Browse files Browse the repository at this point in the history
…gle width Roman characters
  • Loading branch information
batterseapower committed Nov 22, 2009
1 parent 981f2c9 commit 592acf0
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 2 deletions.
13 changes: 11 additions & 2 deletions pinyin/model.py
Expand Up @@ -591,10 +591,19 @@ def checkLength(self, needed):
raise TonedCharactersFromReadingException("Length mismatch: %s vs %s" % (self.characters, needed))

def checkToken(self, corresponding, token):
if corresponding != unicode(token):
# NFKC: apply the compatibility decomposition, followed by the canonical composition.
# The reason we do this is that some CEDICT characters are stored with double-width
# Roman letters in the character columns, but normal ones in the reading, like so:
# U盤 U盘 [U pan2] /USB flash drive/see also 閃存盤|闪存盘[shan3 cun2 pan2]/
#
# By putting the corresponding characters into NFKC, those crazy letters get turned into
# the normal ones that we can see in the reading column, and this assertion passes.
if corresponding != unicode(token) and unicodedata.normalize("NFKC", corresponding) != unicode(token):
raise TonedCharactersFromReadingException("Character mismatch: %s vs %s" % (corresponding, unicode(token)))
else:
return token
# NB: because the reading token may be one without the craziness, we need to make sure
# we use the possibly-crazy form to produce a text token here:
return Text(corresponding)

def visitText(self, text):
self.checkLength(len(text))
Expand Down
3 changes: 3 additions & 0 deletions pinyin/tests/model.py
Expand Up @@ -312,6 +312,9 @@ def testTokenizeUnrecognisedHTML(self):
#self.assertEquals([Text(u'<b />')], tokenize(u'<b />'))
self.assertEquals([Text(u'<span style="mehhhh!">'), Text("</span>")], tokenize(u'<span style="mehhhh!"></span>'))

def testTokenizeWeirdyRomanCharacters(self):
self.assertEquals([Text(u'U')], tokenize(u'U'))

class FormatReadingForDisplayTest(unittest.TestCase):
# Test data:
nihao_simp = u'你好,我喜欢学习汉语。我的汉语水平很低。'
Expand Down

0 comments on commit 592acf0

Please sign in to comment.