Permalink
Browse files

Fix toned characters where there is a mismatch between double and single width Roman characters
  • Loading branch information...
1 parent 981f2c9 commit 592acf0166069aa4b7d232705e086cba921a2b80 @batterseapower batterseapower committed Nov 22, 2009
Showing with 14 additions and 2 deletions.
  1. +11 −2 pinyin/model.py
  2. +3 −0 pinyin/tests/model.py
View
@@ -591,10 +591,19 @@ def checkLength(self, needed):
raise TonedCharactersFromReadingException("Length mismatch: %s vs %s" % (self.characters, needed))
def checkToken(self, corresponding, token):
- if corresponding != unicode(token):
+ # NFKC: apply the compatibility decomposition, followed by the canonical composition.
+ # The reason we do this is that some CEDICT characters are stored with double-width
+ # Roman letters in the character columns, but normal ones in the reading, like so:
+ # U盤 U盘 [U pan2] /USB flash drive/see also 閃存盤|闪存盘[shan3 cun2 pan2]/
+ #
+ # By putting the characters into NFKC those double-width letters get turned into the normal
+ # ones that we can see in the reading column, and this check passes.
+ if corresponding != unicode(token) and unicodedata.normalize("NFKC", corresponding) != unicode(token):
raise TonedCharactersFromReadingException("Character mismatch: %s vs %s" % (corresponding, unicode(token)))
else:
- return token
+ # NB: because the reading token may be one without the craziness, we need to make sure
+ # we use the possibly-crazy form to produce a text token here:
+ return Text(corresponding)
def visitText(self, text):
self.checkLength(len(text))
View
@@ -312,6 +312,9 @@ def testTokenizeUnrecognisedHTML(self):
#self.assertEquals([Text(u'<b />')], tokenize(u'<b />'))
self.assertEquals([Text(u'<span style="mehhhh!">'), Text("</span>")], tokenize(u'<span style="mehhhh!"></span>'))
+ def testTokenizeWeirdyRomanCharacters(self):
+ self.assertEquals([Text(u'')], tokenize(u''))
+
class FormatReadingForDisplayTest(unittest.TestCase):
# Test data:
nihao_simp = u'你好,我喜欢学习汉语。我的汉语水平很低。'

0 comments on commit 592acf0

Please sign in to comment.