Fix toned characters where there is a mismatch between double and single width Roman characters
Nick3C · Nov 22, 2009 · 592acf0 · 592acf0
1 parent 981f2c9
commit 592acf0
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 2 deletions.
diff --git a/pinyin/model.py b/pinyin/model.py
@@ -591,10 +591,19 @@ def checkLength(self, needed):
             raise TonedCharactersFromReadingException("Length mismatch: %s vs %s" % (self.characters, needed))
 
     def checkToken(self, corresponding, token):
-        if corresponding != unicode(token):
+        # NFKC: apply the compatibility decomposition, followed by the canonical composition.
+        # The reason we do this is that some CEDICT characters are stored with double-width
+        # Roman letters in the character columns, but normal ones in the reading, like so:
+        # Ｕ盤 Ｕ盘 [U pan2] /USB flash drive/see also 閃存盤|闪存盘[shan3 cun2 pan2]/
+        #
+        # By putting the token into NFKC those crazy letters get turned into the normal ones
+        # that we can see in the reading column, and this assertion passes.
+        if corresponding != unicode(token) and unicodedata.normalize("NFKC", corresponding) != unicode(token):
             raise TonedCharactersFromReadingException("Character mismatch: %s vs %s" % (corresponding, unicode(token)))
         else:
-            return token
+            # NB: because the reading token may be one without the craziness, we need to make sure
+            # we use the possibly-crazy form to produce a text token here:
+            return Text(corresponding)
 
     def visitText(self, text):
         self.checkLength(len(text))

diff --git a/pinyin/tests/model.py b/pinyin/tests/model.py
@@ -312,6 +312,9 @@ def testTokenizeUnrecognisedHTML(self):
         #self.assertEquals([Text(u'<b />')], tokenize(u'<b />'))
         self.assertEquals([Text(u'<span style="mehhhh!">'), Text("</span>")], tokenize(u'<span style="mehhhh!"></span>'))
 
+    def testTokenizeWeirdyRomanCharacters(self):
+        self.assertEquals([Text(u'Ｕ')], tokenize(u'Ｕ'))
+
 class FormatReadingForDisplayTest(unittest.TestCase):
     # Test data:
     nihao_simp = u'你好，我喜欢学习汉语。我的汉语水平很低。'