Fix spacing introduced after a bracket. Closes #94

Nick3C · Jul 4, 2009 · 5dfaeb8 · 5dfaeb8
1 parent d368314
commit 5dfaeb8
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 21 deletions.
diff --git a/pinyin/dictionary.py b/pinyin/dictionary.py
@@ -106,17 +106,16 @@ def addword(words, thing):
             readingtokens = self.__readings[thing]
 
             # If we already have some text building up, add a preceding space.
-            # However, if the word we got looks like punctuation, don't do it.
+            # However, if the reading we got consists only of punctuation, don't do it.
             # This ensures consistency in the treatment of Western and Chinese
             # punctuation.  Furthermore, avoid adding double-spaces.  This is
             # also important for punctuation consistency, because Western
             # punctuation is typically followed by a space whereas the Chinese
             # equivalents are not.
-            have_some_text = len(words) > 0
-            is_punctuation = ispunctuation(thing)
-            already_have_space = have_some_text and endswithspace(words[-1])
+            words_need_space = needsspacebeforeappend(words)
+            is_punctuation = ispunctuation(flatten(readingtokens))
             reading_starts_with_er = len(readingtokens) > 0 and readingtokens[0].iser
-            if have_some_text and not(is_punctuation) and not(already_have_space) and not(reading_starts_with_er):
+            if words_need_space and not(is_punctuation) and not(reading_starts_with_er):
                 words.append(Word(Text(u' ')))
 
             # Add this reading into the token list with nice formatting
@@ -163,7 +162,7 @@ def meanings(self, sentence, prefersimptrad):
         foundmeanings, foundmeasurewords = None, None
         for recognised, word in self.parse(sentence):
             if not(recognised) and (ispunctuation(word.strip()) or word.strip() == u""):
-                # Discard punctuation and whitespace from consideration, or we don't return a reading for e.g. 你好!
+                # Discard punctuation and whitespace from consideration, or we don't return a reading for e.g. "你好!"
                 continue
 
             if not (isfirstparsedthing):
@@ -365,6 +364,9 @@ def testTraditionalPinyin(self):
         def testWesternPunctuation(self):
             self.assertEqual(self.reading(self.nihao_simp_western_punc), self.nihao_reading)
 
+        def testNoSpacesAfterBraces(self):
+            self.assertEquals(self.reading(u"(你)好!"), u"(ni3)hao3!")
+
         def testEmptyString(self):
             self.assertEqual(self.reading(u""), u"")
 

diff --git a/pinyin/pinyin.py b/pinyin/pinyin.py
@@ -292,23 +292,25 @@ def visitTonedCharacter(self, tonedcharacter):
 Report whether the supplied list of words ends with a space
 character. Used for producing pretty formatted output.
 """
-def endswithspace(words):
-    visitor = EndsWithSpaceVisitor()
+def needsspacebeforeappend(words):
+    visitor = NeedsSpaceBeforeAppendVisitor()
     [word.accept(visitor) for word in words]
-    return visitor.endswithspace
+    return visitor.needsspacebeforeappend
 
-class EndsWithSpaceVisitor(TokenVisitor):
+class NeedsSpaceBeforeAppendVisitor(TokenVisitor):
     def __init__(self):
-        self.endswithspace = False
+        self.needsspacebeforeappend = False
 
     def visitText(self, text):
-        self.endswithspace = text.endswith(u" ")
+        lastchar = text[-1]
+        self.needsspacebeforeappend = (lastchar != " " and not(utils.ispunctuation(lastchar))) or utils.ispostspacedpunctuation(text)
 
     def visitPinyin(self, pinyin):
-        self.endswithspace = False
+        self.needsspacebeforeappend = True
 
     def visitTonedCharacter(self, tonedcharacter):
-        self.endswithspace = tonedcharacter.endswith(u" ")
+        # Treat it like normal text
+        self.visitText(tonedcharacter)
 
 """
 Makes some tokens that faithfully represent the given characters
@@ -618,14 +620,22 @@ def testFlattenTonified(self):
         def testUsesWrittenTone(self):
             self.assertEquals(flatten([Word(Pinyin("hen", ToneInfo(written=2,spoken=3)))]), "hen2")
 
-    class EndsWithSpaceTest(unittest.TestCase):
-        def testEmptyDoesntEndWithSpace(self):
-            self.assertFalse(endswithspace([]))
+    class NeedsSpaceBeforeAppendTest(unittest.TestCase):
+        def testEmptyDoesntNeedSpace(self):
+            self.assertFalse(needsspacebeforeappend([]))
 
         def testEndsWithSpace(self):
-            self.assertTrue(endswithspace([Word(Text("hello "))]))
-            self.assertTrue(endswithspace([Word(Text("hello"), Text(" "), Text("World"), Text(" "))]))
-            self.assertFalse(endswithspace([Word(Text("hello"))]))
+            self.assertFalse(needsspacebeforeappend([Word(Text("hello "))]))
+            self.assertFalse(needsspacebeforeappend([Word(Text("hello"), Text(" "), Text("World"), Text(" "))]))
+
+        def testNeedsSpace(self):
+            self.assertTrue(needsspacebeforeappend([Word(Text("hello"))]))
+
+        def testPunctuation(self):
+            self.assertTrue(needsspacebeforeappend([Word(Text("."))]))
+            self.assertTrue(needsspacebeforeappend([Word(Text(","))]))
+            self.assertFalse(needsspacebeforeappend([Word(Text("("))]))
+            self.assertFalse(needsspacebeforeappend([Word(Text(")"))]))
 
     class TonedCharactersFromReadingTest(unittest.TestCase):
         def testTonedTokens(self):

diff --git a/pinyin/utils.py b/pinyin/utils.py
@@ -46,11 +46,21 @@ def suppressexceptions(action):
 def ispunctuation(text):
     # NB: can't use "all" because it's not in Python 2.4 and below, which Anki uses
     for char in text:
-        if unicodedata.category(unicode(char)) != 'Po':
+        # For General_Category list see http://unicode.org/Public/UNIDATA/UCD.html
+        # Po . , ' "
+        # Pd -
+        # Ps ( [
+        # Pe ) ]
+        if 'P' not in unicodedata.category(unicode(char)):
             return False
 
     return True
 
+"""
+Reports whether a string consists of only punctuation characters that should have a space added after them.
+"""
+def ispostspacedpunctuation(text):
+    return text == u"。" or text == "." or text == u"，" or text == ","
 
 """
 Reports the absolute directory name that the pinyin/ directory has at runtime