Skip to content

Commit

Permalink
Fix spacing introduced after a bracket. Closes #94
Browse files Browse the repository at this point in the history
  • Loading branch information
batterseapower committed Jul 4, 2009
1 parent d368314 commit 5dfaeb8
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 21 deletions.
14 changes: 8 additions & 6 deletions pinyin/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,17 +106,16 @@ def addword(words, thing):
readingtokens = self.__readings[thing]

# If we already have some text building up, add a preceding space.
# However, if the word we got looks like punctuation, don't do it.
# However, if the word we got looks like a period, don't do it.
# This ensures consistency in the treatment of Western and Chinese
# punctuation. Furthermore, avoid adding double-spaces. This is
# also important for punctuation consistency, because Western
# punctuation is typically followed by a space whereas the Chinese
# equivalents are not.
have_some_text = len(words) > 0
is_punctuation = ispunctuation(thing)
already_have_space = have_some_text and endswithspace(words[-1])
words_need_space = needsspacebeforeappend(words)
is_punctuation = ispunctuation(flatten(readingtokens))
reading_starts_with_er = len(readingtokens) > 0 and readingtokens[0].iser
if have_some_text and not(is_punctuation) and not(already_have_space) and not(reading_starts_with_er):
if words_need_space and not(is_punctuation) and not(reading_starts_with_er):
words.append(Word(Text(u' ')))

# Add this reading into the token list with nice formatting
Expand Down Expand Up @@ -163,7 +162,7 @@ def meanings(self, sentence, prefersimptrad):
foundmeanings, foundmeasurewords = None, None
for recognised, word in self.parse(sentence):
if not(recognised) and (ispunctuation(word.strip()) or word.strip() == u""):
# Discard punctuation and whitespace from consideration, or we don't return a reading for e.g. 你好!
# Discard punctuation and whitespace from consideration, or we don't return a reading for e.g. "你好!"
continue

if not (isfirstparsedthing):
Expand Down Expand Up @@ -365,6 +364,9 @@ def testTraditionalPinyin(self):
def testWesternPunctuation(self):
self.assertEqual(self.reading(self.nihao_simp_western_punc), self.nihao_reading)

def testNoSpacesAfterBraces(self):
self.assertEquals(self.reading(u"(你)好!"), u"(ni3)hao3!")

def testEmptyString(self):
self.assertEqual(self.reading(u""), u"")

Expand Down
38 changes: 24 additions & 14 deletions pinyin/pinyin.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,23 +292,25 @@ def visitTonedCharacter(self, tonedcharacter):
Report whether a space must be inserted before appending another word
to the supplied list of words. Used for producing pretty formatted output.
"""
def endswithspace(words):
visitor = EndsWithSpaceVisitor()
def needsspacebeforeappend(words):
visitor = NeedsSpaceBeforeAppendVisitor()
[word.accept(visitor) for word in words]
return visitor.endswithspace
return visitor.needsspacebeforeappend

class EndsWithSpaceVisitor(TokenVisitor):
class NeedsSpaceBeforeAppendVisitor(TokenVisitor):
def __init__(self):
self.endswithspace = False
self.needsspacebeforeappend = False

def visitText(self, text):
self.endswithspace = text.endswith(u" ")
lastchar = text[-1]
self.needsspacebeforeappend = (lastchar != " " and not(utils.ispunctuation(lastchar))) or utils.ispostspacedpunctuation(text)

def visitPinyin(self, pinyin):
self.endswithspace = False
self.needsspacebeforeappend = True

def visitTonedCharacter(self, tonedcharacter):
self.endswithspace = tonedcharacter.endswith(u" ")
# Treat it like normal text
self.visitText(tonedcharacter)

"""
Makes some tokens that faithfully represent the given characters
Expand Down Expand Up @@ -618,14 +620,22 @@ def testFlattenTonified(self):
def testUsesWrittenTone(self):
    """flatten() must render the written tone (2) rather than the spoken one (3)."""
    tone = ToneInfo(written=2, spoken=3)
    # assertEqual is the supported spelling; assertEquals is a deprecated alias.
    self.assertEqual(flatten([Word(Pinyin("hen", tone))]), "hen2")

class EndsWithSpaceTest(unittest.TestCase):
def testEmptyDoesntEndWithSpace(self):
self.assertFalse(endswithspace([]))
class NeedsSpaceBeforeAppendTest(unittest.TestCase):
def testEmptyDoesntNeedSpace(self):
self.assertFalse(needsspacebeforeappend([]))

def testEndsWithSpace(self):
self.assertTrue(endswithspace([Word(Text("hello "))]))
self.assertTrue(endswithspace([Word(Text("hello"), Text(" "), Text("World"), Text(" "))]))
self.assertFalse(endswithspace([Word(Text("hello"))]))
self.assertFalse(needsspacebeforeappend([Word(Text("hello "))]))
self.assertFalse(needsspacebeforeappend([Word(Text("hello"), Text(" "), Text("World"), Text(" "))]))

def testNeedsSpace(self):
    """A word list ending in ordinary text asks for a separating space."""
    words = [Word(Text("hello"))]
    self.assertTrue(needsspacebeforeappend(words))

def testPunctuation(self):
    """Full stops and commas want a following space; brackets do not."""
    for mark in (".", ","):
        self.assertTrue(needsspacebeforeappend([Word(Text(mark))]))
    for bracket in ("(", ")"):
        self.assertFalse(needsspacebeforeappend([Word(Text(bracket))]))

class TonedCharactersFromReadingTest(unittest.TestCase):
def testTonedTokens(self):
Expand Down
12 changes: 11 additions & 1 deletion pinyin/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,21 @@ def suppressexceptions(action):
def ispunctuation(text):
# NB: can't use "all" because it's not in Python 2.4 and below, which Anki uses
for char in text:
if unicodedata.category(unicode(char)) != 'Po':
# For General_Category list see http://unicode.org/Public/UNIDATA/UCD.html
# Po . , ' "
# Pd -
# Ps ( [
# Pe ) ]
if 'P' not in unicodedata.category(unicode(char)):
return False

return True

"""
Reports whether a string consists of only punctuation characters that should have a space added after them.
"""
def ispostspacedpunctuation(text):
    """Report whether text is a punctuation mark that wants a space after it.

    Returns True only when text is exactly one full stop or one comma, in
    either its ASCII or its CJK form. Anything else (brackets, the empty
    string, multi-character strings) returns False.
    """
    # The original compared against the ASCII comma twice; the second of the
    # pair is written as an escape for the full-width comma (U+FF0C) so the
    # two cannot be confused visually.
    return text in (u"。", u".", u"\uff0c", u",")

"""
Reports the absolute directory name that the pinyin/ directory has at runtime
Expand Down

0 comments on commit 5dfaeb8

Please sign in to comment.