From d58fbcd12d5f8901eb8adde01762a31c0b100a8f Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Fri, 12 Feb 2010 01:00:05 +0000 Subject: [PATCH] Try to reuse old sounds if possible, to speed up form filling --- pinyin/anki/mediamanager.py | 5 +++++ pinyin/mocks.py | 16 ++++++++++------ pinyin/tests/transformations.py | 20 ++++++++++---------- pinyin/tests/updatergraph.py | 15 +++++++++++++-- pinyin/transformations.py | 32 ++++++++++++++------------------ pinyin/updatergraph.py | 24 ++++++++++++++++-------- pinyin/utils.py | 12 ++++++++++++ 7 files changed, 80 insertions(+), 44 deletions(-) diff --git a/pinyin/anki/mediamanager.py b/pinyin/anki/mediamanager.py index d238c64..f9f3efd 100644 --- a/pinyin/anki/mediamanager.py +++ b/pinyin/anki/mediamanager.py @@ -1,5 +1,7 @@ import os +import anki.media + from pinyin.logger import log import pinyin.media import pinyin.utils @@ -35,3 +37,6 @@ def discovermediapacks(self): def importtocurrentdeck(self, file): return self.mw.deck.addMedia(file) + + def alreadyimported(self, file): + return os.path.exists(os.path.join(self.mw.deck.mediaDir(create=False), anki.media.mediaFilename(file))) diff --git a/pinyin/mocks.py b/pinyin/mocks.py index c140cf2..2bdcc46 100644 --- a/pinyin/mocks.py +++ b/pinyin/mocks.py @@ -33,15 +33,19 @@ def exception(self, *args): A media manager used in tests and the live preview functionality. """ class MockMediaManager(object): - def __init__(self, mediapacks, themediadir="dummy_dir"): - self.themediadir = themediadir - self.mediapacks = mediapacks + def __init__(self, mediapacks, mediadir="dummy_dir", alreadyimported=[]): + self._mediadir = mediadir + self._mediapacks = mediapacks + self._alreadyimported = alreadyimported def mediadir(self): - return self.themediadir + return self._mediadir def discovermediapacks(self): - return self.mediapacks + return self._mediapacks def importtocurrentdeck(self, filename): - return filename \ No newline at end of file + return filename + + def alreadyimported(self, path): + return path in self._alreadyimported \ No newline at end of file diff --git a/pinyin/tests/transformations.py b/pinyin/tests/transformations.py index f0edcb5..fdd8203 100644 --- a/pinyin/tests/transformations.py +++ b/pinyin/tests/transformations.py @@ -150,32 +150,32 @@ def testRandomizeBestPackOnTie(self): def testUseSpokenToneRatherThanWrittenOne(self): mediapacks = [MediaPack("Foo", { "ma2.mp3" : "ma2.mp3", "ma3.mp3" : "ma3.mp3" })] - mediapack, output, mediamissing = PinyinAudioReadings(mediapacks, [".mp3"]).audioreading([Word(Pinyin("ma", ToneInfo(written=2, spoken=3)))]) + mediapack, output, mediamissingcount = PinyinAudioReadings(mediapacks, [".mp3"]).audioreadings([Word(Pinyin("ma", ToneInfo(written=2, spoken=3)))])[0] self.assertEquals(mediapack, mediapacks[0]) - self.assertFalse(mediamissing) + self.assertEquals(mediamissingcount, 0) self.assertEquals(output, ["ma3.mp3"]) # Test helpers def assertHasReading(self, what, shouldbe, **kwargs): - bestpackshouldbe, mediapack, output, mediamissing = self.audioreading(what, **kwargs) + bestpackshouldbe, mediapack, output, mediamissingcount = self.audioreading(what, **kwargs) self.assertEquals(bestpackshouldbe, mediapack) self.assertEquals(output, shouldbe) - self.assertFalse(mediamissing) + self.assertEquals(mediamissingcount, 0) def assertHasPartialReading(self, what, shouldbe, **kwargs): - bestpackshouldbe, mediapack, output, mediamissing = self.audioreading(what, **kwargs) + bestpackshouldbe, mediapack, output, mediamissingcount = self.audioreading(what, **kwargs) self.assertEquals(bestpackshouldbe, mediapack) self.assertEquals(output, shouldbe) - self.assertTrue(mediamissing) + self.assertTrue(mediamissingcount > 0) def assertMediaMissing(self, what, **kwargs): - bestpackshouldbe, mediapack, output, mediamissing = self.audioreading(what, **kwargs) - self.assertTrue(mediamissing) + bestpackshouldbe, mediapack, output, mediamissingcount = self.audioreading(what, **kwargs) + self.assertTrue(mediamissingcount > 0) def audioreading(self, what, **kwargs): bestpackshouldbe, mediapacks = self.expandmediapacks(**kwargs) - mediapack, output, mediamissing = PinyinAudioReadings(mediapacks, [".mp3", ".ogg"]).audioreading(englishdict.reading(what)) - return bestpackshouldbe, mediapack, output, mediamissing + mediapack, output, mediamissingcount = PinyinAudioReadings(mediapacks, [".mp3", ".ogg"]).audioreadings(englishdict.reading(what))[0] + return bestpackshouldbe, mediapack, output, mediamissingcount def expandmediapacks(self, mediapacks=None, available_media=None, raw_available_media=default_raw_available_media, bestpackshouldbe=None): if mediapacks: diff --git a/pinyin/tests/updatergraph.py b/pinyin/tests/updatergraph.py index 87ba129..8d3fc29 100644 --- a/pinyin/tests/updatergraph.py +++ b/pinyin/tests/updatergraph.py @@ -190,6 +190,17 @@ def nassert(notifier): "reading" : u'san1 yue4', "audio" : None }, mediapacks=[], notifierassertion=nassert) + def testAudioUsesExistingSoundsIfPossible(self): + config = dict(colorizedpinyingeneration = False, detectmeasurewords = False, tonedisplay = "numeric") + + mediapacks = [media.MediaPack("Existing", { "san1.mp3" : "san1.mp3", "yue4.mp3" : "yue4.mp3" }), + media.MediaPack("NotExisting", { "san1.mp3" : "san1.mp3", "yue4.mp3" : "yue4.mp3" })] + + self.assertProduces({ "expression" : u"三月", "mwfieldinfact" : False }, config, { + "reading" : u'san1 yue4', "audio" : u"[sound:" + os.path.join("Existing", "san1.mp3") + "]" + + u"[sound:" + os.path.join("Existing", "yue4.mp3") + "]" + }, mediapacks=mediapacks, alreadyimported=[os.path.join("Existing", "san1.mp3"), os.path.join("Existing", "yue4.mp3")]) + def testUpdateMeasureWordAudio(self): config = dict(audioextensions = [".mp3", ".ogg"]) @@ -245,7 +256,7 @@ def testUpdateColoredCharactersFromReading(self): "color" : u'' }), reading) - def assertProduces(self, known, configdict, expected, mediapacks=None, notifierassertion=None): + def assertProduces(self, known, configdict, expected, mediapacks=None, alreadyimported=[], notifierassertion=None): if mediapacks == None: mediapacks = [media.MediaPack("Test", { "shu1.mp3" : "shu1.mp3", "shu1.ogg" : "shu1.ogg", "san1.mp3" : "san1.mp3", "qi1.ogg" : "qi1.ogg", "Kai1.mp3" : "location/Kai1.mp3", @@ -255,7 +266,7 @@ def assertProduces(self, known, configdict, expected, mediapacks=None, notifiera notifierassertion = lambda notifier: assert_equal(len(notifier.infos), 0) notifier = MockNotifier() - gbu = GraphBasedUpdater(notifier, MockMediaManager(mediapacks), pinyin.config.Config(pinyin.utils.updated({ "dictlanguage" : "en" }, configdict))) + gbu = GraphBasedUpdater(notifier, MockMediaManager(mediapacks, alreadyimported=alreadyimported), pinyin.config.Config(pinyin.utils.updated({ "dictlanguage" : "en" }, configdict))) graph = gbu.filledgraph({}, known) assert_dict_equal(dict([(key, graph[key][1]()) for key in expected.keys()]), expected, values_as_assertions=True) diff --git a/pinyin/transformations.py b/pinyin/transformations.py index 2610eec..adea445 100644 --- a/pinyin/transformations.py +++ b/pinyin/transformations.py @@ -5,6 +5,7 @@ import cStringIO import random import re +import itertools from logger import log from model import * @@ -189,29 +190,24 @@ def __init__(self, mediapacks, audioextensions): self.mediapacks = mediapacks self.audioextensions = audioextensions - def audioreading(self, tokens): + def audioreadings(self, tokens): log.info("Requested audio reading for %d tokens", len(tokens)) # Try possible packs to format the tokens. Basically, we # don't want to use a mix of sounds from different packs - bestmediapacksoutputs, bestmediamissingcount = [], len(tokens) + 1 - for mediapack in self.mediapacks: - log.info("Checking for reading in pack %s", mediapack.name) - output, mediamissingcount = audioreadingforpack(mediapack, self.audioextensions, trimerhua(tokens)) - - # We will end up choosing one of the packs that minimizes the number of errors: - if mediamissingcount == bestmediamissingcount: - # Just as good as a previous pack, so this is an alternative - bestmediapacksoutputs.append((mediapack, output)) - elif mediamissingcount < bestmediamissingcount: - # Strictly better than the previous ones, so this is the new best option - bestmediapacksoutputs = [(mediapack, output)] - bestmediamissingcount = mediamissingcount + possibilities = [(mediapack,) + audioreadingforpack(mediapack, self.audioextensions, trimerhua(tokens)) for mediapack in self.mediapacks] - # Did we get any result at all? - if len(bestmediapacksoutputs) != 0: - bestmediapack, bestoutput = random.choice(bestmediapacksoutputs) - return bestmediapack, bestoutput, (bestmediamissingcount != 0) + # Let the caller choose from only those packs that minimise the number of errors + possibilities = sorted(possibilities, using(lambda (_mediapack, output, missingmediacount): (missingmediacount, count(output)))) + return possibilities and list(itertools.takewhile(lambda (_mediapack, _output, missingmediacount): missingmediacount == possibilities[0][2], possibilities)) or [] + + # TODO: do I actually use this method? + def audioreading(self, tokens): + # Choose a random reading from the available possibilities, if any possibility exists + possibilities = self.audioreadings(tokens) + if possibilities: + bestmediapack, bestoutput, bestmissingmediacount = random.choice(possibilities) + return bestmediapack, bestoutput, (bestmissingmediacount != 0) else: return None, [], True diff --git a/pinyin/updatergraph.py b/pinyin/updatergraph.py index 0653aa0..7e21e0f 100644 --- a/pinyin/updatergraph.py +++ b/pinyin/updatergraph.py @@ -38,16 +38,24 @@ def generateaudio(notifier, mediamanager, config, dictreading): # There is no way we can generate an audio reading with no packs - give up return None - # Get the best media pack to generate the audio, along with the string of files from that pack we need to take - mediapack, output, _mediamissing = transformations.PinyinAudioReadings(mediapacks, config.audioextensions).audioreading(dictreading) + # Get the best media packs to generate the audio, along with the string of files from that pack we need to take. + # Mix up the possible audio packs while we are at it, to make things a bit more interesting... + possibilities = transformations.PinyinAudioReadings(mediapacks, config.audioextensions).audioreadings(dictreading) + random.shuffle(possibilities) - # Construct the string of audio tags from the optimal choice of sounds - output_tags = u"" - for outputfile in output: - # Install required media in the deck as we go, getting the canonical string to insert into the sound field upon installation - output_tags += u"[sound:%s]" % mediamanager.importtocurrentdeck(os.path.join(mediapack.packpath, outputfile)) + if not possibilities: + return u"" + else: + # Minimize the number of new sounds we have to import, to reduce the bloat in the audio count + mediapack, output, _mediamissingcount = maximumby(using(lambda (mediapack, output, _): count(output, lambda outputfile: mediamanager.alreadyimported(os.path.join(mediapack.packpath, outputfile)))), possibilities) - return output_tags + # Construct the string of audio tags from the optimal choice of sounds + output_tags = u"" + for outputfile in output: + # Install required media in the deck as we go, getting the canonical string to insert into the sound field upon installation + output_tags += u"[sound:%s]" % mediamanager.importtocurrentdeck(os.path.join(mediapack.packpath, outputfile)) + + return output_tags class Reformatter(object): def __init__(self, notifier, mediamanager, config): diff --git a/pinyin/utils.py b/pinyin/utils.py index 35e1109..fdc253f 100644 --- a/pinyin/utils.py +++ b/pinyin/utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import itertools import os import re import sys @@ -425,6 +426,17 @@ def intersperse(what, things): return result +def maximumby(how, xs): + best = xs[0] + for x in xs: + if how(best, x) < 0: + best = x + + return best + +def count(xs, pred=bool): + return sum(itertools.imap(pred, xs)) + def substrings(text): for length in range(len(text), -1, -1): for i in range(0, len(text) - length):