Added a default speaker recognizer
Added a default speaker recognizer, default_sr, which does not
attempt to identify the speaker but simply reports the first
name stored in the profile.

I am also passing the result from the sr_plugin through to the
intent parser, so that the identity of the speaker can be
attached to the intent object being passed to the speechhandler.

I also simplified the list of parameters passed to the mic
object when it is created, storing them in the profile instead.
aaronchantrill committed Oct 17, 2022
1 parent 421d062 commit 26a663a
Showing 8 changed files with 144 additions and 104 deletions.
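
The sr_response dict that threads through these changes carries three keys, visible in the fallback values in mic.py below: 'speaker', 'confidence' and 'utterance'. A minimal sketch of what the new default_sr plugin plausibly looks like, inferred from the recognize_speaker(fp, stt_engine) calls and the dict shape in this diff; the class name, the zero confidence value and the import details are assumptions:

    from naomi import plugin, profile


    class DefaultSRPlugin(plugin.SRPlugin):
        # Hypothetical sketch: no actual voice identification is attempted.
        def recognize_speaker(self, fp, stt_engine):
            # Transcribe the audio with the supplied STT engine, then
            # report the profile owner's first name as the speaker.
            utterance = [word.upper() for word in stt_engine.transcribe(fp)]
            return {
                'speaker': profile.get_profile_var(['first_name'], None),
                'confidence': 0,  # assumed: nothing was actually verified
                'utterance': utterance
            }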
9 changes: 8 additions & 1 deletion naomi/application.py
@@ -500,13 +500,20 @@ def __init__(
         tts_plugin_info = profile.get_arg('plugins').get_plugin(tts_slug, category='tts')
         tts_plugin = tts_plugin_info.plugin_class(tts_plugin_info)
 
+        # Initialize Speaker Recognition Engine
+        sr_slug = profile.get_profile_var(['sr_engine'], 'default_sr')
+        sr_plugin_info = profile.get_arg('plugins').get_plugin(sr_slug, category='sr')
+        sr_plugin = sr_plugin_info.plugin_class(sr_plugin_info)
+        profile.set_arg('sr_plugin', sr_plugin)
+
         # audiolog for training
         if(save_audio):
             save_passive_audio = True
             save_active_audio = True
             save_noise = True
 
-        # Instead of passing the following values to
+        # Instead of passing the following values to mic, store them here and
+        # pick them up when needed.
         profile.set_arg('input_device', input_device)
         profile.set_arg('output_device', output_device)
+        profile.set_arg('sr_plugin', sr_plugin)
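
With this change the profile module doubles as a process-wide registry: application.py stores values once with set_arg(), and other modules read them back with get_arg() instead of receiving them as constructor parameters. A toy illustration of the pattern (the 'print_transcript' key appears in this commit; the reader function is hypothetical):

    from naomi import profile

    # at startup (application.py)
    profile.set_arg('print_transcript', True)

    # later, anywhere else in the process (e.g. mic.py)
    def should_print_transcript():
        # read the stored value back; no parameter threading required
        return profile.get_arg('print_transcript')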
7 changes: 4 additions & 3 deletions naomi/brain.py
@@ -104,7 +104,7 @@ def get_all_phrases(self):
         phrases.extend(self.get_plugin_phrases())
         return sorted(list(set(phrases)))
 
-    def query(self, texts):
+    def query(self, sr_response):
         """
         Passes user input to the appropriate module, testing it against
         each candidate module's isValid function.
@@ -119,14 +119,15 @@ def query(self, texts):
         Returns:
             A tuple containing a text and the module that can handle it
         """
-        for text in texts:
+        for text in sr_response['utterance']:
             # convert text to upper case and remove any punctuation
             text = self._intentparser.cleantext(text)
             intents = self._intentparser.determine_intent(text)
             for intent in intents:
                 # Add the intent to the response so the handler method
                 # can find out which intent activated it
                 intents[intent]['intent'] = intent
+                intents[intent]['user'] = sr_response['speaker']
                 if(profile.get_arg("print_transcript")):
                     print("{} {}".format(intent, intents[intent]['score']))
                 if(profile.get_arg('save_active_audio')):
@@ -164,7 +165,7 @@ def query(self, texts):
                 return(intents[intent])
         self._logger.debug(
             "No module was able to handle any of these phrases: {}".format(
-                str(texts)
+                str(sr_response['utterance'])
             )
         )
         return (None)
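
After this change, an intent returned by brain.query() carries the speaker alongside the activating intent name. An illustrative sketch of the dict a speechhandler now receives; the 'action' and 'score' keys appear elsewhere in this commit, while the concrete values here are hypothetical:

    intent = {
        'action': handler_function,      # callable invoked as action(intent, mic)
        'score': 0.92,                   # parser confidence (hypothetical value)
        'intent': 'CheckWeatherIntent',  # added above: which intent matched
        'user': 'Aaron'                  # added above: speaker from sr_response
    }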
25 changes: 10 additions & 15 deletions naomi/conversation.py
@@ -37,12 +37,7 @@ def askName(self):
         self.mic.say(salutation)
 
     def greet(self):
-        if profile.get(['first_name']):
-            salutation = self.gettext("How can I be of service, {}?").format(
-                profile.get(["first_name"])
-            )
-        else:
-            salutation = self.gettext("How can I be of service?")
+        salutation = self.gettext("How can I be of service?")
         self.mic.say(salutation)
 
     def handleForever(self):
@@ -51,23 +46,23 @@ def handleForever(self):
         """
         self._logger.debug('Starting to handle conversation.')
         while True:
-            utterance = self.mic.listen()
+            sr_response = self.mic.listen()
             # if listen() returns False, just ignore it
-            if not isinstance(utterance, bool):
+            if not isinstance(sr_response['utterance'], bool):
                 handled = False
-                while(" ".join(utterance) != "" and not handled):
-                    utterance, handled = self.handleRequest(utterance)
+                while(" ".join(sr_response['utterance']) != "" and not handled):
+                    sr_response, handled = self.handleRequest(sr_response)
 
-    def handleRequest(self, utterance):
+    def handleRequest(self, sr_response):
         handled = False
-        intent = self.brain.query(utterance)
+        intent = self.brain.query(sr_response)
         if intent:
             try:
                 self._logger.info(intent)
                 intent['action'](intent, self.mic)
                 handled = True
             except Unexpected as e:
-                utterance = e.utterance
+                sr_response = e.sr_response
             except Exception as e:
                 self._logger.error(
                     'Failed to service intent {}: {}'.format(intent, str(e)),
@@ -84,14 +79,14 @@ def handleRequest(self, utterance):
                     "Handling of phrase '{}'",
                     "by plugin '{}' completed"
                 ]).format(
-                    utterance,
+                    sr_response['utterance'],
                     intent
                 )
             )
         else:
             self.say_i_do_not_understand()
             handled = True
-        return utterance, handled
+        return sr_response, handled
 
     def say_i_do_not_understand(self):
         self.mic.say(
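
Because handleRequest() still invokes intent['action'](intent, self.mic), a speechhandler can now personalize its reply from the new 'user' key. A hedged sketch of such a handler; the function itself is hypothetical, but the (intent, mic) signature matches the call above:

    def handle(intent, mic):
        # 'user' was attached in brain.query(); it may be None if the
        # recognizer could not name a speaker
        name = intent.get('user')
        if name:
            mic.say("Happy to help, {}!".format(name))
        else:
            mic.say("Happy to help!")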
147 changes: 75 additions & 72 deletions naomi/mic.py
@@ -35,49 +35,36 @@ class Mic(object):
     current_thread = None
 
     def __init__(
-        self,
-        input_device,
-        output_device,
-        active_stt_reply,
-        active_stt_response,
-        passive_stt_engine,
-        active_stt_engine,
-        special_stt_slug,
-        plugins,
-        tts_engine,
-        vad_plugin,
-        keyword=['NAOMI'],
-        print_transcript=False,
-        passive_listen=False,
-        save_audio=False,
-        save_passive_audio=False,
-        save_active_audio=False,
-        save_noise=False
+        self
     ):
         self._logger = logging.getLogger(__name__)
+        keyword = profile.get_profile_var(['keyword'], ['NAOMI'])
+        if isinstance(keyword, str):
+            keyword = [keyword]
         self._keyword = keyword
-        self.tts_engine = tts_engine
-        self.passive_stt_engine = passive_stt_engine
-        self.active_stt_engine = active_stt_engine
-        self.special_stt_slug = special_stt_slug
-        self.plugins = plugins
-        self._input_device = input_device
-        self._output_device = output_device
-        self._vad_plugin = vad_plugin
-        self._active_stt_reply = active_stt_reply
-        self._active_stt_response = active_stt_response
-        self.passive_listen = passive_listen
+        self.tts_engine = profile.get_arg('tts_plugin')
+        self.sr_engine = profile.get_arg('sr_plugin')
+        self.passive_stt_engine = profile.get_arg('passive_stt_plugin')
+        self.active_stt_engine = profile.get_arg('active_stt_plugin')
+        self.special_stt_slug = profile.get_arg('special_stt_slug')
+        self.plugins = profile.get_arg('plugins')
+        self._input_device = profile.get_arg('input_device')
+        self._output_device = profile.get_arg('output_device')
+        self._vad_plugin = profile.get_arg('vad_plugin')
+        self._active_stt_reply = profile.get_arg('active_stt_reply')
+        self._active_stt_response = profile.get_arg('active_stt_response')
+        self.passive_listen = profile.get_arg('passive_listen')
         # transcript for monitoring
-        self._print_transcript = print_transcript
+        self._print_transcript = profile.get_arg('print_transcript')
         # audiolog for training
-        if(save_audio):
+        if(profile.get_arg('save_audio', False)):
             self._save_passive_audio = True
             self._save_active_audio = True
             self._save_noise = True
         else:
-            self._save_passive_audio = save_passive_audio
-            self._save_active_audio = save_active_audio
-            self._save_noise = save_noise
+            self._save_passive_audio = profile.get_arg('save_passive_audio', False)
+            self._save_active_audio = profile.get_arg('save_active_audio', False)
+            self._save_noise = profile.get_arg('save_noise', False)
         if(
             (
                 self._save_active_audio
@@ -315,61 +302,70 @@ def wait_for_keyword(self, keyword=None):
             self.passive_stt_engine._volume_normalization
         ) as f:
             try:
-                transcribed = [word.upper() for word in self.passive_stt_engine.transcribe(f)]
+                sr_output = self.sr_engine.recognize_speaker(f, self.passive_stt_engine)
             except Exception:
-                transcribed = []
+                sr_output = {
+                    'speaker': None,
+                    'confidence': 0,
+                    'utterance': []
+                }
                 dbg = (self._logger.getEffectiveLevel() == logging.DEBUG)
                 self._logger.error(
                     "Passive transcription failed!",
                     exc_info=dbg
                 )
             else:
-                if(len(transcribed)):
+                if(len(sr_output['utterance'])):
                     if(self._print_transcript):
                         visualizations.run_visualization(
                             "output",
-                            f"< {transcribed}"
+                            f"< {sr_output['utterance']}"
                         )
-                    if self.check_for_keyword(transcribed, keyword):
-                        self._log_audio(f, transcribed, "passive")
+                    if self.check_for_keyword(sr_output['utterance'], keyword):
+                        self._log_audio(f, sr_output['utterance'], "passive")
                         if(self.passive_listen):
                             # Take the same block of audio and put it
                             # through the active listener
                             f.seek(0)
                             try:
-                                transcribed = [word.upper() for word in self.active_stt_engine.transcribe(f)]
+                                sr_output = self.sr_engine.recognize_speaker(f, self.active_stt_engine)
                             except Exception:
-                                transcribed = []
+                                sr_output = {
+                                    'speaker': None,
+                                    'confidence': 0,
+                                    'utterance': []
+                                }
                                 dbg = (self._logger.getEffectiveLevel() == logging.DEBUG)
                                 self._logger.error("Active transcription failed!", exc_info=dbg)
                             else:
-                                if(" ".join(transcribed).strip() == ""):
+                                if(" ".join(sr_output['utterance']).strip() == ""):
                                     if(self._print_transcript):
                                         visualizations.run_visualization("output", "<< <noise>")
-                                    self._log_audio(f, transcribed, "noise")
+                                    self._log_audio(f, sr_output, "noise")
                                 else:
                                     if(self._print_transcript):
-                                        visualizations.run_visualization("output", "<< {}".format(transcribed))
-                                    self._log_audio(f, transcribed, "active")
+                                        visualizations.run_visualization("output", "<< {}".format(sr_output['utterance']))
+                                    self._log_audio(f, sr_output, "active")
                                 if(profile.get_profile_flag(['passive_stt', 'verify_wakeword'], False)):
                                     # Check if any of the wakewords identified by
                                     # the passive stt engine appear in the active
                                     # transcript
-                                    if self.check_for_keyword(transcribed, keyword):
-                                        return transcribed
+                                    if self.check_for_keyword(sr_output['utterance'], keyword):
+                                        return sr_output
                                     else:
                                         self._logger.info('Wakeword not matched in active transcription')
                                 else:
-                                    return transcribed
+                                    return sr_output
                         else:
                             if(profile.get_profile_flag(['passive_stt', 'verify_wakeword'], False)):
-                                transcribed = [word.upper() for word in self.active_stt_engine.transcribe(f)]
-                                if self.check_for_keyword(transcribed, keyword):
-                                    return transcribed
+                                sr_output = self.sr_engine.recognize_speaker(f, self.active_stt_engine)
+                                transcribed = [word.upper() for word in sr_output['utterance']]
+                                if self.check_for_keyword(sr_output['utterance'], keyword):
+                                    return sr_output
                                 else:
                                     self._logger.info('Wakeword not matched in active transcription')
                             else:
-                                return transcribed
+                                return sr_output
                 else:
                     self._log_audio(f, transcribed, "noise")
             else:
@@ -379,7 +375,6 @@ def wait_for_keyword(self, keyword=None):
         return False
 
     def active_listen(self, play_prompts=True):
-        transcribed = []
        if(play_prompts):
            # let the user know we are listening
            if self._active_stt_reply:
@@ -403,21 +398,25 @@ def active_listen(self, play_prompts=True):
                 visualizations.run_visualization("output", ">> <boop>")
                 self.play_file(paths.data('audio', 'beep_lo.wav'))
             try:
-                transcribed = [word.upper() for word in self.active_stt_engine.transcribe(f)]
+                sr_output = self.sr_engine.recognize_speaker(f, self.active_stt_engine)
             except Exception:
-                transcribed = []
+                sr_output = {
+                    'speaker': None,
+                    'confidence': 0,
+                    'utterance': []
+                }
                 dbg = (self._logger.getEffectiveLevel() == logging.DEBUG)
                 self._logger.error("Active transcription failed!", exc_info=dbg)
             else:
-                if(" ".join(transcribed).strip() == ""):
+                if(" ".join(sr_output['utterance']).strip() == ""):
                     if(self._print_transcript):
                         visualizations.run_visualization("output", "<< <noise>")
-                    self._log_audio(f, transcribed, "noise")
+                    self._log_audio(f, sr_output, "noise")
                 else:
                     if(self._print_transcript):
-                        visualizations.run_visualization("output", "<< {}".format(transcribed))
-                    self._log_audio(f, transcribed, "active")
-        return transcribed
+                        visualizations.run_visualization("output", "<< {}".format(sr_output['utterance']))
+                    self._log_audio(f, sr_output, "active")
+        return sr_output
 
     def listen(self):
         if(self.passive_listen):
@@ -427,7 +426,11 @@ def listen(self):
         # wait_for_keyword normally returns either a list of key
         if isinstance(kw, bool):
             if(not kw):
-                return []
+                return {
+                    'speaker': None,
+                    'confidence': 0,
+                    'utterance': []
+                }
         # if not in passive_listen mode, then the user has tried to
         # interrupt, go ahead and stop talking
         self.stop(wait=True)
@@ -458,26 +461,26 @@ def expect(self, prompt, phrases, name='expect', instructions=None):
             self.add_queue(lambda: profile.set_arg('resetmic', True))
             # Now wait for any sounds in the queue to be processed
             while not profile.get_arg('resetmic'):
-                transcribed = self.listen()
+                sr_output = self.listen()
                 handled = False
-                if isinstance(transcribed, bool):
+                if isinstance(sr_output['utterance'], bool):
                     handled = True
                 else:
-                    while(" ".join(transcribed) != "" and not handled):
-                        transcribed, handled = profile.get_arg('application').conversation.handleRequest(transcribed)
+                    while(" ".join(sr_output['utterance']) != "" and not handled):
+                        sr_output, handled = profile.get_arg('application').conversation.handleRequest(sr_output)
             # Now that we are past the mic reset
             profile.set_arg('resetmic', False)
         else:
             self.say(prompt)
             # Now start listening for a response
             with self.special_mode(name, phrases):
                 while True:
-                    transcribed = self.active_listen()
-                    if(len(' '.join(transcribed))):
+                    sr_output = self.active_listen()
+                    if(len(' '.join(sr_output['utterance']))):
                         # Now that we have a transcription, check if it matches one of the phrases
-                        phrase, score = profile.get_arg("application").brain._intentparser.match_phrase(transcribed, expected_phrases)
+                        phrase, score = profile.get_arg("application").brain._intentparser.match_phrase(sr_output['utterance'], expected_phrases)
                         # If it does, then return the phrase
-                        self._logger.info("Expecting: {} Got: {}".format(expected_phrases, transcribed))
+                        self._logger.info("Expecting: {} Got: {}".format(expected_phrases, sr_output['utterance']))
                         self._logger.info("Score: {}".format(score))
                         if(score > .1):
                             return phrase
@@ -487,8 +490,8 @@ def expect(self, prompt, phrases, name='expect', instructions=None):
                         # If the user is not responding to the prompt, then assume that
                         # they are starting a new command. This should mean that the wake
                         # word would be included.
-                        if(self.check_for_keyword(transcribed)):
-                            raise Unexpected(transcribed)
+                        if(self.check_for_keyword(sr_output['utterance'])):
+                            raise Unexpected(sr_output['utterance'])
                         else:
                             # The user just said something unexpected. Remind them of their choices
                             if instructions is None:
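
The contract change for mic callers: listen(), active_listen() and wait_for_keyword() now return the sr_output dict (or False from wait_for_keyword) instead of a bare word list, so call sites index ['utterance'] and can consult ['speaker'] and ['confidence']. A brief usage sketch, assuming an already constructed Mic instance and hypothetical values:

    sr_output = mic.active_listen()
    if " ".join(sr_output['utterance']).strip():
        # e.g. {'speaker': 'Aaron', 'confidence': 0,
        #       'utterance': ['WHAT', 'TIME', 'IS', 'IT']}
        print(sr_output['speaker'], sr_output['utterance'])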
1 change: 1 addition & 0 deletions naomi/pluginstore.py
@@ -209,6 +209,7 @@ def __init__(self, plugin_dirs=None):
             'speechhandler': plugin.SpeechHandlerPlugin,
             'tti': plugin.TTIPlugin,
             'tts': plugin.TTSPlugin,
+            'sr': plugin.SRPlugin,
             'stt': plugin.STTPlugin,
             'stt_trainer': plugin.STTTrainerPlugin,
             'vad': plugin.VADPlugin,
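
The plugin.SRPlugin base class registered here is not part of the shown diff; a minimal sketch of what it plausibly declares, mirroring how the engine is invoked from mic.py (an assumption, including the GenericPlugin parent):

    class SRPlugin(GenericPlugin):
        def recognize_speaker(self, fp, stt_engine):
            # Subclasses identify the speaker in the audio at fp, using
            # stt_engine for the transcription, and return a dict with
            # 'speaker', 'confidence' and 'utterance' keys.
            raise NotImplementedError()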