Added a default speaker recognizer
Added a default speaker recognizer, default_sr, which does not
attempt to identify the speaker but simply reports the first
name stored in the profile.

I am also passing the result from the sr_plugin through to the
intent parser, so that the identity of the speaker can be
attached to the intent object being passed to the speechhandler.

I also simplified the list of parameters passed to the mic
object when it is created, storing them in the profile instead.
aaronchantrill committed Oct 17, 2022
1 parent 421d062 commit 26a663a
Showing 8 changed files with 144 additions and 104 deletions.
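
The sr_response dict that threads through these changes carries three keys, visible in the fallback values in mic.py below: 'speaker', 'confidence' and 'utterance'. A minimal sketch of what the new default_sr plugin plausibly looks like, inferred from the recognize_speaker(fp, stt_engine) calls and the dict shape in this diff; the class name, the zero confidence value and the import details are assumptions:

    from naomi import plugin, profile


    class DefaultSRPlugin(plugin.SRPlugin):
        # Hypothetical sketch: no actual voice identification is attempted.
        def recognize_speaker(self, fp, stt_engine):
            # Transcribe the audio with the supplied STT engine, then
            # report the profile owner's first name as the speaker.
            utterance = [word.upper() for word in stt_engine.transcribe(fp)]
            return {
                'speaker': profile.get_profile_var(['first_name'], None),
                'confidence': 0,  # assumed: nothing was actually verified
                'utterance': utterance
            }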
9 changes: 8 additions & 1 deletion naomi/application.py
@@ -500,13 +500,20 @@ def __init__(
         tts_plugin_info = profile.get_arg('plugins').get_plugin(tts_slug, category='tts')
         tts_plugin = tts_plugin_info.plugin_class(tts_plugin_info)
 
+        # Initialize Speaker Recognition Engine
+        sr_slug = profile.get_profile_var(['sr_engine'], 'default_sr')
+        sr_plugin_info = profile.get_arg('plugins').get_plugin(sr_slug, category='sr')
+        sr_plugin = sr_plugin_info.plugin_class(sr_plugin_info)
+        profile.set_arg('sr_plugin', sr_plugin)
+
         # audiolog for training
         if(save_audio):
             save_passive_audio = True
             save_active_audio = True
             save_noise = True
 
-        # Instead of passing the following values to
+        # Instead of passing the following values to mic, store them here and
+        # pick them up when needed.
         profile.set_arg('input_device', input_device)
         profile.set_arg('output_device', output_device)
+        profile.set_arg('sr_plugin', sr_plugin)
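
With this change the profile module doubles as a process-wide registry: application.py stores values once with set_arg(), and other modules read them back with get_arg() instead of receiving them as constructor parameters. A toy illustration of the pattern (the 'print_transcript' key appears in this commit; the reader function is hypothetical):

    from naomi import profile

    # at startup (application.py)
    profile.set_arg('print_transcript', True)

    # later, anywhere else in the process (e.g. mic.py)
    def should_print_transcript():
        # read the stored value back; no parameter threading required
        return profile.get_arg('print_transcript')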
7 changes: 4 additions & 3 deletions naomi/brain.py
@@ -104,7 +104,7 @@ def get_all_phrases(self):
         phrases.extend(self.get_plugin_phrases())
         return sorted(list(set(phrases)))
 
-    def query(self, texts):
+    def query(self, sr_response):
         """
         Passes user input to the appropriate module, testing it against
         each candidate module's isValid function.
@@ -119,14 +119,15 @@ def query(self, texts):
         Returns:
             A tuple containing a text and the module that can handle it
         """
-        for text in texts:
+        for text in sr_response['utterance']:
             # convert text to upper case and remove any punctuation
             text = self._intentparser.cleantext(text)
             intents = self._intentparser.determine_intent(text)
             for intent in intents:
                 # Add the intent to the response so the handler method
                 # can find out which intent activated it
                 intents[intent]['intent'] = intent
+                intents[intent]['user'] = sr_response['speaker']
                 if(profile.get_arg("print_transcript")):
                     print("{} {}".format(intent, intents[intent]['score']))
                 if(profile.get_arg('save_active_audio')):
@@ -164,7 +165,7 @@ def query(self, texts):
                 return(intents[intent])
         self._logger.debug(
             "No module was able to handle any of these phrases: {}".format(
-                str(texts)
+                str(sr_response['utterance'])
             )
         )
         return (None)
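
After this change, an intent returned by brain.query() carries the speaker alongside the activating intent name. An illustrative sketch of the dict a speechhandler now receives; the 'action' and 'score' keys appear elsewhere in this commit, while the concrete values here are hypothetical:

    intent = {
        'action': handler_function,      # callable invoked as action(intent, mic)
        'score': 0.92,                   # parser confidence (hypothetical value)
        'intent': 'CheckWeatherIntent',  # added above: which intent matched
        'user': 'Aaron'                  # added above: speaker from sr_response
    }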
25 changes: 10 additions & 15 deletions naomi/conversation.py
@@ -37,12 +37,7 @@ def askName(self):
         self.mic.say(salutation)
 
     def greet(self):
-        if profile.get(['first_name']):
-            salutation = self.gettext("How can I be of service, {}?").format(
-                profile.get(["first_name"])
-            )
-        else:
-            salutation = self.gettext("How can I be of service?")
+        salutation = self.gettext("How can I be of service?")
         self.mic.say(salutation)
 
     def handleForever(self):
@@ -51,23 +46,23 @@ def handleForever(self):
         """
         self._logger.debug('Starting to handle conversation.')
         while True:
-            utterance = self.mic.listen()
+            sr_response = self.mic.listen()
             # if listen() returns False, just ignore it
-            if not isinstance(utterance, bool):
+            if not isinstance(sr_response['utterance'], bool):
                 handled = False
-                while(" ".join(utterance) != "" and not handled):
-                    utterance, handled = self.handleRequest(utterance)
+                while(" ".join(sr_response['utterance']) != "" and not handled):
+                    sr_response, handled = self.handleRequest(sr_response)
 
-    def handleRequest(self, utterance):
+    def handleRequest(self, sr_response):
         handled = False
-        intent = self.brain.query(utterance)
+        intent = self.brain.query(sr_response)
         if intent:
             try:
                 self._logger.info(intent)
                 intent['action'](intent, self.mic)
                 handled = True
             except Unexpected as e:
-                utterance = e.utterance
+                sr_response = e.sr_response
             except Exception as e:
                 self._logger.error(
                     'Failed to service intent {}: {}'.format(intent, str(e)),
@@ -84,14 +79,14 @@ def handleRequest(self, utterance):
                     "Handling of phrase '{}'",
                     "by plugin '{}' completed"
                 ]).format(
-                    utterance,
+                    sr_response['utterance'],
                     intent
                 )
             )
         else:
             self.say_i_do_not_understand()
             handled = True
-        return utterance, handled
+        return sr_response, handled
 
     def say_i_do_not_understand(self):
         self.mic.say(
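
Because handleRequest() still invokes intent['action'](intent, self.mic), a speechhandler can now personalize its reply from the new 'user' key. A hedged sketch of such a handler; the function itself is hypothetical, but the (intent, mic) signature matches the call above:

    def handle(intent, mic):
        # 'user' was attached in brain.query(); it may be None if the
        # recognizer could not name a speaker
        name = intent.get('user')
        if name:
            mic.say("Happy to help, {}!".format(name))
        else:
            mic.say("Happy to help!")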
147 changes: 75 additions & 72 deletions naomi/mic.py
@@ -35,49 +35,36 @@ class Mic(object):
     current_thread = None
 
     def __init__(
-        self,
-        input_device,
-        output_device,
-        active_stt_reply,
-        active_stt_response,
-        passive_stt_engine,
-        active_stt_engine,
-        special_stt_slug,
-        plugins,
-        tts_engine,
-        vad_plugin,
-        keyword=['NAOMI'],
-        print_transcript=False,
-        passive_listen=False,
-        save_audio=False,
-        save_passive_audio=False,
-        save_active_audio=False,
-        save_noise=False
+        self
     ):
         self._logger = logging.getLogger(__name__)
+        keyword = profile.get_profile_var(['keyword'], ['NAOMI'])
+        if isinstance(keyword, str):
+            keyword = [keyword]
         self._keyword = keyword
-        self.tts_engine = tts_engine
-        self.passive_stt_engine = passive_stt_engine
-        self.active_stt_engine = active_stt_engine
-        self.special_stt_slug = special_stt_slug
-        self.plugins = plugins
-        self._input_device = input_device
-        self._output_device = output_device
-        self._vad_plugin = vad_plugin
-        self._active_stt_reply = active_stt_reply
-        self._active_stt_response = active_stt_response
-        self.passive_listen = passive_listen
+        self.tts_engine = profile.get_arg('tts_plugin')
+        self.sr_engine = profile.get_arg('sr_plugin')
+        self.passive_stt_engine = profile.get_arg('passive_stt_plugin')
+        self.active_stt_engine = profile.get_arg('active_stt_plugin')
+        self.special_stt_slug = profile.get_arg('special_stt_slug')
+        self.plugins = profile.get_arg('plugins')
+        self._input_device = profile.get_arg('input_device')
+        self._output_device = profile.get_arg('output_device')
+        self._vad_plugin = profile.get_arg('vad_plugin')
+        self._active_stt_reply = profile.get_arg('active_stt_reply')
+        self._active_stt_response = profile.get_arg('active_stt_response')
+        self.passive_listen = profile.get_arg('passive_listen')
         # transcript for monitoring
-        self._print_transcript = print_transcript
+        self._print_transcript = profile.get_arg('print_transcript')
         # audiolog for training
-        if(save_audio):
+        if(profile.get_arg('save_audio', False)):
             self._save_passive_audio = True
             self._save_active_audio = True
             self._save_noise = True
         else:
-            self._save_passive_audio = save_passive_audio
-            self._save_active_audio = save_active_audio
-            self._save_noise = save_noise
+            self._save_passive_audio = profile.get_arg('save_passive_audio', False)
+            self._save_active_audio = profile.get_arg('save_active_audio', False)
+            self._save_noise = profile.get_arg('save_noise', False)
         if(
             (
                 self._save_active_audio
@@ -315,61 +302,70 @@ def wait_for_keyword(self, keyword=None):
             self.passive_stt_engine._volume_normalization
         ) as f:
             try:
-                transcribed = [word.upper() for word in self.passive_stt_engine.transcribe(f)]
+                sr_output = self.sr_engine.recognize_speaker(f, self.passive_stt_engine)
             except Exception:
-                transcribed = []
+                sr_output = {
+                    'speaker': None,
+                    'confidence': 0,
+                    'utterance': []
+                }
                 dbg = (self._logger.getEffectiveLevel() == logging.DEBUG)
                 self._logger.error(
                     "Passive transcription failed!",
                     exc_info=dbg
                 )
             else:
-                if(len(transcribed)):
+                if(len(sr_output['utterance'])):
                     if(self._print_transcript):
                         visualizations.run_visualization(
                             "output",
-                            f"< {transcribed}"
+                            f"< {sr_output['utterance']}"
                         )
-                    if self.check_for_keyword(transcribed, keyword):
-                        self._log_audio(f, transcribed, "passive")
+                    if self.check_for_keyword(sr_output['utterance'], keyword):
+                        self._log_audio(f, sr_output['utterance'], "passive")
                         if(self.passive_listen):
                             # Take the same block of audio and put it
                             # through the active listener
                             f.seek(0)
                             try:
-                                transcribed = [word.upper() for word in self.active_stt_engine.transcribe(f)]
+                                sr_output = self.sr_engine.recognize_speaker(f, self.active_stt_engine)
                             except Exception:
-                                transcribed = []
+                                sr_output = {
+                                    'speaker': None,
+                                    'confidence': 0,
+                                    'utterance': []
+                                }
                                 dbg = (self._logger.getEffectiveLevel() == logging.DEBUG)
                                 self._logger.error("Active transcription failed!", exc_info=dbg)
                             else:
-                                if(" ".join(transcribed).strip() == ""):
+                                if(" ".join(sr_output['utterance']).strip() == ""):
                                     if(self._print_transcript):
                                         visualizations.run_visualization("output", "<< <noise>")
-                                    self._log_audio(f, transcribed, "noise")
+                                    self._log_audio(f, sr_output, "noise")
                                 else:
                                     if(self._print_transcript):
-                                        visualizations.run_visualization("output", "<< {}".format(transcribed))
-                                    self._log_audio(f, transcribed, "active")
+                                        visualizations.run_visualization("output", "<< {}".format(sr_output['utterance']))
+                                    self._log_audio(f, sr_output, "active")
                                 if(profile.get_profile_flag(['passive_stt', 'verify_wakeword'], False)):
                                     # Check if any of the wakewords identified by
                                     # the passive stt engine appear in the active
                                     # transcript
-                                    if self.check_for_keyword(transcribed, keyword):
-                                        return transcribed
+                                    if self.check_for_keyword(sr_output['utterance'], keyword):
+                                        return sr_output
                                     else:
                                         self._logger.info('Wakeword not matched in active transcription')
                                 else:
-                                    return transcribed
+                                    return sr_output
                         else:
                             if(profile.get_profile_flag(['passive_stt', 'verify_wakeword'], False)):
-                                transcribed = [word.upper() for word in self.active_stt_engine.transcribe(f)]
-                                if self.check_for_keyword(transcribed, keyword):
-                                    return transcribed
+                                sr_output = self.sr_engine.recognize_speaker(f, self.active_stt_engine)
+                                transcribed = [word.upper() for word in sr_output['utterance']]
+                                if self.check_for_keyword(sr_output['utterance'], keyword):
+                                    return sr_output
                                 else:
                                     self._logger.info('Wakeword not matched in active transcription')
                             else:
-                                return transcribed
+                                return sr_output
                 else:
                     self._log_audio(f, transcribed, "noise")
             else:
@@ -379,7 +375,6 @@ def wait_for_keyword(self, keyword=None):
         return False
 
     def active_listen(self, play_prompts=True):
-        transcribed = []
        if(play_prompts):
            # let the user know we are listening
            if self._active_stt_reply:
@@ -403,21 +398,25 @@ def active_listen(self, play_prompts=True):
                 visualizations.run_visualization("output", ">> <boop>")
                 self.play_file(paths.data('audio', 'beep_lo.wav'))
             try:
-                transcribed = [word.upper() for word in self.active_stt_engine.transcribe(f)]
+                sr_output = self.sr_engine.recognize_speaker(f, self.active_stt_engine)
             except Exception:
-                transcribed = []
+                sr_output = {
+                    'speaker': None,
+                    'confidence': 0,
+                    'utterance': []
+                }
                 dbg = (self._logger.getEffectiveLevel() == logging.DEBUG)
                 self._logger.error("Active transcription failed!", exc_info=dbg)
             else:
-                if(" ".join(transcribed).strip() == ""):
+                if(" ".join(sr_output['utterance']).strip() == ""):
                     if(self._print_transcript):
                         visualizations.run_visualization("output", "<< <noise>")
-                    self._log_audio(f, transcribed, "noise")
+                    self._log_audio(f, sr_output, "noise")
                 else:
                     if(self._print_transcript):
-                        visualizations.run_visualization("output", "<< {}".format(transcribed))
-                    self._log_audio(f, transcribed, "active")
-        return transcribed
+                        visualizations.run_visualization("output", "<< {}".format(sr_output['utterance']))
+                    self._log_audio(f, sr_output, "active")
+        return sr_output
 
     def listen(self):
         if(self.passive_listen):
@@ -427,7 +426,11 @@ def listen(self):
         # wait_for_keyword normally returns either a list of key
         if isinstance(kw, bool):
             if(not kw):
-                return []
+                return {
+                    'speaker': None,
+                    'confidence': 0,
+                    'utterance': []
+                }
         # if not in passive_listen mode, then the user has tried to
         # interrupt, go ahead and stop talking
         self.stop(wait=True)
@@ -458,26 +461,26 @@ def expect(self, prompt, phrases, name='expect', instructions=None):
             self.add_queue(lambda: profile.set_arg('resetmic', True))
             # Now wait for any sounds in the queue to be processed
             while not profile.get_arg('resetmic'):
-                transcribed = self.listen()
+                sr_output = self.listen()
                 handled = False
-                if isinstance(transcribed, bool):
+                if isinstance(sr_output['utterance'], bool):
                     handled = True
                 else:
-                    while(" ".join(transcribed) != "" and not handled):
-                        transcribed, handled = profile.get_arg('application').conversation.handleRequest(transcribed)
+                    while(" ".join(sr_output['utterance']) != "" and not handled):
+                        sr_output, handled = profile.get_arg('application').conversation.handleRequest(sr_output)
             # Now that we are past the mic reset
             profile.set_arg('resetmic', False)
         else:
             self.say(prompt)
             # Now start listening for a response
             with self.special_mode(name, phrases):
                 while True:
-                    transcribed = self.active_listen()
-                    if(len(' '.join(transcribed))):
+                    sr_output = self.active_listen()
+                    if(len(' '.join(sr_output['utterance']))):
                         # Now that we have a transcription, check if it matches one of the phrases
-                        phrase, score = profile.get_arg("application").brain._intentparser.match_phrase(transcribed, expected_phrases)
+                        phrase, score = profile.get_arg("application").brain._intentparser.match_phrase(sr_output['utterance'], expected_phrases)
                         # If it does, then return the phrase
-                        self._logger.info("Expecting: {} Got: {}".format(expected_phrases, transcribed))
+                        self._logger.info("Expecting: {} Got: {}".format(expected_phrases, sr_output['utterance']))
                         self._logger.info("Score: {}".format(score))
                         if(score > .1):
                             return phrase
@@ -487,8 +490,8 @@ def expect(self, prompt, phrases, name='expect', instructions=None):
                         # If the user is not responding to the prompt, then assume that
                         # they are starting a new command. This should mean that the wake
                         # word would be included.
-                        if(self.check_for_keyword(transcribed)):
-                            raise Unexpected(transcribed)
+                        if(self.check_for_keyword(sr_output['utterance'])):
+                            raise Unexpected(sr_output['utterance'])
                         else:
                             # The user just said something unexpected. Remind them of their choices
                             if instructions is None:
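
The contract change for mic callers: listen(), active_listen() and wait_for_keyword() now return the sr_output dict (or False from wait_for_keyword) instead of a bare word list, so call sites index ['utterance'] and can consult ['speaker'] and ['confidence']. A brief usage sketch, assuming an already constructed Mic instance and hypothetical values:

    sr_output = mic.active_listen()
    if " ".join(sr_output['utterance']).strip():
        # e.g. {'speaker': 'Aaron', 'confidence': 0,
        #       'utterance': ['WHAT', 'TIME', 'IS', 'IT']}
        print(sr_output['speaker'], sr_output['utterance'])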
1 change: 1 addition & 0 deletions naomi/pluginstore.py
@@ -209,6 +209,7 @@ def __init__(self, plugin_dirs=None):
             'speechhandler': plugin.SpeechHandlerPlugin,
             'tti': plugin.TTIPlugin,
             'tts': plugin.TTSPlugin,
+            'sr': plugin.SRPlugin,
             'stt': plugin.STTPlugin,
             'stt_trainer': plugin.STTTrainerPlugin,
             'vad': plugin.VADPlugin,
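
The plugin.SRPlugin base class registered here is not part of the shown diff; a minimal sketch of what it plausibly declares, mirroring how the engine is invoked from mic.py (an assumption, including the GenericPlugin parent):

    class SRPlugin(GenericPlugin):
        def recognize_speaker(self, fp, stt_engine):
            # Subclasses identify the speaker in the audio at fp, using
            # stt_engine for the transcription, and return a dict with
            # 'speaker', 'confidence' and 'utterance' keys.
            raise NotImplementedError()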