Fixed problems with speech recognizer
There were numerous problems with the speech recognizer class that
prevented it from working with the VOSK_sr plugin.

I am no longer trying to recognize the speaker while listening
passively for the wake word, only during active listening, because
passive listening needs to be very fast.

I am now putting the name of the identified user in parentheses
after the active listening transcript. Plugins can access the
identity of the speaker as `intent.get('user', '')`. The only
plugin currently set up to use this is the shutdown plugin. I
also have an update to the Greetings plugin that greets you by
name when you greet Naomi.
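
As an illustration (a sketch, not code from this commit -- the
greeting text is made up), a speech handler can read the speaker's
identity inside its handle() method like this:

    def handle(self, intent, mic):
        # 'user' is filled in during active listening; it is an
        # empty string if no speaker was identified.
        name = intent.get('user', '')
        if name:
            mic.say("Hello, {}!".format(name))
        else:
            mic.say("Hello!")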

The setup still assumes en-US when downloading the VOSK models,
which needs to be fixed to respect the "language" setting in the
profile.
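
One possible shape for that fix (just a sketch; the model names come
from the public VOSK model list, but the mapping itself is an
assumption, not code in this commit):

    from naomi import profile

    # Read the configured language, falling back to en-US.
    language = profile.get_profile_var(['language'], 'en-US')

    # Map the profile language to a VOSK model name; see
    # https://alphacephei.com/vosk/models for the available models.
    VOSK_MODELS = {
        'en-US': 'vosk-model-small-en-us-0.15',
        'fr-FR': 'vosk-model-small-fr-0.22',
        'de-DE': 'vosk-model-small-de-0.15',
    }
    model_name = VOSK_MODELS.get(language, VOSK_MODELS['en-US'])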

The VOSK speaker recognition is not terribly accurate. It also
seems like you need to retrain your speaker recognition database
from new recordings when you switch to different recording
hardware.
aaronchantrill committed Nov 6, 2022
1 parent 26a663a commit dab0b66
Showing 6 changed files with 201 additions and 173 deletions.
2 changes: 1 addition & 1 deletion naomi/application.py
@@ -349,7 +349,7 @@ def __init__(
                 vad_slug,
                 category='vad'
             )
-            vad_plugin = vad_info.plugin_class(input_device)
+            vad_plugin = vad_info.plugin_class(input_device, vad_info)
 
             # Initialize Brain
             tti_slug = profile.get_profile_var(['tti_engine'], 'Naomi TTI')
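
Note: the extra vad_info argument matches the reworked VADPlugin
constructor in naomi/plugin.py below, which now runs
GenericPlugin.__init__ and reads its tuning values from kwargs. Under
the new signature, a caller wanting non-default values would
presumably do something like (a sketch, not code from this commit):

    vad_plugin = vad_info.plugin_class(
        input_device,
        vad_info,
        timeout=1,            # seconds of silence before giving up
        minimum_capture=0.5   # shortest capture treated as audio
    )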
34 changes: 21 additions & 13 deletions naomi/mic.py
@@ -302,27 +302,23 @@ def wait_for_keyword(self, keyword=None):
             self.passive_stt_engine._volume_normalization
         ) as f:
             try:
-                sr_output = self.sr_engine.recognize_speaker(f, self.passive_stt_engine)
+                transcribed = [word.upper() for word in self.passive_stt_engine.transcribe(f)]
             except Exception:
-                sr_output = {
-                    'speaker': None,
-                    'confidence': 0,
-                    'utterance': []
-                }
+                transcribed = []
                 dbg = (self._logger.getEffectiveLevel() == logging.DEBUG)
                 self._logger.error(
                     "Passive transcription failed!",
                     exc_info=dbg
                 )
             else:
-                if(len(sr_output['utterance'])):
+                if(len(transcribed)):
                     if(self._print_transcript):
                         visualizations.run_visualization(
                             "output",
-                            f"< {sr_output['utterance']}"
+                            f"< {transcribed}"
                         )
-                    if self.check_for_keyword(sr_output['utterance'], keyword):
-                        self._log_audio(f, sr_output['utterance'], "passive")
+                    if self.check_for_keyword(transcribed, keyword):
+                        self._log_audio(f, transcribed, "passive")
                         if(self.passive_listen):
                             # Take the same block of audio and put it
                             # through the active listener
@@ -344,8 +340,14 @@ def wait_for_keyword(self, keyword=None):
                             self._log_audio(f, sr_output, "noise")
                         else:
                             if(self._print_transcript):
-                                visualizations.run_visualization("output", "<< {}".format(sr_output['utterance']))
-                            self._log_audio(f, sr_output, "active")
+                                visualizations.run_visualization(
+                                    "output",
+                                    "<< {} ({})".format(
+                                        sr_output['utterance'],
+                                        sr_output['speaker']
+                                    )
+                                )
+                            self._log_audio(f, sr_output['utterance'], "active")
                             if(profile.get_profile_flag(['passive_stt', 'verify_wakeword'], False)):
                                 # Check if any of the wakewords identified by
                                 # the passive stt engine appear in the active
@@ -414,7 +416,13 @@ def active_listen(self, play_prompts=True):
                     self._log_audio(f, sr_output, "noise")
                 else:
                     if(self._print_transcript):
-                        visualizations.run_visualization("output", "<< {}".format(sr_output['utterance']))
+                        visualizations.run_visualization(
+                            "output",
+                            "<< {} ({})".format(
+                                sr_output['utterance'],
+                                sr_output['speaker']
+                            )
+                        )
                     self._log_audio(f, sr_output, "active")
                     return sr_output
 
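
For reference, the sr_output dict used above has the shape that the
old fallback in wait_for_keyword spelled out (the values here are
invented for illustration):

    sr_output = {
        'speaker': 'Aaron',               # identified user, or None
        'confidence': 0.87,               # match confidence
        'utterance': ['NAOMI', 'HELLO']   # transcribed words
    }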
10 changes: 7 additions & 3 deletions naomi/plugin.py
@@ -88,12 +88,14 @@ def intents(self):
     def handle(self, intent, mic):
         pass
 
+
 class SRPlugin(GenericPlugin, metaclass=abc.ABCMeta):
     def recognize_speaker(self, filename):
         speaker = None
         utterance = profile.get_arg('active_stt_engine').transcribe(filename)
         return {'speaker': speaker, 'utterance': utterance}
 
+
 class STTPlugin(GenericPlugin, metaclass=abc.ABCMeta):
     def __init__(self, name, phrases, *args, **kwargs):
         GenericPlugin.__init__(self, *args, **kwargs)
@@ -165,13 +167,15 @@ class VADPlugin(GenericPlugin):
     # and after last voice detected
     # minimum capture is minimum audio to capture, minus the padding
     # at the front and end
-    def __init__(self, input_device, timeout=1, minimum_capture=0.5):
-        self._logger = logging.getLogger(__name__)
+    def __init__(self, input_device, *args, **kwargs):
+        GenericPlugin.__init__(self, *args, **kwargs)
         # input device
         self._input_device = input_device
         # Here is the number of frames that have to pass without
         # detecing a voice before we respond
         chunklength = input_device._input_chunksize / input_device._input_rate
+        timeout = kwargs.get('timeout', 1)
+        minimum_capture = kwargs.get('minimum_capture', 0.5)
         self._timeout = round(timeout / chunklength)
         # Mimimum capture frames is the smallest number of frames that will
         # be recognized as audio.
@@ -466,7 +470,7 @@ def cleantext(self, text):
     @staticmethod
     def getcontractions(phrase):
         return [phrase]
 
     def match_phrase(self, phrase, choices):
         # If phrase is a list, convert to a string
         # (otherwise the "split" below throws an error)
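
The base SRPlugin.recognize_speaker above just transcribes and
returns speaker=None, while mic.py calls it as
recognize_speaker(f, stt_engine). A rough sketch of what a real
engine such as VOSK_sr might do (the embedding step and the SPEAKERS
table are hypothetical, not code from this commit):

    import math

    from naomi import plugin


    def cosine_similarity(a, b):
        # Cosine similarity between two equal-length vectors.
        dot = sum(x * y for x, y in zip(a, b))
        norm = (math.sqrt(sum(x * x for x in a))
                * math.sqrt(sum(y * y for y in b)))
        return dot / norm if norm else 0.0


    class ExampleSRPlugin(plugin.SRPlugin):
        # Enrolled speakers: name -> embedding vector, computed from
        # training recordings. Empty here; a real engine fills it in.
        SPEAKERS = {}

        def extract_embedding(self, fp):
            # Hypothetical: run the audio through a speaker model
            # (e.g. an x-vector extractor) and return a vector.
            raise NotImplementedError

        def recognize_speaker(self, fp, stt_engine):
            # Transcribe with the supplied STT engine, as mic.py does.
            utterance = [
                word.upper() for word in stt_engine.transcribe(fp)
            ]
            speaker, confidence = None, 0
            try:
                embedding = self.extract_embedding(fp)
                for name, enrolled in self.SPEAKERS.items():
                    score = cosine_similarity(embedding, enrolled)
                    if score > confidence:
                        speaker, confidence = name, score
            except NotImplementedError:
                pass
            return {
                'speaker': speaker,
                'confidence': confidence,
                'utterance': utterance
            }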
4 changes: 2 additions & 2 deletions plugins/speechhandler/shutdownplugin/shutdown.py
@@ -1,7 +1,6 @@
 # -*- coding: utf-8 -*-
 import random
 from naomi import plugin
-from naomi import profile
 
 
 class ShutdownPlugin(plugin.SpeechHandlerPlugin):
@@ -12,6 +11,7 @@ def intents(self):
             'en-US': {
                 'templates': [
                     "SHUTDOWN",
+                    "SHUT DOWN",
                     "TURN YOURSELF OFF"
                 ]
             },
@@ -41,7 +41,7 @@ def handle(self, intent, mic):
         text -- user-input, typically transcribed speech
         mic -- used to interact with the user (for both input and output)
         """
-        name = profile.get_profile_var(['first_name'], '')
+        name = intent.get('user', '')
 
         messages = [
             self.gettext("I'm shutting down."),