Fixed problems with speech recognizer
There were numerous problems with the speech recognizer class that
prevented it from working with the VOSK_sr plugin.

I am no longer trying to recognize the speaker while listening
passively for the wake word, only during active listening, because
passive listening needs to be very fast.

I am now putting the name of the identified user in parentheses
after the active listening transcript. Plugins can access the
identity of the speaker as `intent.get('user', '')`. The only
plugin currently set up to use this is the shutdown plugin. I
also have an update to the Greetings plugin that greets you by
name when you greet Naomi.
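
As an illustration (a sketch, not code from this commit -- the
greeting text is made up), a speech handler can read the speaker's
identity inside its handle() method like this:

    def handle(self, intent, mic):
        # 'user' is filled in during active listening; it is an
        # empty string if no speaker was identified.
        name = intent.get('user', '')
        if name:
            mic.say("Hello, {}!".format(name))
        else:
            mic.say("Hello!")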

The setup still assumes en-US when downloading the VOSK models,
which needs to be fixed to respect the "language" setting in the
profile.
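
One possible shape for that fix (just a sketch; the model names come
from the public VOSK model list, but the mapping itself is an
assumption, not code in this commit):

    from naomi import profile

    # Read the configured language, falling back to en-US.
    language = profile.get_profile_var(['language'], 'en-US')

    # Map the profile language to a VOSK model name; see
    # https://alphacephei.com/vosk/models for the available models.
    VOSK_MODELS = {
        'en-US': 'vosk-model-small-en-us-0.15',
        'fr-FR': 'vosk-model-small-fr-0.22',
        'de-DE': 'vosk-model-small-de-0.15',
    }
    model_name = VOSK_MODELS.get(language, VOSK_MODELS['en-US'])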

The VOSK speaker recognition is not terribly accurate. It also
seems like you need to retrain your speaker recognition database
from new recordings when you switch to different recording
hardware.
aaronchantrill committed Nov 6, 2022
1 parent 26a663a commit dab0b66
Showing 6 changed files with 201 additions and 173 deletions.
2 changes: 1 addition & 1 deletion naomi/application.py
@@ -349,7 +349,7 @@ def __init__(
                 vad_slug,
                 category='vad'
             )
-            vad_plugin = vad_info.plugin_class(input_device)
+            vad_plugin = vad_info.plugin_class(input_device, vad_info)
 
             # Initialize Brain
             tti_slug = profile.get_profile_var(['tti_engine'], 'Naomi TTI')
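
Note: the extra vad_info argument matches the reworked VADPlugin
constructor in naomi/plugin.py below, which now runs
GenericPlugin.__init__ and reads its tuning values from kwargs. Under
the new signature, a caller wanting non-default values would
presumably do something like (a sketch, not code from this commit):

    vad_plugin = vad_info.plugin_class(
        input_device,
        vad_info,
        timeout=1,            # seconds of silence before giving up
        minimum_capture=0.5   # shortest capture treated as audio
    )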
34 changes: 21 additions & 13 deletions naomi/mic.py
@@ -302,27 +302,23 @@ def wait_for_keyword(self, keyword=None):
             self.passive_stt_engine._volume_normalization
         ) as f:
             try:
-                sr_output = self.sr_engine.recognize_speaker(f, self.passive_stt_engine)
+                transcribed = [word.upper() for word in self.passive_stt_engine.transcribe(f)]
             except Exception:
-                sr_output = {
-                    'speaker': None,
-                    'confidence': 0,
-                    'utterance': []
-                }
+                transcribed = []
                 dbg = (self._logger.getEffectiveLevel() == logging.DEBUG)
                 self._logger.error(
                     "Passive transcription failed!",
                     exc_info=dbg
                 )
             else:
-                if(len(sr_output['utterance'])):
+                if(len(transcribed)):
                     if(self._print_transcript):
                         visualizations.run_visualization(
                             "output",
-                            f"< {sr_output['utterance']}"
+                            f"< {transcribed}"
                         )
-                    if self.check_for_keyword(sr_output['utterance'], keyword):
-                        self._log_audio(f, sr_output['utterance'], "passive")
+                    if self.check_for_keyword(transcribed, keyword):
+                        self._log_audio(f, transcribed, "passive")
                         if(self.passive_listen):
                             # Take the same block of audio and put it
                             # through the active listener
@@ -344,8 +340,14 @@ def wait_for_keyword(self, keyword=None):
                             self._log_audio(f, sr_output, "noise")
                         else:
                             if(self._print_transcript):
-                                visualizations.run_visualization("output", "<< {}".format(sr_output['utterance']))
-                            self._log_audio(f, sr_output, "active")
+                                visualizations.run_visualization(
+                                    "output",
+                                    "<< {} ({})".format(
+                                        sr_output['utterance'],
+                                        sr_output['speaker']
+                                    )
+                                )
+                            self._log_audio(f, sr_output['utterance'], "active")
                             if(profile.get_profile_flag(['passive_stt', 'verify_wakeword'], False)):
                                 # Check if any of the wakewords identified by
                                 # the passive stt engine appear in the active
@@ -414,7 +416,13 @@ def active_listen(self, play_prompts=True):
                     self._log_audio(f, sr_output, "noise")
                 else:
                     if(self._print_transcript):
-                        visualizations.run_visualization("output", "<< {}".format(sr_output['utterance']))
+                        visualizations.run_visualization(
+                            "output",
+                            "<< {} ({})".format(
+                                sr_output['utterance'],
+                                sr_output['speaker']
+                            )
+                        )
                     self._log_audio(f, sr_output, "active")
                     return sr_output
 
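
For reference, the sr_output dict used above has the shape that the
old fallback in wait_for_keyword spelled out (the values here are
invented for illustration):

    sr_output = {
        'speaker': 'Aaron',               # identified user, or None
        'confidence': 0.87,               # match confidence
        'utterance': ['NAOMI', 'HELLO']   # transcribed words
    }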
10 changes: 7 additions & 3 deletions naomi/plugin.py
@@ -88,12 +88,14 @@ def intents(self):
     def handle(self, intent, mic):
         pass
 
+
 class SRPlugin(GenericPlugin, metaclass=abc.ABCMeta):
     def recognize_speaker(self, filename):
         speaker = None
         utterance = profile.get_arg('active_stt_engine').transcribe(filename)
         return {'speaker': speaker, 'utterance': utterance}
 
+
 class STTPlugin(GenericPlugin, metaclass=abc.ABCMeta):
     def __init__(self, name, phrases, *args, **kwargs):
         GenericPlugin.__init__(self, *args, **kwargs)
@@ -165,13 +167,15 @@ class VADPlugin(GenericPlugin):
     # and after last voice detected
     # minimum capture is minimum audio to capture, minus the padding
     # at the front and end
-    def __init__(self, input_device, timeout=1, minimum_capture=0.5):
-        self._logger = logging.getLogger(__name__)
+    def __init__(self, input_device, *args, **kwargs):
+        GenericPlugin.__init__(self, *args, **kwargs)
         # input device
         self._input_device = input_device
         # Here is the number of frames that have to pass without
         # detecing a voice before we respond
         chunklength = input_device._input_chunksize / input_device._input_rate
+        timeout = kwargs.get('timeout', 1)
+        minimum_capture = kwargs.get('minimum_capture', 0.5)
         self._timeout = round(timeout / chunklength)
         # Mimimum capture frames is the smallest number of frames that will
         # be recognized as audio.
@@ -466,7 +470,7 @@ def cleantext(self, text):
     @staticmethod
     def getcontractions(phrase):
         return [phrase]
 
     def match_phrase(self, phrase, choices):
         # If phrase is a list, convert to a string
         # (otherwise the "split" below throws an error)
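
The base SRPlugin.recognize_speaker above just transcribes and
returns speaker=None, while mic.py calls it as
recognize_speaker(f, stt_engine). A rough sketch of what a real
engine such as VOSK_sr might do (the embedding step and the SPEAKERS
table are hypothetical, not code from this commit):

    import math

    from naomi import plugin


    def cosine_similarity(a, b):
        # Cosine similarity between two equal-length vectors.
        dot = sum(x * y for x, y in zip(a, b))
        norm = (math.sqrt(sum(x * x for x in a))
                * math.sqrt(sum(y * y for y in b)))
        return dot / norm if norm else 0.0


    class ExampleSRPlugin(plugin.SRPlugin):
        # Enrolled speakers: name -> embedding vector, computed from
        # training recordings. Empty here; a real engine fills it in.
        SPEAKERS = {}

        def extract_embedding(self, fp):
            # Hypothetical: run the audio through a speaker model
            # (e.g. an x-vector extractor) and return a vector.
            raise NotImplementedError

        def recognize_speaker(self, fp, stt_engine):
            # Transcribe with the supplied STT engine, as mic.py does.
            utterance = [
                word.upper() for word in stt_engine.transcribe(fp)
            ]
            speaker, confidence = None, 0
            try:
                embedding = self.extract_embedding(fp)
                for name, enrolled in self.SPEAKERS.items():
                    score = cosine_similarity(embedding, enrolled)
                    if score > confidence:
                        speaker, confidence = name, score
            except NotImplementedError:
                pass
            return {
                'speaker': speaker,
                'confidence': confidence,
                'utterance': utterance
            }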
4 changes: 2 additions & 2 deletions plugins/speechhandler/shutdownplugin/shutdown.py
@@ -1,7 +1,6 @@
 # -*- coding: utf-8 -*-
 import random
 from naomi import plugin
-from naomi import profile
 
 
 class ShutdownPlugin(plugin.SpeechHandlerPlugin):
@@ -12,6 +11,7 @@ def intents(self):
             'en-US': {
                 'templates': [
                     "SHUTDOWN",
+                    "SHUT DOWN",
                     "TURN YOURSELF OFF"
                 ]
             },
@@ -41,7 +41,7 @@ def handle(self, intent, mic):
         text -- user-input, typically transcribed speech
         mic -- used to interact with the user (for both input and output)
         """
-        name = profile.get_profile_var(['first_name'], '')
+        name = intent.get('user', '')
 
         messages = [
             self.gettext("I'm shutting down."),