Skip to content

Commit

Permalink
2.2.8 (#615)
Browse files Browse the repository at this point in the history
* Bug fixes

* Fix bug in filter training utterances
  • Loading branch information
mmcauliffe committed Apr 17, 2023
1 parent 2f5cdd0 commit 2569caa
Show file tree
Hide file tree
Showing 28 changed files with 284 additions and 341 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ RUN useradd -ms /bin/bash mfauser
RUN chown -R mfauser /mfa
RUN chown -R mfauser /env
USER mfauser
ENV MFA_ROOT_ENVIRONMENT_VARIABLE=/mfa
ENV MFA_ROOT_DIR=/mfa
RUN conda run -p /env mfa server init

RUN echo "source activate /env && mfa server start" > ~/.bashrc
Expand Down
7 changes: 7 additions & 0 deletions docs/source/changelog/changelog_2.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@
2.2 Changelog
*************

2.2.8
=====
- Fixed a bug introduced in 2.2.4 that made segments overlap with silence intervals when using textgrid cleanup
- Changed databases to always use the root MFA directory rather than relying on temporary directories, to make it more consistent where database files and sockets will get placed. This root directory can be changed via the environment variable :code:`MFA_ROOT_DIR`
- Optimized training graph and collecting alignments after changes to how unknown words were represented internally
- Changed feature generation to use piped audio loaded via PySoundFile rather than via calls to sox/ffmpeg directly

2.2.7
=====

Expand Down
4 changes: 2 additions & 2 deletions docs/source/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ A simple Dockerfile for installing MFA would be:
RUN chown -R mfauser /mfa
RUN chown -R mfauser /env
USER mfauser
ENV MFA_ROOT_ENVIRONMENT_VARIABLE=/mfa
ENV MFA_ROOT_DIR=/mfa
RUN conda run -p /env mfa server init
RUN echo "source activate /env && mfa server start" > ~/.bashrc
Expand All @@ -84,7 +84,7 @@ Crucially, note the useradd and subsequent user commands:
RUN chown -R mfauser /mfa
RUN chown -R mfauser /env
USER mfauser
ENV MFA_ROOT_ENVIRONMENT_VARIABLE=/mfa
ENV MFA_ROOT_DIR=/mfa
RUN conda run -p /env mfa server init
These lines ensure that the database is initialized without using Docker's default root user, avoiding a permissions error thrown by PostgreSQL.
Expand Down
3 changes: 3 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,18 @@ dependencies:
- python>=3.8
- numpy
- librosa
- pysoundfile
- tqdm
- requests
- pyyaml
- dataclassy
- kaldi=*=*cpu*
- sox
- ffmpeg
- scipy
- pynini
- openfst
- scikit-learn
- hdbscan
- baumwelch
- ngram
Expand Down
32 changes: 31 additions & 1 deletion montreal_forced_aligner/acoustic_modeling/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,14 @@
from montreal_forced_aligner.abc import KaldiFunction, ModelExporterMixin, TopLevelMfaWorker
from montreal_forced_aligner.config import GLOBAL_CONFIG
from montreal_forced_aligner.data import MfaArguments, WorkflowType
from montreal_forced_aligner.db import CorpusWorkflow, Dictionary, Job
from montreal_forced_aligner.db import (
CorpusWorkflow,
Dictionary,
Job,
Speaker,
Utterance,
bulk_update,
)
from montreal_forced_aligner.exceptions import ConfigError, KaldiProcessingError
from montreal_forced_aligner.helper import load_configuration, mfa_open, parse_old_features
from montreal_forced_aligner.models import AcousticModel, DictionaryModel
Expand Down Expand Up @@ -333,6 +340,28 @@ def setup_trainers(self):
wf.current = True
session.commit()

def filter_training_utterances(self):
logger.info("Filtering utterances with only unknown words...")
with self.session() as session:
dictionaries = session.query(Dictionary)
for d in dictionaries:
update_mapping = []
word_mapping = d.word_mapping
utterances = (
session.query(Utterance.id, Utterance.normalized_text)
.join(Utterance.speaker)
.filter(Utterance.ignored == False) # noqa
.filter(Speaker.dictionary_id == d.id)
)
for u_id, text in utterances:
words = text.split()
if any(x in word_mapping for x in words):
continue
update_mapping.append({"id": u_id, "ignored": True})
if update_mapping:
bulk_update(session, Utterance, update_mapping)
session.commit()

def setup(self) -> None:
"""Setup for acoustic model training"""
super().setup()
Expand All @@ -342,6 +371,7 @@ def setup(self) -> None:
try:
self.load_corpus()
self.setup_trainers()
self.filter_training_utterances()
except Exception as e:
if isinstance(e, KaldiProcessingError):
log_kaldi_errors(e.error_logs)
Expand Down
82 changes: 59 additions & 23 deletions montreal_forced_aligner/alignment/multiprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import pynini
import pywrapfst
import sqlalchemy
from pynini.lib import rewrite
from sqlalchemy.orm import Session, joinedload, selectinload, subqueryload

from montreal_forced_aligner.corpus.features import (
Expand Down Expand Up @@ -53,7 +54,7 @@
Word,
)
from montreal_forced_aligner.exceptions import AlignmentExportError, FeatureGenerationError
from montreal_forced_aligner.helper import mfa_open, split_phone_position
from montreal_forced_aligner.helper import align_pronunciations, mfa_open, split_phone_position
from montreal_forced_aligner.textgrid import (
construct_output_path,
construct_output_tiers,
Expand Down Expand Up @@ -95,6 +96,8 @@
"GeneratePronunciationsFunction",
]

logger = logging.getLogger("mfa")


def phones_to_prons(
text: str,
Expand All @@ -104,17 +107,23 @@ def phones_to_prons(
phone_symbol_table: pywrapfst.SymbolTableView,
optional_silence_phone: str,
transcription: bool = False,
clitic_marker=None,
clitic_marker: str = None,
oov_word: str = None,
use_g2p: bool = False,
):
if "<space>" in text:
if use_g2p:
words = [x.replace(" ", "") for x in text.split("<space>")]
else:
words = text.split()
word_begin = "#1"
word_end = "#2"
word_begin_symbol = phone_symbol_table.find(word_begin)
word_end_symbol = phone_symbol_table.find(word_end)
acceptor = pynini.accep(text, token_type=word_symbol_table)
if use_g2p:
kaldi_text = text
else:
kaldi_text = " ".join([x if word_symbol_table.member(x) else oov_word for x in words])
acceptor = pynini.accep(kaldi_text, token_type=word_symbol_table)
phone_to_word = pynini.compose(align_lexicon_fst, acceptor)
phone_fst = pynini.Fst()
current_state = phone_fst.add_state()
Expand Down Expand Up @@ -183,25 +192,27 @@ def phones_to_prons(
try:
path_string = pynini.shortestpath(lattice).project("input").string(phone_symbol_table)
except Exception:
logging.debug("For the text and intervals:")
logging.debug(text)
logging.debug([x.label for x in intervals])
logging.debug("There was an issue composing word and phone FSTs")
logging.debug("PHONE FST:")
logger.debug("For the text and intervals:")
logger.debug(text)
logger.debug(kaldi_text)
logger.debug([x.label for x in intervals])
logger.debug("There was an issue composing word and phone FSTs")
logger.debug("PHONE FST:")
phone_fst.set_input_symbols(phone_symbol_table)
phone_fst.set_output_symbols(phone_symbol_table)
logging.debug(phone_fst)
logging.debug("PHONE_TO_WORD FST:")
logger.debug(phone_fst)
logger.debug("PHONE_TO_WORD FST:")
phone_to_word.set_input_symbols(phone_symbol_table)
phone_to_word.set_output_symbols(word_symbol_table)
logging.debug(phone_to_word)
logger.debug(phone_to_word)
raise
path_string = path_string.replace(f"{word_end} {word_begin}", word_begin)
path_string = path_string.replace(f"{word_end}", word_begin)
path_string = re.sub(f"^{word_begin} ", "", path_string)
word_splits = re.split(rf" ?{word_begin} ?", path_string)
word_splits = [x.split() for x in word_splits if x != optional_silence_phone and x]

return list(zip(words, word_splits))
word_splits = [x.split() for x in word_splits if x != optional_silence_phone]
pronunciations = align_pronunciations(words, list(zip(words, word_splits)), oov_word)
return pronunciations


@dataclass
Expand Down Expand Up @@ -568,20 +579,27 @@ def _run(self) -> typing.Generator[typing.Tuple[int, int]]:
workflow.working_directory, f"{self.job_name}.ha_out_disambig.temp"
)
text_int_paths = job.per_dictionary_text_int_scp_paths
batch_size = 1000
if self.use_g2p:
import pynini
from pynini.lib import rewrite

from montreal_forced_aligner.g2p.generator import threshold_lattice_to_dfa

for d in job.dictionaries:
log_file.write(f"Compiling graphs for {d.name} ({d.id})...\n")
fst = pynini.Fst.read(d.lexicon_fst_path)
token_type = pynini.SymbolTable.read_text(d.grapheme_symbol_table_path)
words = d.word_mapping
if self.use_g2p:
token_type = pywrapfst.SymbolTable.read_text(d.grapheme_symbol_table_path)
text_column = Utterance.normalized_character_text
else:
token_type = pywrapfst.SymbolTable.read_text(d.words_symbol_path)
text_column = Utterance.normalized_text
fst.invert()
utterances = (
session.query(Utterance.kaldi_id, Utterance.normalized_character_text)
session.query(Utterance.kaldi_id, text_column)
.join(Utterance.speaker)
.filter(Utterance.ignored == False) # noqa
.filter(Utterance.normalized_character_text != "")
.filter(text_column != "")
.filter(Utterance.job_id == self.job_name)
.filter(Speaker.dictionary_id == d.id)
.order_by(Utterance.kaldi_id)
Expand All @@ -593,8 +611,19 @@ def _run(self) -> typing.Generator[typing.Tuple[int, int]]:
with mfa_open(fst_ark_path, "wb") as fst_output_file:
for utt_id, full_text in utterances:
try:
lattice = rewrite.rewrite_lattice(full_text, fst, token_type)
lattice = threshold_lattice_to_dfa(lattice, 2.0)
if self.use_g2p:
lattice = rewrite.rewrite_lattice(full_text, fst, token_type)
lattice = threshold_lattice_to_dfa(lattice, 2.0)
else:
text = " ".join(
[
x if x in words else d.oov_word
for x in full_text.split()
]
)
a = pynini.accep(text, token_type=token_type)
lattice = rewrite.rewrite_lattice(a, fst)
lattice.invert()
input = lattice.write_to_string()
except pynini.lib.rewrite.Error:
log_file.write(f'Error composing "{full_text}"\n')
Expand Down Expand Up @@ -703,6 +732,7 @@ def _run(self) -> typing.Generator[typing.Tuple[int, int]]:

else:
for d in job.dictionaries:
log_file.write(f"Compiling graphs for {d}")
fst_ark_path = job.construct_path(
workflow.working_directory, "fsts", "ark", d.id
)
Expand All @@ -711,6 +741,7 @@ def _run(self) -> typing.Generator[typing.Tuple[int, int]]:
[
thirdparty_binary("compile-train-graphs"),
f"--read-disambig-syms={d.disambiguation_symbols_int_path}",
f"--batch-size={batch_size}",
self.tree_path,
self.model_path,
d.lexicon_fst_path,
Expand All @@ -723,6 +754,7 @@ def _run(self) -> typing.Generator[typing.Tuple[int, int]]:
)
for line in proc.stderr:
log_file.write(line)
log_file.flush()
m = self.progress_pattern.match(line.strip())
if m:
yield int(m.group("succeeded")), int(m.group("failed"))
Expand Down Expand Up @@ -1766,6 +1798,7 @@ def _run(self) -> typing.Generator[typing.Tuple[int, int, str]]:
self.word_symbol_table,
self.phone_symbol_table,
self.optional_silence_phone,
oov_word=self.oov_word,
)
if d.position_dependent_phones:
word_pronunciations = [
Expand Down Expand Up @@ -1837,7 +1870,7 @@ def compile_information_func(
if decode_error_match:
data["unaligned"].append(decode_error_match.group("utt"))
continue
log_like_match = re.match(log_like_pattern, line)
log_like_match = re.search(log_like_pattern, line)
if log_like_match:
log_like = log_like_match.group("log_like")
frames = log_like_match.group("frames")
Expand Down Expand Up @@ -1923,6 +1956,7 @@ def cleanup_intervals(
self.phone_symbol_table,
self.optional_silence_phone,
self.transcription,
oov_word=self.oov_word,
)
actual_phone_intervals = []
actual_word_intervals = []
Expand Down Expand Up @@ -2018,6 +2052,8 @@ def cleanup_g2p_intervals(
self.phone_symbol_table,
self.optional_silence_phone,
clitic_marker=self.clitic_marker,
oov_word=self.oov_word,
use_g2p=True,
)
actual_phone_intervals = []
actual_word_intervals = []
Expand Down
3 changes: 2 additions & 1 deletion montreal_forced_aligner/alignment/pretrained.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,10 +296,11 @@ def align_one_utterance(self, utterance: Utterance, session: Session) -> None:
if not sox_string:
sox_string = utterance.file.sound_file.sound_file_path
text_int_path = self.working_directory.joinpath("text.int")
word_mapping = self.word_mapping(utterance.speaker.dictionary_id)
with mfa_open(text_int_path, "w") as f:
normalized_text_int = " ".join(
[
str(self.word_mapping(utterance.speaker.dictionary_id)[x])
str(word_mapping[x]) if x in word_mapping else str(word_mapping[self.oov_word])
for x in utterance.normalized_text.split()
]
)
Expand Down
8 changes: 1 addition & 7 deletions montreal_forced_aligner/command_line/mfa.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

import atexit
import multiprocessing as mp
import os
import sys
import time
import warnings
Expand Down Expand Up @@ -34,11 +33,7 @@
validate_corpus_cli,
validate_dictionary_cli,
)
from montreal_forced_aligner.config import (
GLOBAL_CONFIG,
MFA_PROFILE_VARIABLE,
update_command_history,
)
from montreal_forced_aligner.config import GLOBAL_CONFIG, update_command_history
from montreal_forced_aligner.utils import check_third_party

BEGIN = time.time()
Expand Down Expand Up @@ -118,7 +113,6 @@ def mfa_cli(ctx: click.Context) -> None:
auto_server = False
run_check = True
if ctx.invoked_subcommand == "anchor":
os.environ[MFA_PROFILE_VARIABLE] = "anchor"

GLOBAL_CONFIG.current_profile.clean = False
GLOBAL_CONFIG.save()
Expand Down
5 changes: 5 additions & 0 deletions montreal_forced_aligner/command_line/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import rich_click as click

from montreal_forced_aligner.command_line.utils import (
common_options,
delete_server,
initialize_server,
start_server,
Expand All @@ -28,6 +29,7 @@ def server_cli():
default=None,
)
@click.help_option("-h", "--help")
@common_options
@click.pass_context
def init_cli(context, **kwargs):
if kwargs.get("profile", None) is not None:
Expand All @@ -46,6 +48,7 @@ def init_cli(context, **kwargs):
default=None,
)
@click.help_option("-h", "--help")
@common_options
@click.pass_context
def start_cli(context, **kwargs):
if kwargs.get("profile", None) is not None:
Expand All @@ -71,6 +74,7 @@ def start_cli(context, **kwargs):
default="fast",
)
@click.help_option("-h", "--help")
@common_options
@click.pass_context
def stop_cli(context, **kwargs):
if kwargs.get("profile", None) is not None:
Expand All @@ -89,6 +93,7 @@ def stop_cli(context, **kwargs):
default=None,
)
@click.help_option("-h", "--help")
@common_options
@click.pass_context
def delete_cli(context, **kwargs):
if kwargs.get("profile", None) is not None:
Expand Down
Loading

0 comments on commit 2569caa

Please sign in to comment.