Various bug fixes (#773)
mmcauliffe committed Mar 5, 2024
1 parent 4657517 commit 587ce0f
Showing 16 changed files with 134 additions and 33 deletions.
9 changes: 9 additions & 0 deletions docs/source/changelog/changelog_3.0.rst
@@ -5,6 +5,15 @@
3.0 Changelog
*************

3.0.2
=====

- Added support for :code:`--phone_groups_path` and :code:`--rules_path` to :ref:`validating_data`
- Added support for speechbrain 1.0 release
- Allow alignment with older models that don't have a dedicated speaker-independent :code:`.alimdl` model
- Fixed a bug in loading lexicon compilers
- Updated default feature configuration to remove dithering and use energy_floor=1.0, following `torchaudio's implementation <https://github.com/pytorch/audio/issues/371>`_

3.0.1
=====

2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -286,7 +286,7 @@
# The reST default role (used for this markup: `text`) to use for all
# documents.
#
default_role = "autolink"
default_role = "code"

# If true, '()' will be appended to :func: etc. cross-reference text.
#
2 changes: 1 addition & 1 deletion docs/source/installation.rst
@@ -52,7 +52,7 @@ If you need to use an older version of MFA, you can install it via:
More stable key versions:

* Stable 3.0 release: :code:`conda install -c conda-forge montreal-forced-aligner=3.0.0`
* Stable 3.0 release: :code:`conda install -c conda-forge montreal-forced-aligner=3.0.2`
* Stable 2.2 release: :code:`conda install -c conda-forge montreal-forced-aligner=2.2.17 openfst=1.8.2 kaldi=5.5.1068`
* Stable 2.1 release: :code:`conda install -c conda-forge montreal-forced-aligner=2.1.7 openfst=1.8.2 kaldi=5.5.1068`
* Stable 2.0 release: :code:`conda install -c conda-forge montreal-forced-aligner=2.0.6 openfst=1.8.2 kaldi=5.5.1068`
8 changes: 8 additions & 0 deletions docs/source/user_guide/configuration/acoustic_modeling.rst
@@ -154,6 +154,14 @@ The below configuration file shows the equivalent of the current 2.0 training re
max_gaussians: 15000
num_iterations: 35
- sat:
subset: 20000
num_leaves: 2500
max_gaussians: 15000
power: 0.2
silence_weight: 0.0
fmllr_update_type: "full"
- sat:
subset: 50000
num_leaves: 4200
59 changes: 51 additions & 8 deletions docs/source/user_guide/corpus_structure.rst
@@ -14,6 +14,12 @@ Prior to running the aligner, make sure the following are set up:

3. Orthographic annotations in .lab files for individual sound files (:ref:`prosodylab_format`)
or in TextGrid intervals for longer sound files (:ref:`textgrid_format`).
.. note::

A collection of preprocessing scripts for converting corpora in various other formats is available in the :xref:`mfa_reorg_scripts` and :xref:`corpus_creation_scripts`.

Specifying speakers
===================

The sound files and the orthographic annotations should be contained in one directory structured as follows::

@@ -36,10 +42,48 @@ The sound files and the orthographic annotations should be contained in one dire
| --- ...


.. _speaker_characters_flag:

.. note::
Using :code:`--speaker_characters` flag
---------------------------------------

.. warning::

In general I would not recommend using this flag; instead, stick to the default behavior of per-speaker directories for :ref:`prosodylab_format` and per-speaker tiers for :ref:`textgrid_format`.

MFA also has limited support for a flat directory structure where speaker information is encoded in the file name, like::

+-- prosodylab_corpus_directory
| --- speaker1_recording1.wav
| --- speaker1_recording1.lab
| --- speaker1_recording2.wav
| --- speaker1_recording2.lab
| --- speaker2_recording3.wav
| --- speaker2_recording3.lab
| --- ...

By specifying :code:`--speaker_characters 8`, the above files will be assigned "speaker1" and "speaker2" as their speakers, taken from the first 8 characters of their file names. Note that because this relies on a fixed number of initial characters, it will not behave correctly if your speaker codes have varying lengths.
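
As a rough sketch (this is not MFA's actual parsing code), the prefix-based assignment amounts to:

.. code-block:: python

   def speaker_from_prefix(file_name: str, num_characters: int) -> str:
       """Take the first ``num_characters`` characters of the file name as the speaker."""
       return file_name[:num_characters]

   speaker_from_prefix("speaker1_recording1.wav", 8)  # "speaker1"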

For historical reasons, MFA also supports reading speaker information from files like the following::

+-- prosodylab_corpus_directory
| --- experiment1_speaker1_recording1.wav
| --- experiment1_speaker1_recording1.lab
| --- experiment1_speaker1_recording2.wav
| --- experiment1_speaker1_recording2.lab
| --- experiment1_speaker2_recording3.wav
| --- experiment1_speaker2_recording3.lab
| --- ...

By specifying :code:`--speaker_characters prosodylab`, the above files will be assigned "speaker1" and "speaker2" as their speakers, taken from the second underscore-delimited field of the file name.
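
Again as a sketch (not MFA's actual parsing code), the prosodylab-style assignment amounts to:

.. code-block:: python

   def speaker_from_prosodylab(file_name: str) -> str:
       """Take the second underscore-delimited field of the file name as the speaker."""
       return file_name.split("_")[1]

   speaker_from_prosodylab("experiment1_speaker1_recording1.wav")  # "speaker1"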

.. _single_speaker_flag:

Using :code:`--single_speaker` flag
-----------------------------------

MFA uses multiple jobs to process utterances in parallel. The default setup assigns utterances to jobs based on speakers, so all utterances from ``speaker1`` would be assigned to Job 1, all utterances from ``speaker2`` would be assigned to Job 2, and so on. However, if there is only one speaker in the corpus (say if you're generating alignments for a Text-to-Speech corpus), then all files would be assigned to Job 1 and only one process would be used. By using the :code:`--single_speaker` flag, MFA will distribute utterances across jobs equally and it will skip any speaker adaptation steps.
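
To illustrate the difference, here is a minimal sketch (not MFA's internal scheduling code; the ``speaker`` attribute is a stand-in):

.. code-block:: python

   from itertools import cycle

   def assign_jobs(utterances, num_jobs, single_speaker=False):
       jobs = [[] for _ in range(num_jobs)]
       if single_speaker:
           # --single_speaker: spread utterances evenly, ignoring speaker identity
           for job, utterance in zip(cycle(jobs), utterances):
               job.append(utterance)
       else:
           # Default: keep all of a speaker's utterances in the same job
           speakers = sorted({u.speaker for u in utterances})
           for utterance in utterances:
               jobs[speakers.index(utterance.speaker) % num_jobs].append(utterance)
       return jobs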

A collection of preprocessing scripts to get various corpora of other formats is available in the :xref:`mfa_reorg_scripts` and :xref:`corpus_creation_scripts`.

Transcription file formats
==========================
@@ -57,10 +101,9 @@ transcription files must have the same name. For example, if you have ``givrep_1
its transcription should be in ``givrep_1027_2_1.lab`` (which is just a
text file with the .lab extension).

.. note:: If you have transcriptions in a
tab-separated text file (or an Excel file, which can be saved as one),
you can generate .lab files from it using the relabel function of relabel_clean.py.
The relabel_clean.py script is currently in the prosodylab.alignertools repository on GitHub.
.. note::

If you have transcriptions in a tab-separated text file (or an Excel file, which can be saved as one), you can generate .lab files from it using the relabel function of relabel_clean.py. The `relabel_clean.py script <https://github.com/prosodylab/prosodylab.alignertools/blob/master/relabel_clean.py>`_ is currently in the `prosodylab.alignertools repository on GitHub <https://github.com/prosodylab/prosodylab.alignertools>`_.
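
If you would rather not depend on that script, a minimal sketch of the same idea (assuming a headerless two-column tab-separated file of file name and transcription) is:

.. code-block:: python

   import csv

   # Each row: file name (without extension), transcription
   with open("transcriptions.tsv", newline="", encoding="utf8") as tsv_file:
       for name, transcription in csv.reader(tsv_file, delimiter="\t"):
           with open(f"{name}.lab", "w", encoding="utf8") as lab_file:
               lab_file.write(transcription.strip())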

If no ``.lab`` file is found, then the aligner will look for any matching ``.txt`` files and use those.

@@ -69,7 +112,7 @@ files are separated into subdirectories based on their speaker (with one
speaker per file).

An alternative way to specify which speaker says which
segment is to use the ``-s`` flag with some number of characters of the file name as the speaker identifier.
segment is to use the :ref:`speaker_characters_flag` with some number of characters of the file name as the speaker identifier.

The output from aligning this format of data will be TextGrids that have a tier
for words and a tier for phones.
@@ -88,7 +131,7 @@ of speech.
:align: center
:alt: Input TextGrid in Praat with intervals for each utterance and a single tier for a speaker

If the ``-s`` flag is specified, the tier names will not be used as speaker names, and instead the first X characters
If the :ref:`speaker_characters_flag` is used, the tier names will not be used as speaker names, and instead the first X characters
specified by the flag will be used as the speaker name.

By default, each tier corresponds to a speaker (speaker "237" in the above example), so it is possible to
4 changes: 2 additions & 2 deletions docs/source/user_guide/troubleshooting.rst
@@ -92,9 +92,9 @@ Script example
else:
resampled_file = path.replace(ext, f'.wav')
if sys.platform == 'win32' or ext in {'.opus', '.ogg'}:
command = ['ffmpeg', '-nostdin', '-hide_banner', '-loglevel', 'error', '-nostats', '-i', path '-acodec' 'pcm_s16le' '-f' 'wav', '-ar', '16000', resampled_file]
command = ['ffmpeg', '-nostdin', '-hide_banner', '-loglevel', 'error', '-nostats', '-i', path, '-acodec', 'pcm_s16le', '-f', 'wav', '-ar', '16000', resampled_file]
else:
command = ['sox', path, '-t', 'wav' '-r', '16000', '-b', '16', resampled_file]
command = ['sox', path, '-t', 'wav', '-r', '16000', '-b', '16', resampled_file]
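# (The corrected commands add the commas that were missing above; without them,
# Python either raises a SyntaxError or silently concatenates adjacent string
# literals into a single malformed argument.)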
subprocess.check_call(command)
os.remove(path)
os.rename(resampled_file, path)
6 changes: 0 additions & 6 deletions montreal_forced_aligner/alignment/base.py
@@ -359,12 +359,6 @@ def align(self, workflow_name=None) -> None:
try:
self.uses_speaker_adaptation = False

if (
acoustic_model is not None
and acoustic_model.meta["features"]["uses_speaker_adaptation"]
and perform_speaker_adaptation
):
assert self.alignment_model_path.suffix == ".alimdl"
self.compile_train_graphs()

logger.info("Performing first-pass alignment...")
29 changes: 27 additions & 2 deletions montreal_forced_aligner/command_line/validate.py
@@ -62,9 +62,24 @@
)
@click.option(
"--phone_set",
help="Enable extra decision tree modeling based on the phone set.",
"phone_set_type",
help="DEPRECATED, please use --phone_groups_path to specify phone groups instead.",
default="UNKNOWN",
type=click.Choice(["UNKNOWN", "AUTO", "IPA", "ARPA", "PINYIN"]),
type=click.Choice(["UNKNOWN", "AUTO", "MFA", "IPA", "ARPA", "PINYIN"]),
)
@click.option(
"--phone_groups_path",
"phone_groups_path",
help="Path to yaml file defining phone groups. See "
"https://github.com/MontrealCorpusTools/mfa-models/tree/main/config/acoustic/phone_groups for examples.",
type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path),
)
@click.option(
"--rules_path",
"rules_path",
help="Path to yaml file defining phonological rules. See "
"https://github.com/MontrealCorpusTools/mfa-models/tree/main/config/acoustic/rules for examples.",
type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path),
)
@click.option(
"--ignore_acoustics",
@@ -90,11 +105,21 @@ def validate_corpus_cli(context, **kwargs) -> None:
if kwargs.get("profile", None) is not None:
config.profile = kwargs.pop("profile")
config.update_configuration(kwargs)
kwargs["USE_THREADING"] = False

config_path = kwargs.get("config_path", None)
corpus_directory = kwargs["corpus_directory"].absolute()
dictionary_path = kwargs["dictionary_path"]
acoustic_model_path = kwargs.get("acoustic_model_path", None)
if kwargs.get("phone_set_type", "UNKNOWN") != "UNKNOWN":
import warnings

warnings.warn(
"The flag `--phone_set` is deprecated, please use a yaml file for phone groups passed to "
"`--phone_groups_path`. See "
"https://github.com/MontrealCorpusTools/mfa-models/tree/main/config/acoustic/phone_groups "
"for example phone group configurations that have been used in training MFA models."
)
if acoustic_model_path:
validator = PretrainedValidator(
corpus_directory=corpus_directory,
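
A hypothetical invocation of the validator using the new options (all paths are placeholders):

    mfa validate /path/to/corpus /path/to/lexicon.dict --phone_groups_path phone_groups.yaml --rules_path rules.yaml
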
10 changes: 8 additions & 2 deletions montreal_forced_aligner/corpus/features.py
@@ -602,8 +602,8 @@ def __init__(
sample_frequency: int = 16000,
allow_downsample: bool = True,
allow_upsample: bool = True,
dither: float = 1.0,
energy_floor: float = 0,
dither: float = 0.0,
energy_floor: float = 1.0,
num_coefficients: int = 13,
num_mel_bins: int = 23,
cepstral_lifter: float = 22,
@@ -798,6 +798,12 @@ def mfcc_options(self) -> MetaDict:
"allow_upsample": self.allow_upsample,
"snip_edges": self.snip_edges,
}
options.update(
{
"dither": 0.0,
"energy_floor": 1.0,
}
)
options.update(
{
"dither": self.dither,
7 changes: 6 additions & 1 deletion montreal_forced_aligner/diarization/multiprocessing.py
@@ -51,7 +51,12 @@
torch_logger = logging.getLogger("speechbrain.utils.train_logger")
torch_logger.setLevel(logging.ERROR)
import torch
from speechbrain.pretrained import EncoderClassifier, SpeakerRecognition

try:
from speechbrain.pretrained import EncoderClassifier, SpeakerRecognition
except ImportError: # speechbrain 1.0
from speechbrain.inference.classifiers import EncoderClassifier
from speechbrain.inference.speaker import SpeakerRecognition
FOUND_SPEECHBRAIN = True
except (ImportError, OSError):
FOUND_SPEECHBRAIN = False
7 changes: 6 additions & 1 deletion montreal_forced_aligner/diarization/speaker_diarizer.py
@@ -81,7 +81,12 @@
torch_logger = logging.getLogger("speechbrain.utils.train_logger")
torch_logger.setLevel(logging.ERROR)
import torch
from speechbrain.pretrained import EncoderClassifier, SpeakerRecognition

try:
from speechbrain.pretrained import EncoderClassifier, SpeakerRecognition
except ImportError: # speechbrain 1.0
from speechbrain.inference.classifiers import EncoderClassifier
from speechbrain.inference.speaker import SpeakerRecognition
from speechbrain.utils.metric_stats import EER

FOUND_SPEECHBRAIN = True
2 changes: 1 addition & 1 deletion montreal_forced_aligner/dictionary/mixins.py
@@ -18,7 +18,7 @@
if TYPE_CHECKING:
from montreal_forced_aligner.abc import MetaDict

DEFAULT_PUNCTUATION = list(r'、。।,?!!@<>→"”()“„–,.:;—¿?¡:)!\\&%#*~【】,…‥「」『』〝〟″⟨⟩♪・‹›«»~′$+=‘')
DEFAULT_PUNCTUATION = list(r'、。।,?!!@<>→"”()“„–,.:;—¿?¡:)!\\&%#*~【】,…‥「」『』〝〟″⟨⟩♪・‹›«»~′$+=‘۔')

DEFAULT_WORD_BREAK_MARKERS = list(r'?!!(),,.:;¡¿?“„"”&~%#—…‥、。【】$+=〝〟″‹›«»・⟨⟩「」『』')

2 changes: 1 addition & 1 deletion montreal_forced_aligner/dictionary/multispeaker.py
@@ -1444,7 +1444,7 @@ def write_lexicon_information(self, write_disambiguation: Optional[bool] = False
def load_lexicon_compilers(self):
with self.session() as session:
self.lexicon_compilers = {}
dictionaries = session.get(Dictionary)
dictionaries = session.query(Dictionary)
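# Session.get() fetches a single row by primary key; query() is needed to iterate over all dictionaries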
for d in dictionaries:
self.lexicon_compilers[d.id] = d.lexicon_compiler

8 changes: 4 additions & 4 deletions montreal_forced_aligner/models.py
@@ -535,15 +535,15 @@ def mfcc_options(self) -> MetaDict:
"sample_frequency": self._meta["features"].get("sample_frequency", 16000),
"frame_shift": self._meta["features"].get("frame_shift", 10),
"frame_length": self._meta["features"].get("frame_length", 25),
"dither": 0, # self._meta["features"].get("dither", 1),
"dither": self._meta["features"].get("dither", 0.0),
"preemphasis_coefficient": self._meta["features"].get("preemphasis_coefficient", 0.97),
"snip_edges": self._meta["features"].get("snip_edges", True),
"num_mel_bins": self._meta["features"].get("num_mel_bins", 23),
"low_frequency": self._meta["features"].get("low_frequency", 20),
"high_frequency": self._meta["features"].get("high_frequency", 7800),
"num_coefficients": self._meta["features"].get("num_coefficients", 13),
"use_energy": self._meta["features"].get("use_energy", False),
"energy_floor": self._meta["features"].get("energy_floor", 0.0),
"energy_floor": self._meta["features"].get("energy_floor", 1.0),
"raw_energy": self._meta["features"].get("raw_energy", True),
"cepstral_lifter": self._meta["features"].get("cepstral_lifter", 22),
}
@@ -882,8 +882,8 @@ def mfcc_options(self) -> MetaDict:
"""Parameters to use in computing MFCC features."""
return {
"use_energy": self._meta["features"].get("use_energy", False),
"dither": self._meta["features"].get("dither", 1),
"energy_floor": self._meta["features"].get("energy_floor", 0),
"dither": self._meta["features"].get("dither", 0.0),
"energy_floor": self._meta["features"].get("energy_floor", 1.0),
"num_coefficients": self._meta["features"].get("num_coefficients", 13),
"num_mel_bins": self._meta["features"].get("num_mel_bins", 23),
"cepstral_lifter": self._meta["features"].get("cepstral_lifter", 22),
6 changes: 4 additions & 2 deletions montreal_forced_aligner/online/alignment.py
@@ -54,7 +54,9 @@ def align_utterance_online(
if not lexicon_compiler.word_table.member(w):
pron = rewriter(w)
if pron:
lexicon_compiler.add_pronunciation(KalpyPronunciation(w, pron[0]))
lexicon_compiler.add_pronunciation(
KalpyPronunciation(w, pron[0], None, None, None, None, None)
)

else:
text, pronunciation_form = tokenizer(text)
Expand All @@ -70,7 +72,7 @@ def align_utterance_online(
g2p_cache[w] = pron[0]
if w in g2p_cache and not lexicon_compiler.word_table.member(norm_w):
lexicon_compiler.add_pronunciation(
KalpyPronunciation(norm_w, g2p_cache[w])
KalpyPronunciation(norm_w, g2p_cache[w], None, None, None, None, None)
)

graph_compiler = TrainingGraphCompiler(
6 changes: 5 additions & 1 deletion montreal_forced_aligner/vad/multiprocessing.py
@@ -41,7 +41,11 @@
torch_logger = logging.getLogger("speechbrain.utils.train_logger")
torch_logger.setLevel(logging.ERROR)
import torch
from speechbrain.pretrained import VAD

try:
from speechbrain.pretrained import VAD
except ImportError: # speechbrain 1.0
from speechbrain.inference.VAD import VAD

FOUND_SPEECHBRAIN = True
except (ImportError, OSError):
