Bug fixes (#638)

MontrealCorpusTools · May 17, 2023 · ab58ca6 · ab58ca6
1 parent 60c32ca
commit ab58ca6
Show file tree

Hide file tree

Showing 13 changed files with 66 additions and 23 deletions.
diff --git a/docs/source/changelog/changelog_2.2.rst b/docs/source/changelog/changelog_2.2.rst
@@ -5,6 +5,16 @@
 2.2 Changelog
 *************
 
+2.2.11
+======
+
+- Make socket updating more general
+- Remove false "no alignments" warning in alignment iterations while training
+- Fixed a bug in adding words to a dictionary
+- Fixed a bug where words marked as "<cutoff>" were being treated as "[bracketed]"
+- Silenced DatabaseError while cleaning up MFA
+- Fixed a crash in fine tuning
+
 2.2.10
 ======
 

diff --git a/montreal_forced_aligner/acoustic_modeling/base.py b/montreal_forced_aligner/acoustic_modeling/base.py
@@ -440,12 +440,12 @@ def align_iteration(self) -> None:
         logger.debug(
             f"Generating alignments for iteration {self.iteration} took {time.time()-begin} seconds"
         )
-        logger.debug(f"Analyzing information for alignment in iteration {self.iteration}...")
-        begin = time.time()
-        self.compile_information()
-        logger.debug(
-            f"Analyzing iteration {self.iteration} alignments took {time.time()-begin} seconds"
-        )
+        # logger.debug(f"Analyzing information for alignment in iteration {self.iteration}...")
+        # begin = time.time()
+        # self.compile_information()
+        # logger.debug(
+        #    f"Analyzing iteration {self.iteration} alignments took {time.time()-begin} seconds"
+        # )
 
     @property
     def initialized(self) -> bool:

diff --git a/montreal_forced_aligner/alignment/mixins.py b/montreal_forced_aligner/alignment/mixins.py
@@ -499,13 +499,16 @@ def compile_information(self) -> None:
         average_logdet_sum = 0
         average_logdet_frames = 0
         beam_too_narrow_count = 0
-        too_short_count = 0
-        for data in alignment_info.values():
+        for k, data in alignment_info.items():
+            logger.debug(f"For job {k}:")
+            logger.debug(f'{len(data["unaligned"])} beam too narrow')
+            logger.debug(f'{data["total_frames"]} total frames')
+            logger.debug(f'{data["log_like"]} average log-likelihood')
             beam_too_narrow_count += len(data["unaligned"])
-            too_short_count += len(data["too_short"])
             avg_like_frames += data["total_frames"]
             avg_like_sum += data["log_like"] * data["total_frames"]
             if "logdet_frames" in data:
+                logger.debug(f'{data["logdet"]} average logdet')
                 average_logdet_frames += data["logdet_frames"]
                 average_logdet_sum += data["logdet"] * data["logdet_frames"]
 
@@ -538,10 +541,6 @@ def compile_information(self) -> None:
                 "No files were aligned, this likely indicates serious problems with the aligner."
             )
         else:
-            if too_short_count:
-                logger.debug(
-                    f"There were {too_short_count} utterances that were too short to be aligned."
-                )
             if beam_too_narrow_count:
                 logger.debug(
                     f"There were {beam_too_narrow_count} utterances that could not be aligned with "

diff --git a/montreal_forced_aligner/alignment/multiprocessing.py b/montreal_forced_aligner/alignment/multiprocessing.py
@@ -1445,7 +1445,6 @@ def _run(self) -> typing.Generator[typing.Tuple[int, float]]:
                         feature_segment_path,
                         "ark:-",
                     ],
-                    stdin=paste_proc.stdout,
                     stderr=log_file,
                     stdout=subprocess.PIPE,
                     env=os.environ,
@@ -1874,28 +1873,33 @@ def compile_information_func(
         r"^WARNING .* Did not successfully decode file (?P<utt>.*?), .*$"
     )
 
-    data = {"unaligned": [], "too_short": [], "log_like": 0, "total_frames": 0}
+    data = {"unaligned": [], "log_like": 0, "total_frames": 0}
     align_log_path = arguments.align_log_path
     if not os.path.exists(align_log_path):
         align_log_path = align_log_path.with_suffix(".fmllr.log")
-    with mfa_open(arguments.log_path, "w"), mfa_open(align_log_path, "r") as f:
+    with mfa_open(arguments.log_path, "w") as log_file, mfa_open(align_log_path, "r") as f:
+        log_file.write(f"Processing {align_log_path}...\n")
         for line in f:
             decode_error_match = re.match(decode_error_pattern, line)
             if decode_error_match:
-                data["unaligned"].append(decode_error_match.group("utt"))
+                utt = decode_error_match.group("utt")
+                data["unaligned"].append(utt)
+                log_file.write(f"Unaligned: {utt}\n")
                 continue
             log_like_match = re.search(log_like_pattern, line)
             if log_like_match:
                 log_like = log_like_match.group("log_like")
                 frames = log_like_match.group("frames")
                 data["log_like"] = float(log_like)
                 data["total_frames"] = int(frames)
+                log_file.write(line)
             m = re.search(average_logdet_pattern, line)
             if m:
                 logdet = float(m.group("logdet"))
                 frames = float(m.group("frames"))
                 data["logdet"] = logdet
                 data["logdet_frames"] = frames
+                log_file.write(line)
     return data
 
 

diff --git a/montreal_forced_aligner/command_line/mfa.py b/montreal_forced_aligner/command_line/mfa.py
@@ -34,6 +34,7 @@
     validate_dictionary_cli,
 )
 from montreal_forced_aligner.config import GLOBAL_CONFIG, update_command_history
+from montreal_forced_aligner.exceptions import DatabaseError
 from montreal_forced_aligner.utils import check_third_party
 
 BEGIN = time.time()
@@ -149,7 +150,10 @@ def mfa_cli(ctx: click.Context) -> None:
         atexit.register(hooks.history_save_handler)
         atexit.register(cleanup_logger)
         if auto_server:
-            atexit.register(stop_server)
+            try:
+                atexit.register(stop_server)
+            except DatabaseError:
+                pass
 
     mp.freeze_support()
 

diff --git a/montreal_forced_aligner/command_line/utils.py b/montreal_forced_aligner/command_line/utils.py
@@ -4,6 +4,7 @@
 import functools
 import logging
 import os
+import re
 import shutil
 import subprocess
 import sys
@@ -27,6 +28,7 @@
 from montreal_forced_aligner.models import MODEL_TYPES
 
 __all__ = [
+    "cleanup_logger",
     "validate_acoustic_model",
     "validate_g2p_model",
     "validate_ivector_extractor",
@@ -234,9 +236,7 @@ def configure_pg(directory):
         "#log_min_duration_statement = -1": "log_min_duration_statement = 5000",
         "#enable_partitionwise_join = off": "enable_partitionwise_join = on",
         "#enable_partitionwise_aggregate = off": "enable_partitionwise_aggregate = on",
-        "#unix_socket_directories = ''": f"unix_socket_directories = '{GLOBAL_CONFIG.database_socket}'",
-        "#unix_socket_directories = '/var/run/postgresql'": f"unix_socket_directories = '{GLOBAL_CONFIG.database_socket}'",
-        "#unix_socket_directories = '/tmp'": f"unix_socket_directories = '{GLOBAL_CONFIG.database_socket}'",
+        "#unix_socket_directories.*": f"unix_socket_directories = '{GLOBAL_CONFIG.database_socket}'",
         "#listen_addresses = 'localhost'": "listen_addresses = ''",
         "max_connections = 100": "max_connections = 1000",
     }
@@ -261,7 +261,7 @@ def configure_pg(directory):
     with mfa_open(directory.joinpath("postgresql.conf"), "r") as f:
         config = f.read()
     for query, rep in configuration_updates.items():
-        config = config.replace(query, rep)
+        config = re.sub(query, rep, config)
     with mfa_open(directory.joinpath("postgresql.conf"), "w") as f:
         f.write(config)
 

diff --git a/montreal_forced_aligner/corpus/base.py b/montreal_forced_aligner/corpus/base.py
@@ -636,6 +636,7 @@ def normalize_text_arguments(self):
                     self.laughter_word,
                     self.oov_word,
                     self.bracketed_word,
+                    self.cutoff_word,
                     self.ignore_case,
                     getattr(self, "use_g2p", False),
                 )

diff --git a/montreal_forced_aligner/corpus/multiprocessing.py b/montreal_forced_aligner/corpus/multiprocessing.py
@@ -238,6 +238,7 @@ class NormalizeTextArguments(MfaArguments):
     laughter_word: str
     oov_word: str
     bracketed_word: str
+    cutoff_word: str
     ignore_case: bool
     use_g2p: bool
 
@@ -277,10 +278,12 @@ def __init__(self, args: NormalizeTextArguments):
         self.laughter_word = args.laughter_word
         self.oov_word = args.oov_word
         self.bracketed_word = args.bracketed_word
+        self.cutoff_word = args.cutoff_word
         self.clitic_marker = None
         self.clitic_cleanup_regex = None
         self.compound_regex = None
         self.bracket_regex = None
+        self.cutoff_regex = None
         self.bracket_sanitize_regex = None
         self.laughter_regex = None
         self.word_break_regex = None
@@ -311,6 +314,10 @@ def compile_regexes(self) -> None:
         if self.brackets:
             left_brackets = [x[0] for x in self.brackets]
             right_brackets = [x[1] for x in self.brackets]
+            self.cutoff_regex = re.compile(
+                rf"[{re.escape(''.join(left_brackets))}](cutoff|hes).*?[{re.escape(''.join(right_brackets))}]+",
+                flags=re.IGNORECASE,
+            )
             self.bracket_regex = re.compile(
                 rf"[{re.escape(''.join(left_brackets))}].*?[{re.escape(''.join(right_brackets))}]+"
             )
@@ -351,6 +358,8 @@ def compile_regexes(self) -> None:
 
         if self.laughter_regex is not None:
             self.non_speech_regexes[self.laughter_word] = self.laughter_regex
+        if self.cutoff_regex is not None:
+            self.non_speech_regexes[self.cutoff_word] = self.cutoff_regex
         if self.bracket_regex is not None:
             self.non_speech_regexes[self.bracketed_word] = self.bracket_regex
 
@@ -398,6 +407,8 @@ def _dictionary_sanitize(self, session):
             non_speech_regexes = {}
             if self.laughter_regex is not None:
                 non_speech_regexes[d.laughter_word] = self.laughter_regex
+            if self.cutoff_regex is not None:
+                non_speech_regexes[d.cutoff_word] = self.cutoff_regex
             if self.bracket_regex is not None:
                 non_speech_regexes[d.bracketed_word] = self.bracket_regex
             split_function = SplitWordsFunction(

diff --git a/montreal_forced_aligner/db.py b/montreal_forced_aligner/db.py
@@ -341,6 +341,7 @@ class Dictionary(MfaSqlBase):
     oov_word = Column(String, nullable=True, default="<unk>")
     oov_phone = Column(String, nullable=True, default="spn")
     bracketed_word = Column(String, nullable=True)
+    cutoff_word = Column(String, nullable=True)
     laughter_word = Column(String, nullable=True)
 
     use_g2p = Column(Boolean, nullable=False, default=False)

diff --git a/montreal_forced_aligner/dictionary/mixins.py b/montreal_forced_aligner/dictionary/mixins.py
@@ -434,6 +434,7 @@ def __init__(
         self.oov_word = oov_word
         self.silence_word = silence_word
         self.bracketed_word = "[bracketed]"
+        self.cutoff_word = "<cutoff>"
         self.laughter_word = "[laughter]"
         self.position_dependent_phones = position_dependent_phones
         self.optional_silence_phone = optional_silence_phone

diff --git a/montreal_forced_aligner/dictionary/multispeaker.py b/montreal_forced_aligner/dictionary/multispeaker.py
@@ -333,6 +333,7 @@ def dictionary_setup(self) -> Tuple[typing.Set[str], collections.Counter]:
                         silence_word=self.silence_word,
                         oov_word=self.oov_word,
                         bracketed_word=self.bracketed_word,
+                        cutoff_word=self.cutoff_word,
                         laughter_word=self.laughter_word,
                         optional_silence_phone=self.optional_silence_phone,
                         oov_phone=self.oov_phone,
@@ -373,6 +374,7 @@ def dictionary_setup(self) -> Tuple[typing.Set[str], collections.Counter]:
 
                     special_words = {self.oov_word: WordType.oov}
                     special_words[self.bracketed_word] = WordType.bracketed
+                    special_words[self.cutoff_word] = WordType.cutoff
                     special_words[self.laughter_word] = WordType.laughter
                     specials_found = set()
                     if not os.path.exists(dictionary_model.path):
@@ -533,6 +535,7 @@ def dictionary_setup(self) -> Tuple[typing.Set[str], collections.Counter]:
                 session.commit()
                 special_graphemes = [self.silence_word, "<space>"]
                 special_graphemes.append(self.bracketed_word)
+                special_graphemes.append(self.cutoff_word)
                 special_graphemes.append(self.laughter_word)
                 for g in special_graphemes:
                     grapheme_objs.append(
@@ -1245,7 +1248,14 @@ def words_for_export(
                         Word.word_type.in_(
                             [WordType.speech, WordType.clitic, WordType.interjection]
                         ),
-                        Word.word.in_([self.oov_word, self.bracketed_word, self.laughter_word]),
+                        Word.word.in_(
+                            [
+                                self.oov_word,
+                                self.bracketed_word,
+                                self.cutoff_word,
+                                self.laughter_word,
+                            ]
+                        ),
                     ),
                 )
                 .order_by(Word.word)

diff --git a/montreal_forced_aligner/exceptions.py b/montreal_forced_aligner/exceptions.py
@@ -54,6 +54,7 @@
     "ModelTypeNotSupportedError",
     "PronunciationAcousticMismatchError",
     "RootDirectoryError",
+    "DatabaseError",
 ]
 
 

diff --git a/montreal_forced_aligner/utils.py b/montreal_forced_aligner/utils.py
@@ -36,6 +36,7 @@
 from montreal_forced_aligner.textgrid import process_ctm_line
 
 __all__ = [
+    "check_third_party",
     "thirdparty_binary",
     "log_kaldi_errors",
     "get_mfa_version",