Skip to content

Commit

Permalink
Bug fixes (#638)
Browse files Browse the repository at this point in the history
  • Loading branch information
mmcauliffe committed May 17, 2023
1 parent 60c32ca commit ab58ca6
Show file tree
Hide file tree
Showing 13 changed files with 66 additions and 23 deletions.
10 changes: 10 additions & 0 deletions docs/source/changelog/changelog_2.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,16 @@
2.2 Changelog
*************

2.2.11
======

- Make socket updating more general
- Remove false "no alignments" warning in alignment iterations while training
- Fixed a bug in adding words to a dictionary
- Fixed a bug where words marked as "<cutoff>" were being treated as "[bracketed]"
- Silence DatabaseError while cleaning up MFA
- Fix a crash in fine tuning

2.2.10
======

Expand Down
12 changes: 6 additions & 6 deletions montreal_forced_aligner/acoustic_modeling/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,12 +440,12 @@ def align_iteration(self) -> None:
logger.debug(
f"Generating alignments for iteration {self.iteration} took {time.time()-begin} seconds"
)
logger.debug(f"Analyzing information for alignment in iteration {self.iteration}...")
begin = time.time()
self.compile_information()
logger.debug(
f"Analyzing iteration {self.iteration} alignments took {time.time()-begin} seconds"
)
# logger.debug(f"Analyzing information for alignment in iteration {self.iteration}...")
# begin = time.time()
# self.compile_information()
# logger.debug(
# f"Analyzing iteration {self.iteration} alignments took {time.time()-begin} seconds"
# )

@property
def initialized(self) -> bool:
Expand Down
13 changes: 6 additions & 7 deletions montreal_forced_aligner/alignment/mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,13 +499,16 @@ def compile_information(self) -> None:
average_logdet_sum = 0
average_logdet_frames = 0
beam_too_narrow_count = 0
too_short_count = 0
for data in alignment_info.values():
for k, data in alignment_info.items():
logger.debug(f"For job {k}:")
logger.debug(f'{len(data["unaligned"])} beam too narrow')
logger.debug(f'{data["total_frames"]} total frames')
logger.debug(f'{data["log_like"]} average log-likelihood')
beam_too_narrow_count += len(data["unaligned"])
too_short_count += len(data["too_short"])
avg_like_frames += data["total_frames"]
avg_like_sum += data["log_like"] * data["total_frames"]
if "logdet_frames" in data:
logger.debug(f'{data["logdet"]} average logdet')
average_logdet_frames += data["logdet_frames"]
average_logdet_sum += data["logdet"] * data["logdet_frames"]

Expand Down Expand Up @@ -538,10 +541,6 @@ def compile_information(self) -> None:
"No files were aligned, this likely indicates serious problems with the aligner."
)
else:
if too_short_count:
logger.debug(
f"There were {too_short_count} utterances that were too short to be aligned."
)
if beam_too_narrow_count:
logger.debug(
f"There were {beam_too_narrow_count} utterances that could not be aligned with "
Expand Down
12 changes: 8 additions & 4 deletions montreal_forced_aligner/alignment/multiprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1445,7 +1445,6 @@ def _run(self) -> typing.Generator[typing.Tuple[int, float]]:
feature_segment_path,
"ark:-",
],
stdin=paste_proc.stdout,
stderr=log_file,
stdout=subprocess.PIPE,
env=os.environ,
Expand Down Expand Up @@ -1874,28 +1873,33 @@ def compile_information_func(
r"^WARNING .* Did not successfully decode file (?P<utt>.*?), .*$"
)

data = {"unaligned": [], "too_short": [], "log_like": 0, "total_frames": 0}
data = {"unaligned": [], "log_like": 0, "total_frames": 0}
align_log_path = arguments.align_log_path
if not os.path.exists(align_log_path):
align_log_path = align_log_path.with_suffix(".fmllr.log")
with mfa_open(arguments.log_path, "w"), mfa_open(align_log_path, "r") as f:
with mfa_open(arguments.log_path, "w") as log_file, mfa_open(align_log_path, "r") as f:
log_file.write(f"Processing {align_log_path}...\n")
for line in f:
decode_error_match = re.match(decode_error_pattern, line)
if decode_error_match:
data["unaligned"].append(decode_error_match.group("utt"))
utt = decode_error_match.group("utt")
data["unaligned"].append(utt)
log_file.write(f"Unaligned: {utt}\n")
continue
log_like_match = re.search(log_like_pattern, line)
if log_like_match:
log_like = log_like_match.group("log_like")
frames = log_like_match.group("frames")
data["log_like"] = float(log_like)
data["total_frames"] = int(frames)
log_file.write(line)
m = re.search(average_logdet_pattern, line)
if m:
logdet = float(m.group("logdet"))
frames = float(m.group("frames"))
data["logdet"] = logdet
data["logdet_frames"] = frames
log_file.write(line)
return data


Expand Down
6 changes: 5 additions & 1 deletion montreal_forced_aligner/command_line/mfa.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
validate_dictionary_cli,
)
from montreal_forced_aligner.config import GLOBAL_CONFIG, update_command_history
from montreal_forced_aligner.exceptions import DatabaseError
from montreal_forced_aligner.utils import check_third_party

BEGIN = time.time()
Expand Down Expand Up @@ -149,7 +150,10 @@ def mfa_cli(ctx: click.Context) -> None:
atexit.register(hooks.history_save_handler)
atexit.register(cleanup_logger)
if auto_server:
atexit.register(stop_server)
try:
atexit.register(stop_server)
except DatabaseError:
pass

mp.freeze_support()

Expand Down
8 changes: 4 additions & 4 deletions montreal_forced_aligner/command_line/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import functools
import logging
import os
import re
import shutil
import subprocess
import sys
Expand All @@ -27,6 +28,7 @@
from montreal_forced_aligner.models import MODEL_TYPES

__all__ = [
"cleanup_logger",
"validate_acoustic_model",
"validate_g2p_model",
"validate_ivector_extractor",
Expand Down Expand Up @@ -234,9 +236,7 @@ def configure_pg(directory):
"#log_min_duration_statement = -1": "log_min_duration_statement = 5000",
"#enable_partitionwise_join = off": "enable_partitionwise_join = on",
"#enable_partitionwise_aggregate = off": "enable_partitionwise_aggregate = on",
"#unix_socket_directories = ''": f"unix_socket_directories = '{GLOBAL_CONFIG.database_socket}'",
"#unix_socket_directories = '/var/run/postgresql'": f"unix_socket_directories = '{GLOBAL_CONFIG.database_socket}'",
"#unix_socket_directories = '/tmp'": f"unix_socket_directories = '{GLOBAL_CONFIG.database_socket}'",
"#unix_socket_directories.*": f"unix_socket_directories = '{GLOBAL_CONFIG.database_socket}'",
"#listen_addresses = 'localhost'": "listen_addresses = ''",
"max_connections = 100": "max_connections = 1000",
}
Expand All @@ -261,7 +261,7 @@ def configure_pg(directory):
with mfa_open(directory.joinpath("postgresql.conf"), "r") as f:
config = f.read()
for query, rep in configuration_updates.items():
config = config.replace(query, rep)
config = re.sub(query, rep, config)
with mfa_open(directory.joinpath("postgresql.conf"), "w") as f:
f.write(config)

Expand Down
1 change: 1 addition & 0 deletions montreal_forced_aligner/corpus/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,6 +636,7 @@ def normalize_text_arguments(self):
self.laughter_word,
self.oov_word,
self.bracketed_word,
self.cutoff_word,
self.ignore_case,
getattr(self, "use_g2p", False),
)
Expand Down
11 changes: 11 additions & 0 deletions montreal_forced_aligner/corpus/multiprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ class NormalizeTextArguments(MfaArguments):
laughter_word: str
oov_word: str
bracketed_word: str
cutoff_word: str
ignore_case: bool
use_g2p: bool

Expand Down Expand Up @@ -277,10 +278,12 @@ def __init__(self, args: NormalizeTextArguments):
self.laughter_word = args.laughter_word
self.oov_word = args.oov_word
self.bracketed_word = args.bracketed_word
self.cutoff_word = args.cutoff_word
self.clitic_marker = None
self.clitic_cleanup_regex = None
self.compound_regex = None
self.bracket_regex = None
self.cutoff_regex = None
self.bracket_sanitize_regex = None
self.laughter_regex = None
self.word_break_regex = None
Expand Down Expand Up @@ -311,6 +314,10 @@ def compile_regexes(self) -> None:
if self.brackets:
left_brackets = [x[0] for x in self.brackets]
right_brackets = [x[1] for x in self.brackets]
self.cutoff_regex = re.compile(
rf"[{re.escape(''.join(left_brackets))}](cutoff|hes).*?[{re.escape(''.join(right_brackets))}]+",
flags=re.IGNORECASE,
)
self.bracket_regex = re.compile(
rf"[{re.escape(''.join(left_brackets))}].*?[{re.escape(''.join(right_brackets))}]+"
)
Expand Down Expand Up @@ -351,6 +358,8 @@ def compile_regexes(self) -> None:

if self.laughter_regex is not None:
self.non_speech_regexes[self.laughter_word] = self.laughter_regex
if self.cutoff_regex is not None:
self.non_speech_regexes[self.cutoff_word] = self.cutoff_regex
if self.bracket_regex is not None:
self.non_speech_regexes[self.bracketed_word] = self.bracket_regex

Expand Down Expand Up @@ -398,6 +407,8 @@ def _dictionary_sanitize(self, session):
non_speech_regexes = {}
if self.laughter_regex is not None:
non_speech_regexes[d.laughter_word] = self.laughter_regex
if self.cutoff_regex is not None:
non_speech_regexes[d.cutoff_word] = self.cutoff_regex
if self.bracket_regex is not None:
non_speech_regexes[d.bracketed_word] = self.bracket_regex
split_function = SplitWordsFunction(
Expand Down
1 change: 1 addition & 0 deletions montreal_forced_aligner/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,7 @@ class Dictionary(MfaSqlBase):
oov_word = Column(String, nullable=True, default="<unk>")
oov_phone = Column(String, nullable=True, default="spn")
bracketed_word = Column(String, nullable=True)
cutoff_word = Column(String, nullable=True)
laughter_word = Column(String, nullable=True)

use_g2p = Column(Boolean, nullable=False, default=False)
Expand Down
1 change: 1 addition & 0 deletions montreal_forced_aligner/dictionary/mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,7 @@ def __init__(
self.oov_word = oov_word
self.silence_word = silence_word
self.bracketed_word = "[bracketed]"
self.cutoff_word = "<cutoff>"
self.laughter_word = "[laughter]"
self.position_dependent_phones = position_dependent_phones
self.optional_silence_phone = optional_silence_phone
Expand Down
12 changes: 11 additions & 1 deletion montreal_forced_aligner/dictionary/multispeaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,7 @@ def dictionary_setup(self) -> Tuple[typing.Set[str], collections.Counter]:
silence_word=self.silence_word,
oov_word=self.oov_word,
bracketed_word=self.bracketed_word,
cutoff_word=self.cutoff_word,
laughter_word=self.laughter_word,
optional_silence_phone=self.optional_silence_phone,
oov_phone=self.oov_phone,
Expand Down Expand Up @@ -373,6 +374,7 @@ def dictionary_setup(self) -> Tuple[typing.Set[str], collections.Counter]:

special_words = {self.oov_word: WordType.oov}
special_words[self.bracketed_word] = WordType.bracketed
special_words[self.cutoff_word] = WordType.cutoff
special_words[self.laughter_word] = WordType.laughter
specials_found = set()
if not os.path.exists(dictionary_model.path):
Expand Down Expand Up @@ -533,6 +535,7 @@ def dictionary_setup(self) -> Tuple[typing.Set[str], collections.Counter]:
session.commit()
special_graphemes = [self.silence_word, "<space>"]
special_graphemes.append(self.bracketed_word)
special_graphemes.append(self.cutoff_word)
special_graphemes.append(self.laughter_word)
for g in special_graphemes:
grapheme_objs.append(
Expand Down Expand Up @@ -1245,7 +1248,14 @@ def words_for_export(
Word.word_type.in_(
[WordType.speech, WordType.clitic, WordType.interjection]
),
Word.word.in_([self.oov_word, self.bracketed_word, self.laughter_word]),
Word.word.in_(
[
self.oov_word,
self.bracketed_word,
self.cutoff_word,
self.laughter_word,
]
),
),
)
.order_by(Word.word)
Expand Down
1 change: 1 addition & 0 deletions montreal_forced_aligner/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
"ModelTypeNotSupportedError",
"PronunciationAcousticMismatchError",
"RootDirectoryError",
"DatabaseError",
]


Expand Down
1 change: 1 addition & 0 deletions montreal_forced_aligner/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from montreal_forced_aligner.textgrid import process_ctm_line

__all__ = [
"check_third_party",
"thirdparty_binary",
"log_kaldi_errors",
"get_mfa_version",
Expand Down

0 comments on commit ab58ca6

Please sign in to comment.