Phonemes from file #8054

Closed · 2 commits
Changes to the IPA tokenizer module:

@@ -21,6 +21,7 @@
 
 from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import (
     get_grapheme_character_set,
+    get_ipa_character_set,
     get_ipa_punctuation_list,
     validate_locale,
 )
@@ -644,6 +645,7 @@ def __init__(
         sep='|',  # To be able to distinguish between symbols
         add_blank_at=None,
         pad_with_space=False,
+        read_phones_from_file=False,
     ):
         """General-purpose IPA-based tokenizer.
         Args:
@@ -667,6 +669,9 @@ def __init__(
             add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
                 if None then no blank in labels.
             pad_with_space: Whether to pad text with spaces at the beginning and at the end or not.
+            read_phones_from_file: Whether the text read from the manifest should be treated as the sentence's
+                phoneme sequence. When this is set to True and fixed_vocab is not passed, the phoneme tokens
+                used will be the ones defined in the g2p module for the given locale.
         """
         if not hasattr(g2p, "symbols"):
             logging.error(
@@ -682,6 +687,9 @@ def __init__(
         self.phoneme_probability = None
         if hasattr(g2p, "phoneme_probability"):
             self.phoneme_probability = g2p.phoneme_probability
+        if read_phones_from_file:
+            # if the input text is already a phoneme sequence, only phonemes are used during training
+            self.phoneme_probability = 1
 
         if locale == "en-US":
             self.text_preprocessing_func = lambda text: english_text_preprocessing(text, lower=False)
@@ -698,6 +706,8 @@ def __init__(
                     "Did not replace G2P valid symbol set since the given set is equivalent to the existing one."
                 )
                 self.set_fixed_vocab = False
+            elif read_phones_from_file:
+                tokens = get_ipa_character_set(locale)
             else:
                 g2p.replace_symbols(tokens)
         else:
@@ -734,14 +744,19 @@ def __init__(
         self.pad_with_space = pad_with_space
 
         self.g2p = g2p
+        self.read_phones_from_file = read_phones_from_file
 
     def encode(self, text: str) -> List[int]:
         """See base class for more information."""
         # normalize the input text with "NFC" form.
         text = self.text_preprocessing_func(text)
 
-        # transliterate the text into phoneme sequences and/or grapheme sequences.
-        g2p_text = self.g2p(text)
+        if self.read_phones_from_file:
+            # split the input text into characters if it is to be used directly as the phoneme sequence
+            g2p_text = [c for c in text]
+        else:
+            # transliterate the text into phoneme sequences and/or grapheme sequences.
+            g2p_text = self.g2p(text)
 
         return self.encode_from_g2p(g2p_text, text)
 
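For reviewers, a minimal usage sketch of the new flag, mirroring the unit test added below. The IpaG2p import path and the lexicon path are assumptions for illustration and are not part of this PR.

# Sketch only, not part of this PR (assumed: IpaG2p import path, placeholder lexicon path).
from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import IPATokenizer
from nemo.collections.tts.g2p.models.i18n_ipa import IpaG2p  # assumed import path

g2p = IpaG2p(phoneme_dict="/path/to/fr_FR_lexicon.dict", locale="fr-FR")  # placeholder lexicon

# With read_phones_from_file=True, encode() does not run G2P: the input string is
# taken as an IPA phoneme sequence and split into characters.
tokenizer = IPATokenizer(g2p=g2p, locale="fr-FR", read_phones_from_file=True)
token_ids = tokenizer.encode("bɔ̃ʒˈuʁ lˈə- mˈɔ̃d")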
Changes to the tokenizer unit tests:

@@ -217,6 +217,18 @@ def test_ipa_tokenizer_fr_fr(self):
 
         assert chars == expected_output
 
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_ipa_tokenizer_fr_fr_from_text(self):
+        input_text = "bɔ̃ʒˈuʁ lˈə- mˈɔ̃d"
+        expected_output = "bɔ̃ʒˈuʁ lˈə- mˈɔ̃d"
+
+        g2p = IpaG2p(phoneme_dict=self.PHONEME_DICT_FR, locale="fr-FR")
+        tokenizer = IPATokenizer(g2p=g2p, locale="fr-FR", read_phones_from_file=True)
+        chars, tokens = self._parse_text(tokenizer, input_text)
+
+        assert chars == expected_output
+
     @pytest.mark.run_only_on('CPU')
     @pytest.mark.unit
     def test_ipa_tokenizer_fixed_vocab(self):
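For reference, a sketch of what a training manifest line could carry once read_phones_from_file is enabled: the text field holds the IPA phoneme string directly rather than graphemes. Field names follow the usual NeMo JSON-lines manifest convention; the audio path and duration below are placeholders.

# Sketch only: one JSON-lines manifest entry whose "text" is already a phoneme sequence.
import json

entry = {
    "audio_filepath": "/data/audio/bonjour_le_monde.wav",  # placeholder
    "duration": 1.2,  # placeholder
    "text": "bɔ̃ʒˈuʁ lˈə- mˈɔ̃d",  # consumed as-is when read_phones_from_file=True
}
with open("train_manifest.json", "a", encoding="utf-8") as f:
    f.write(json.dumps(entry, ensure_ascii=False) + "\n")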