Phonemes from file #8054

Closed · 2 commits
Changes to the IPA tokenizer module:

@@ -21,6 +21,7 @@
 
 from nemo.collections.common.tokenizers.text_to_speech.ipa_lexicon import (
     get_grapheme_character_set,
+    get_ipa_character_set,
     get_ipa_punctuation_list,
     validate_locale,
 )
@@ -644,6 +645,7 @@ def __init__(
         sep='|',  # To be able to distinguish between symbols
         add_blank_at=None,
         pad_with_space=False,
+        read_phones_from_file=False,
     ):
         """General-purpose IPA-based tokenizer.
         Args:
@@ -667,6 +669,9 @@ def __init__(
             add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any non None),
                 if None then no blank in labels.
             pad_with_space: Whether to pad text with spaces at the beginning and at the end or not.
+            read_phones_from_file: Whether the text read from the manifest should be treated as the sentence's
+                phoneme sequence. When this is set to True and fixed_vocab is not passed, the phoneme tokens
+                used will be the ones defined in the g2p module for the given locale.
         """
         if not hasattr(g2p, "symbols"):
             logging.error(
@@ -682,6 +687,9 @@ def __init__(
         self.phoneme_probability = None
         if hasattr(g2p, "phoneme_probability"):
             self.phoneme_probability = g2p.phoneme_probability
+        if read_phones_from_file:
+            # if the input text is already a phoneme sequence, only phonemes are used during training
+            self.phoneme_probability = 1
 
         if locale == "en-US":
             self.text_preprocessing_func = lambda text: english_text_preprocessing(text, lower=False)
@@ -698,6 +706,8 @@ def __init__(
                     "Did not replace G2P valid symbol set since the given set is equivalent to the existing one."
                 )
                 self.set_fixed_vocab = False
+            elif read_phones_from_file:
+                tokens = get_ipa_character_set(locale)
             else:
                 g2p.replace_symbols(tokens)
         else:
@@ -734,14 +744,19 @@ def __init__(
         self.pad_with_space = pad_with_space
 
         self.g2p = g2p
+        self.read_phones_from_file = read_phones_from_file
 
     def encode(self, text: str) -> List[int]:
         """See base class for more information."""
         # normalize the input text with "NFC" form.
         text = self.text_preprocessing_func(text)
 
-        # transliterate the text into phoneme sequences and/or grapheme sequences.
-        g2p_text = self.g2p(text)
+        if self.read_phones_from_file:
+            # split the input text into characters if it is to be used directly as the phoneme sequence
+            g2p_text = [c for c in text]
+        else:
+            # transliterate the text into phoneme sequences and/or grapheme sequences.
+            g2p_text = self.g2p(text)
 
         return self.encode_from_g2p(g2p_text, text)
 
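For reviewers, a minimal usage sketch of the new flag, mirroring the unit test added below. The IpaG2p import path and the lexicon path are assumptions for illustration and are not part of this PR.

# Sketch only, not part of this PR (assumed: IpaG2p import path, placeholder lexicon path).
from nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers import IPATokenizer
from nemo.collections.tts.g2p.models.i18n_ipa import IpaG2p  # assumed import path

g2p = IpaG2p(phoneme_dict="/path/to/fr_FR_lexicon.dict", locale="fr-FR")  # placeholder lexicon

# With read_phones_from_file=True, encode() does not run G2P: the input string is
# taken as an IPA phoneme sequence and split into characters.
tokenizer = IPATokenizer(g2p=g2p, locale="fr-FR", read_phones_from_file=True)
token_ids = tokenizer.encode("bɔ̃ʒˈuʁ lˈə- mˈɔ̃d")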
Changes to the tokenizer unit tests:

@@ -217,6 +217,18 @@ def test_ipa_tokenizer_fr_fr(self):
 
         assert chars == expected_output
 
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_ipa_tokenizer_fr_fr_from_text(self):
+        input_text = "bɔ̃ʒˈuʁ lˈə- mˈɔ̃d"
+        expected_output = "bɔ̃ʒˈuʁ lˈə- mˈɔ̃d"
+
+        g2p = IpaG2p(phoneme_dict=self.PHONEME_DICT_FR, locale="fr-FR")
+        tokenizer = IPATokenizer(g2p=g2p, locale="fr-FR", read_phones_from_file=True)
+        chars, tokens = self._parse_text(tokenizer, input_text)
+
+        assert chars == expected_output
+
     @pytest.mark.run_only_on('CPU')
     @pytest.mark.unit
     def test_ipa_tokenizer_fixed_vocab(self):
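For reference, a sketch of what a training manifest line could carry once read_phones_from_file is enabled: the text field holds the IPA phoneme string directly rather than graphemes. Field names follow the usual NeMo JSON-lines manifest convention; the audio path and duration below are placeholders.

# Sketch only: one JSON-lines manifest entry whose "text" is already a phoneme sequence.
import json

entry = {
    "audio_filepath": "/data/audio/bonjour_le_monde.wav",  # placeholder
    "duration": 1.2,  # placeholder
    "text": "bɔ̃ʒˈuʁ lˈə- mˈɔ̃d",  # consumed as-is when read_phones_from_file=True
}
with open("train_manifest.json", "a", encoding="utf-8") as f:
    f.write(json.dumps(entry, ensure_ascii=False) + "\n")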