From ec104cb4a3bc0cd68a3da7f8fc1e7c9931149c9e Mon Sep 17 00:00:00 2001 From: ekmb Date: Tue, 25 May 2021 12:53:22 -0700 Subject: [PATCH 01/17] add jenkins test, refactoring Signed-off-by: ekmb --- Jenkinsfile | 18 +++++ .../text_normalization/data/suppletive.tsv | 3 +- .../data/whitelist_alternatives.tsv | 7 -- .../normalize_with_audio.py | 67 +++++++++---------- .../text_normalization/taggers/cardinal.py | 14 +++- .../test_cases_normalize_with_audio.txt | 58 +++++++++++++--- tests/nemo_text_processing/utils.py | 12 +++- 7 files changed, 119 insertions(+), 60 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index ba2e3606e5a5..3daec034db46 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -167,6 +167,24 @@ pipeline { sh 'rm -rf /home/TestData/nlp/text_denorm/output/*' } } + stage('L2: TN with Audio (audio and raw text)') { + steps { + sh 'cd nemo_text_processing/text_normalization && \ + python normalize_with_audio.py --text "The total amounts to \$4.76." \ + --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n 1 > /home/TestData/nlp/text_norm/audio_based/output/out_raw.txt 2>&1 && \ + cmp --silent /home/TestData/nlp/text_norm/audio_based/output/out_raw.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1' + sh 'rm -rf /home/TestData/nlp/text_norm/audio_based/output/out_raw.txt' + } + } + stage('L2: TN with Audio (audio and text file)') { + steps { + sh 'cd nemo_text_processing/text_normalization && \ + python normalize_with_audio.py --text /home/TestData/nlp/text_norm/audio_based/text.txt \ + --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n 1 > /home/TestData/nlp/text_norm/audio_based/output/out_file.txt 2>&1 && \ + cmp --silent /home/TestData/nlp/text_norm/audio_based/output/out_file.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1' + sh 'rm -rf /home/TestData/nlp/text_norm/audio_based/output/out_file.txt' + } + } } } diff --git 
a/nemo_text_processing/text_normalization/data/suppletive.tsv b/nemo_text_processing/text_normalization/data/suppletive.tsv index dea620f37ab3..be59872c28d9 100644 --- a/nemo_text_processing/text_normalization/data/suppletive.tsv +++ b/nemo_text_processing/text_normalization/data/suppletive.tsv @@ -34,4 +34,5 @@ revolution per minute revolutions per minute mile per hour miles per hour megabit per second megabits per second square foot square feet -kilobit per second kilobits per second \ No newline at end of file +kilobit per second kilobits per second +lb \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/data/whitelist_alternatives.tsv b/nemo_text_processing/text_normalization/data/whitelist_alternatives.tsv index ffb60fceb1c4..6b9e93d66d4f 100644 --- a/nemo_text_processing/text_normalization/data/whitelist_alternatives.tsv +++ b/nemo_text_processing/text_normalization/data/whitelist_alternatives.tsv @@ -1,10 +1,3 @@ -II. the Second -II Second -III. the Third -III Third -IV. the Fourth -IV Fourth -VIII. the Eighth Hon. honorable Hon. honourable St. 
street diff --git a/nemo_text_processing/text_normalization/normalize_with_audio.py b/nemo_text_processing/text_normalization/normalize_with_audio.py index 880606718633..82f23e144672 100644 --- a/nemo_text_processing/text_normalization/normalize_with_audio.py +++ b/nemo_text_processing/text_normalization/normalize_with_audio.py @@ -15,6 +15,7 @@ import json import os import re +import time from argparse import ArgumentParser from typing import List, Tuple @@ -28,6 +29,7 @@ try: import pynini + from pynini.lib import rewrite PYNINI_AVAILABLE = True except (ModuleNotFoundError, ImportError): @@ -83,7 +85,6 @@ def normalize_with_audio(self, text: str, verbose: bool = False) -> str: Args: text: string that may include semiotic classes - transcript: transcription of the audio verbose: whether to print intermediate meta information Returns: @@ -96,12 +97,7 @@ def normalize_with_audio(self, text: str, verbose: bool = False) -> str: return text text = pynini.escape(text) - def get_tagged_texts(text): - tagged_lattice = self.find_tags(text) - tagged_texts = self.select_all_semiotic_tags(tagged_lattice) - return tagged_texts - - tagged_texts = set(get_tagged_texts(text)) + tagged_texts = rewrite.rewrites(text, self.tagger.fst) normalized_texts = [] for tagged_text in tagged_texts: @@ -110,23 +106,21 @@ def get_tagged_texts(text): tags_reordered = self.generate_permutations(tokens) for tagged_text_reordered in tags_reordered: tagged_text_reordered = pynini.escape(tagged_text_reordered) - - verbalizer_lattice = self.find_verbalizer(tagged_text_reordered) - if verbalizer_lattice.num_states() == 0: + try: + verbalized = rewrite.rewrites(tagged_text_reordered, self.verbalizer.fst) + normalized_texts.extend(verbalized) + except pynini.lib.rewrite.Error: continue - verbalized = self.get_all_verbalizers(verbalizer_lattice) - for verbalized_option in verbalized: - normalized_texts.append(verbalized_option) - if len(normalized_texts) == 0: raise ValueError() - normalized_texts = 
[post_process(t) for t in normalized_texts] normalized_texts = set(normalized_texts) return normalized_texts - def select_best_match(self, normalized_texts: List[str], transcript: str, verbose: bool = False): + def select_best_match( + self, normalized_texts: List[str], transcript: str, verbose: bool = False, remove_punct: bool = False + ): """ Selects the best normalization option based on the lowest CER @@ -134,11 +128,12 @@ def select_best_match(self, normalized_texts: List[str], transcript: str, verbos normalized_texts: normalized text options transcript: ASR model transcript of the audio file corresponding to the normalized text verbose: whether to print intermediate meta information + remove_punct: whether to remove punctuation before calculating CER Returns: normalized text with the lowest CER and CER value """ - normalized_texts = calculate_cer(normalized_texts, transcript) + normalized_texts = calculate_cer(normalized_texts, transcript, remove_punct) normalized_texts = sorted(normalized_texts, key=lambda x: x[1]) normalized_text, cer = normalized_texts[0] @@ -149,16 +144,6 @@ def select_best_match(self, normalized_texts: List[str], transcript: str, verbos print('-' * 30) return normalized_text, cer - def select_all_semiotic_tags(self, lattice: 'pynini.FstLike', n=100) -> List[str]: - tagged_text_options = pynini.shortestpath(lattice, nshortest=n) - tagged_text_options = [t[1] for t in tagged_text_options.paths("utf8").items()] - return tagged_text_options - - def get_all_verbalizers(self, lattice: 'pynini.FstLike', n=100) -> List[str]: - verbalized_options = pynini.shortestpath(lattice, nshortest=n) - verbalized_options = [t[1] for t in verbalized_options.paths("utf8").items()] - return verbalized_options - def calculate_cer(normalized_texts: List[str], transcript: str, remove_punct=False) -> List[Tuple[str, float]]: """ @@ -266,6 +251,7 @@ def parse_args(): '--model', type=str, default='QuartzNet15x5Base-En', help='Pre-trained model name or path to model 
checkpoint' ) parser.add_argument("--verbose", help="print info for debugging", action='store_true') + parser.add_argument("--remove_punct", help="remove punctuation before calculating cer", action='store_true') return parser.parse_args() @@ -275,7 +261,7 @@ def normalize_manifest(args): manifest: path to .json manifest file. """ normalizer = NormalizerWithAudio(input_case=args.input_case) - manifest_out = args.audio_data.replace('.json', '_nemo_wfst.json') + manifest_out = args.audio_data.replace('.json', '_normalized.json') asr_model = None with open(args.audio_data, 'r') as f: with open(manifest_out, 'w') as f_out: @@ -289,10 +275,11 @@ def normalize_manifest(args): asr_model = get_asr_model(args.model) transcript = asr_model.transcribe([audio])[0] normalized_texts = normalizer.normalize_with_audio(line['text'], args.verbose) - normalized_text, cer = normalizer.select_best_match(normalized_texts, transcript, args.verbose) - - line['nemo_wfst'] = normalized_text - line['CER_nemo_wfst'] = cer + normalized_text, cer = normalizer.select_best_match( + normalized_texts, transcript, args.verbose, args.remove_punct + ) + line['nemo_normalized'] = normalized_text + line['CER_nemo_normalized'] = cer f_out.write(json.dumps(line, ensure_ascii=False) + '\n') print(f'Normalized version saved at {manifest_out}') @@ -300,18 +287,25 @@ def normalize_manifest(args): if __name__ == "__main__": args = parse_args() + start = time.time() if args.text: normalizer = NormalizerWithAudio(input_case=args.input_case) if os.path.exists(args.text): with open(args.text, 'r') as f: - args.text = f.read() + args.text = f.read().strip() normalized_texts = normalizer.normalize_with_audio(args.text, args.verbose) - for norm_text in normalized_texts: - print(norm_text) if args.audio_data: asr_model = get_asr_model(args.model) transcript = asr_model.transcribe([args.audio_data])[0] - normalized_text, cer = normalizer.select_best_match(normalized_texts, transcript, args.verbose) + normalized_text, 
cer = normalizer.select_best_match( + normalized_texts, transcript, args.verbose, args.remove_punct + ) + print(f'Transcript: {transcript}') + print(f'Normalized: {normalized_text}') + else: + print('Normalization options:') + for norm_text in normalized_texts: + print(norm_text) elif not os.path.exists(args.audio_data): raise ValueError(f'{args.audio_data} not found.') elif args.audio_data.endswith('.json'): @@ -322,3 +316,4 @@ def normalize_manifest(args): + "'--audio_data' path to audio file and '--text' path to a text file OR" "'--text' string text (for debugging without audio)" ) + print(f'Execution time: {round((time.time() - start)/60, 2)} min.') diff --git a/nemo_text_processing/text_normalization/taggers/cardinal.py b/nemo_text_processing/text_normalization/taggers/cardinal.py index 7f9f821129ea..7f4b6fe68714 100644 --- a/nemo_text_processing/text_normalization/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/taggers/cardinal.py @@ -69,8 +69,18 @@ def __init__(self, deterministic: bool = True): 1, ) ) - - self.graph = self.graph | self.single_digits_graph | get_hundreds_graph() | single_digits_graph_with_commas + range_graph = ( + self.graph + + (pynini.cross("-", " to ") | pynini.cross("x", " by ") | pynini.cross(" x ", " by ")) + + self.graph + ) + self.graph = ( + self.graph + | self.single_digits_graph + | get_hundreds_graph() + | single_digits_graph_with_commas + | range_graph + ) optional_minus_graph = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1) final_graph = optional_minus_graph + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"") diff --git a/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt index d6d12defbb4f..e5d8b9b22d96 100644 --- a/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt +++ 
b/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt @@ -1,11 +1,47 @@ -123~123|one twenty three|one two three|one hundred twenty three|one hundred and twenty three -$123~$123|one twenty three dollars|one two three dollars|one hundred twenty three dollars|one hundred and twenty three dollars -$123.2~$123.2|one twenty three dollars and two cents|one two three dollars two cents|one hundred twenty three dollars and two cents|one hundred and twenty three point two dollars -1.24~1.24|one point two four|one two four|one point twenty four|one twenty four -$1.21~one dollar and twenty one cents|one dollar twenty one cents|one point two one dollars|one twenty one dollars -£1.00~one pound|one point o o pounds -t-0t25d12-f~t zero t twenty five d twelve f -133-A~one hundred thirty three A -B2A23C~B two A twenty three C -25d08A~twenty five d zero eight A -C24~C twenty four \ No newline at end of file +>>>123 +123 +one twenty three +one two three +one hundred twenty three +one hundred and twenty three +>>>$123 +$123 +one twenty three dollars +one two three dollars +one hundred twenty three dollars +one hundred and twenty three dollars +>>>$123.2 +$123.2 +one twenty three dollars and two cents +one two three dollars two cents +one hundred twenty three dollars and two cents +one hundred and twenty three point two dollars +>>>1.24 +1.24 +one point two four +one two four +one point twenty four +one twenty four +>>>$1.21 +one dollar and twenty one cents +one dollar twenty one cents +one point two one dollars +one twenty one dollars +>>>£1.00 +one pound +one point o o pounds +>>>t-0t25d12-f +t zero t twenty five d twelve f +>>>133-A +one hundred thirty three A +>>>B2A23C +B two A twenty three C +>>>25d08A +twenty five d zero eight A +>>>C24 +C twenty four +>>>It seemed to her that the jacket Oswald wore was darker than Commission Exhibit No. 162. +It seemed to her that the jacket Oswald wore was darker than Commission Exhibit number one sixty two. 
+It seemed to her that the jacket Oswald wore was darker than Commission Exhibit number one hundred sixty two. +It seemed to her that the jacket Oswald wore was darker than Commission Exhibit number one hundred and sixty two. +It seemed to her that the jacket Oswald wore was darker than Commission Exhibit number one six two. \ No newline at end of file diff --git a/tests/nemo_text_processing/utils.py b/tests/nemo_text_processing/utils.py index ca9750fc54f9..8372287f6e18 100644 --- a/tests/nemo_text_processing/utils.py +++ b/tests/nemo_text_processing/utils.py @@ -41,8 +41,14 @@ def get_test_cases_multiple(file_name: str = 'data_text_normalization/test_cases """ test_pairs = [] with open(os.path.dirname(os.path.abspath(__file__)) + os.path.sep + file_name, 'r') as f: + written = None + normalized_options = [] for line in f: - written, normalized_options = line.split('~') - normalized_options = normalized_options.strip().split('|') - test_pairs.append((written, normalized_options)) + if line.startswith('>>>'): + if written: + test_pairs.append((written, normalized_options)) + normalized_options = [] + written = line.strip().replace('>>>', '') + else: + normalized_options.append(line.strip()) return test_pairs From 131689c443a97f51e37032bd79fbe7950e312597 Mon Sep 17 00:00:00 2001 From: ekmb Date: Tue, 25 May 2021 13:06:01 -0700 Subject: [PATCH 02/17] update test Signed-off-by: ekmb --- .../test_cases_normalize_with_audio.txt | 24 +++++++++---------- tests/nemo_text_processing/utils.py | 5 ++-- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt index e5d8b9b22d96..f2c13fb9abf4 100644 --- a/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt +++ b/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt @@ -1,46 +1,46 @@ 
->>>123 +~123 123 one twenty three one two three one hundred twenty three one hundred and twenty three ->>>$123 +~$123 $123 one twenty three dollars one two three dollars one hundred twenty three dollars one hundred and twenty three dollars ->>>$123.2 +~$123.2 $123.2 one twenty three dollars and two cents one two three dollars two cents one hundred twenty three dollars and two cents one hundred and twenty three point two dollars ->>>1.24 +~1.24 1.24 one point two four one two four one point twenty four one twenty four ->>>$1.21 +~$1.21 one dollar and twenty one cents one dollar twenty one cents one point two one dollars one twenty one dollars ->>>£1.00 +~£1.00 one pound one point o o pounds ->>>t-0t25d12-f +~t-0t25d12-f t zero t twenty five d twelve f ->>>133-A +~133-A one hundred thirty three A ->>>B2A23C +~B2A23C B two A twenty three C ->>>25d08A +~25d08A twenty five d zero eight A ->>>C24 +~C24 C twenty four ->>>It seemed to her that the jacket Oswald wore was darker than Commission Exhibit No. 162. +~It seemed to her that the jacket Oswald wore was darker than Commission Exhibit No. 162. It seemed to her that the jacket Oswald wore was darker than Commission Exhibit number one sixty two. It seemed to her that the jacket Oswald wore was darker than Commission Exhibit number one hundred sixty two. It seemed to her that the jacket Oswald wore was darker than Commission Exhibit number one hundred and sixty two. 
diff --git a/tests/nemo_text_processing/utils.py b/tests/nemo_text_processing/utils.py index 8372287f6e18..34bbe935fdf5 100644 --- a/tests/nemo_text_processing/utils.py +++ b/tests/nemo_text_processing/utils.py @@ -44,11 +44,12 @@ def get_test_cases_multiple(file_name: str = 'data_text_normalization/test_cases written = None normalized_options = [] for line in f: - if line.startswith('>>>'): + if line.startswith('~'): if written: test_pairs.append((written, normalized_options)) normalized_options = [] - written = line.strip().replace('>>>', '') + written = line.strip().replace('~', '') else: normalized_options.append(line.strip()) + test_pairs.append((written, normalized_options)) return test_pairs From 01ab6f5c4c569f87ece98d27554dd901a46af379 Mon Sep 17 00:00:00 2001 From: ekmb Date: Tue, 25 May 2021 14:41:04 -0700 Subject: [PATCH 03/17] fix new test Signed-off-by: ekmb --- .../text_normalization/normalize_with_audio.py | 10 ++++++---- .../test_cases_normalize_with_audio.txt | 4 +++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/nemo_text_processing/text_normalization/normalize_with_audio.py b/nemo_text_processing/text_normalization/normalize_with_audio.py index 82f23e144672..4a15d95ce2f0 100644 --- a/nemo_text_processing/text_normalization/normalize_with_audio.py +++ b/nemo_text_processing/text_normalization/normalize_with_audio.py @@ -95,9 +95,10 @@ def normalize_with_audio(self, text: str, verbose: bool = False) -> str: if verbose: print(text) return text - text = pynini.escape(text) - tagged_texts = rewrite.rewrites(text, self.tagger.fst) + text = pynini.escape(text) + # TODO add preprocess? 
+ tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=100) normalized_texts = [] for tagged_text in tagged_texts: @@ -176,10 +177,12 @@ def pre_process(text: str) -> str: Returns: text with spaces around punctuation marks """ + print(text) text = text.replace('--', '-') space_right = '!?:;,.-()*+-/<=>@^_' space_both = '-()*+-/<=>@^_' + # TODO for punct in space_right: text = text.replace(punct, punct + ' ') for punct in space_both: @@ -200,8 +203,7 @@ def post_process(text: str, punctuation='!,.:;?') -> str: Returns: text with normalized spaces and quotes """ text = ( - text.replace('--', '-') - .replace('( ', '(') + text.replace('( ', '(') .replace(' )', ')') .replace(' ', ' ') .replace('”', '"') diff --git a/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt index f2c13fb9abf4..bfaee65065c5 100644 --- a/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt +++ b/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt @@ -44,4 +44,6 @@ C twenty four It seemed to her that the jacket Oswald wore was darker than Commission Exhibit number one sixty two. It seemed to her that the jacket Oswald wore was darker than Commission Exhibit number one hundred sixty two. It seemed to her that the jacket Oswald wore was darker than Commission Exhibit number one hundred and sixty two. -It seemed to her that the jacket Oswald wore was darker than Commission Exhibit number one six two. \ No newline at end of file +It seemed to her that the jacket Oswald wore was darker than Commission Exhibit number one six two. +~"Father, let this cup pass." He prayed--was heard. What cup was it that passed away from him? Sure not the death-cup, now filled to the brim! 
There was no quailing in the awful word; He still was king of kings, of lords the lord:-- He feared lest, in the suffering waste and grim, His faith might grow too faint and sickly dim." +"Father, let this cup pass." He prayed--was heard. What cup was it that passed away from him? Sure not the death-cup, now filled to the brim! There was no quailing in the awful word; He still was king of kings, of lords the lord:-- He feared lest, in the suffering waste and grim, His faith might grow too faint and sickly dim." \ No newline at end of file From a6790f7dfc1a4580ed32ac2d56235c5b42220649 Mon Sep 17 00:00:00 2001 From: ekmb Date: Fri, 28 May 2021 15:48:53 -0700 Subject: [PATCH 04/17] add serial to the default normalizer, add tests Signed-off-by: ekmb --- .../data/whitelist_alternatives.tsv | 21 ++++++----- .../normalize_with_audio.py | 14 ++++--- .../text_normalization/taggers/cardinal.py | 7 +++- .../text_normalization/taggers/measure.py | 22 ++++++++++- .../text_normalization/taggers/serial.py | 24 +++++++----- .../taggers/tokenize_and_classify.py | 10 +++-- .../text_normalization/taggers/whitelist.py | 13 +++++-- .../text_normalization/verbalizers/measure.py | 4 +- .../text_normalization/verbalizers/serial.py | 8 ++-- .../verbalizers/verbalize.py | 8 ++-- .../test_cases_cardinal.txt | 1 + .../test_cases_decimal.txt | 1 + .../test_cases_normalize_with_audio.txt | 10 ++++- .../test_cases_serial.txt | 6 +++ tests/nemo_text_processing/test_boundary.py | 4 ++ tests/nemo_text_processing/test_cardinal.py | 4 ++ tests/nemo_text_processing/test_date.py | 6 +++ tests/nemo_text_processing/test_decimal.py | 19 ++++++++++ tests/nemo_text_processing/test_electronic.py | 6 ++- tests/nemo_text_processing/test_measure.py | 4 ++ tests/nemo_text_processing/test_money.py | 4 ++ .../test_normalization_with_audio.py | 2 +- tests/nemo_text_processing/test_ordinal.py | 4 ++ tests/nemo_text_processing/test_serial.py | 37 +++++++++++++++++++ tests/nemo_text_processing/test_telephone.py | 4 ++ 
tests/nemo_text_processing/test_time.py | 4 ++ tests/nemo_text_processing/test_whitelist.py | 6 +++ tests/nemo_text_processing/test_word.py | 4 ++ tests/nemo_text_processing/utils.py | 1 - 29 files changed, 211 insertions(+), 47 deletions(-) create mode 100644 tests/nemo_text_processing/data_text_normalization/test_cases_serial.txt create mode 100644 tests/nemo_text_processing/test_serial.py diff --git a/nemo_text_processing/text_normalization/data/whitelist_alternatives.tsv b/nemo_text_processing/text_normalization/data/whitelist_alternatives.tsv index 6b9e93d66d4f..32f896aa8d95 100644 --- a/nemo_text_processing/text_normalization/data/whitelist_alternatives.tsv +++ b/nemo_text_processing/text_normalization/data/whitelist_alternatives.tsv @@ -1,10 +1,11 @@ -Hon. honorable -Hon. honourable -St. street -St street -St. saint -St saint -Dr. drive -Dr. doctor -Mr mister -Mrs misses \ No newline at end of file +Hon. Honorable +Hon. Honourable +St. Street +St Street +St. Saint +St Saint +Dr. Drive +Dr. Doctor +Mr. Mister +Mrs. Misses +Ms. Miss \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/normalize_with_audio.py b/nemo_text_processing/text_normalization/normalize_with_audio.py index 4a15d95ce2f0..f3ae0e48f04e 100644 --- a/nemo_text_processing/text_normalization/normalize_with_audio.py +++ b/nemo_text_processing/text_normalization/normalize_with_audio.py @@ -78,7 +78,7 @@ def __init__(self, input_case: str): self.tagger = ClassifyFst(input_case=input_case, deterministic=False) self.verbalizer = VerbalizeFinalFst(deterministic=False) - def normalize_with_audio(self, text: str, verbose: bool = False) -> str: + def normalize(self, text: str, verbose: bool = False) -> str: """ Main function. Normalizes tokens from written to spoken form e.g. 
12 kg -> twelve kilograms @@ -102,10 +102,12 @@ def normalize_with_audio(self, text: str, verbose: bool = False) -> str: normalized_texts = [] for tagged_text in tagged_texts: + print(tagged_text) self.parser(tagged_text) tokens = self.parser.parse() tags_reordered = self.generate_permutations(tokens) for tagged_text_reordered in tags_reordered: + print(tagged_text_reordered) tagged_text_reordered = pynini.escape(tagged_text_reordered) try: verbalized = rewrite.rewrites(tagged_text_reordered, self.verbalizer.fst) @@ -115,7 +117,7 @@ def normalize_with_audio(self, text: str, verbose: bool = False) -> str: if len(normalized_texts) == 0: raise ValueError() - normalized_texts = [post_process(t) for t in normalized_texts] + normalized_texts = [post_process(t) for t in normalized_texts] + normalized_texts normalized_texts = set(normalized_texts) return normalized_texts @@ -158,7 +160,7 @@ def calculate_cer(normalized_texts: List[str], transcript: str, remove_punct=Fal """ normalized_options = [] for text in normalized_texts: - text_clean = text.replace('-', ' ').lower().strip() + text_clean = text.replace('-', ' ').lower() if remove_punct: for punct in "!?:;,.-()*+-/<=>@^_": text_clean = text_clean.replace(punct, " ") @@ -260,7 +262,7 @@ def parse_args(): def normalize_manifest(args): """ Args: - manifest: path to .json manifest file. + args.audio_data: path to .json manifest file. 
""" normalizer = NormalizerWithAudio(input_case=args.input_case) manifest_out = args.audio_data.replace('.json', '_normalized.json') @@ -276,7 +278,7 @@ def normalize_manifest(args): if asr_model is None: asr_model = get_asr_model(args.model) transcript = asr_model.transcribe([audio])[0] - normalized_texts = normalizer.normalize_with_audio(line['text'], args.verbose) + normalized_texts = normalizer.normalize(line['text'], args.verbose) normalized_text, cer = normalizer.select_best_match( normalized_texts, transcript, args.verbose, args.remove_punct ) @@ -295,7 +297,7 @@ def normalize_manifest(args): if os.path.exists(args.text): with open(args.text, 'r') as f: args.text = f.read().strip() - normalized_texts = normalizer.normalize_with_audio(args.text, args.verbose) + normalized_texts = normalizer.normalize(args.text, args.verbose) if args.audio_data: asr_model = get_asr_model(args.model) transcript = asr_model.transcribe([args.audio_data])[0] diff --git a/nemo_text_processing/text_normalization/taggers/cardinal.py b/nemo_text_processing/text_normalization/taggers/cardinal.py index 7f4b6fe68714..24f8bf1ac95a 100644 --- a/nemo_text_processing/text_normalization/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/taggers/cardinal.py @@ -71,7 +71,12 @@ def __init__(self, deterministic: bool = True): ) range_graph = ( self.graph - + (pynini.cross("-", " to ") | pynini.cross("x", " by ") | pynini.cross(" x ", " by ")) + + ( + pynini.cross("-", " to ") + | pynini.cross("-", " ") + | pynini.cross("x", " by ") + | pynini.cross(" x ", " by ") + ) + self.graph ) self.graph = ( diff --git a/nemo_text_processing/text_normalization/taggers/measure.py b/nemo_text_processing/text_normalization/taggers/measure.py index 1c4f87566c8f..d9fbf26552e4 100644 --- a/nemo_text_processing/text_normalization/taggers/measure.py +++ b/nemo_text_processing/text_normalization/taggers/measure.py @@ -16,6 +16,7 @@ from nemo_text_processing.text_normalization.data_loader_utils import 
get_abs_path from nemo_text_processing.text_normalization.graph_utils import ( NEMO_NON_BREAKING_SPACE, + NEMO_NOT_SPACE, NEMO_SIGMA, SINGULAR_TO_PLURAL, GraphFst, @@ -101,6 +102,25 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = + pynutil.insert(" } ") + unit_singular ) - final_graph = subgraph_decimal | subgraph_cardinal + + subgraph_cardinal_dash = ( + pynutil.insert("cardinal { integer: \"") + + cardinal.single_digits_graph + + pynini.cross('-', '') + + pynutil.insert("\" } units: \"") + + pynini.closure(NEMO_NOT_SPACE, 1) + + pynutil.insert("\"") + ) + + subgraph_decimal_dash = ( + pynutil.insert("decimal { ") + + decimal.final_graph_wo_negative + + pynini.cross('-', '') + + pynutil.insert(" } units: \"") + + pynini.closure(NEMO_NOT_SPACE, 1) + + pynutil.insert("\"") + ) + + final_graph = subgraph_decimal | subgraph_cardinal | subgraph_cardinal_dash | subgraph_decimal_dash final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/taggers/serial.py b/nemo_text_processing/text_normalization/taggers/serial.py index 1447a88ccf28..39e3a1117504 100644 --- a/nemo_text_processing/text_normalization/taggers/serial.py +++ b/nemo_text_processing/text_normalization/taggers/serial.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from nemo_text_processing.text_normalization.graph_utils import NEMO_ALPHA, GraphFst, delete_space +from nemo_text_processing.text_normalization.graph_utils import NEMO_ALPHA, NEMO_NOT_SPACE, GraphFst, delete_space try: import pynini @@ -40,18 +40,22 @@ class SerialFst(GraphFst): for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self, cardinal: GraphFst, deterministic: bool = False): + def __init__(self, cardinal: GraphFst, deterministic: bool = True): super().__init__(name="serial", kind="classify", deterministic=deterministic) - num_graph = cardinal.graph + if deterministic: + num_graph = cardinal.single_digits_graph + else: + num_graph = cardinal.graph serial_graph_cardinal_start = ( pynini.closure((NEMO_ALPHA + pynutil.insert(" ")) | (NEMO_ALPHA + pynini.cross('-', ' ')), 1) + num_graph ) serial_end = pynini.closure(pynutil.insert(" ") + NEMO_ALPHA + pynini.closure(pynutil.insert(" ") + num_graph)) serial_graph_cardinal_end = num_graph + ( - (pynutil.insert(" ") + NEMO_ALPHA) | (pynini.cross('-', ' ') + NEMO_ALPHA) + pynini.closure(pynutil.insert(" ") + NEMO_ALPHA) | (pynini.cross('-', ' ') + NEMO_ALPHA) ) + serial_end2 = pynini.closure( pynutil.insert(" ") + num_graph @@ -62,12 +66,12 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = False): serial_end | serial_end2 ) - graph = ( - pynutil.insert("cardinal { integer: \"") - + serial_graph - + delete_space - + pynutil.insert("\" } units: \"serial\"") - ) + graph = pynutil.insert("cardinal { integer: \"") + serial_graph + + if not deterministic: + graph += pynini.closure(pynini.accep("s")) + + graph += pynutil.insert("\" } units: \"serial\"") graph = self.add_tokens(graph) self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/taggers/tokenize_and_classify.py index 1ee63e55770a..9b3a2b9a5434 100644 --- 
a/nemo_text_processing/text_normalization/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/taggers/tokenize_and_classify.py @@ -71,6 +71,7 @@ def __init__(self, input_case: str, deterministic: bool = True): money_graph = MoneyFst(cardinal=cardinal, decimal=decimal, deterministic=deterministic).fst whitelist_graph = WhiteListFst(input_case=input_case, deterministic=deterministic).fst punct_graph = PunctuationFst(deterministic=deterministic).fst + serial_graph = SerialFst(cardinal=cardinal, deterministic=deterministic).fst classify = ( pynutil.add_weight(whitelist_graph, 1.01) @@ -83,13 +84,14 @@ def __init__(self, input_case: str, deterministic: bool = True): | pynutil.add_weight(money_graph, 1.1) | pynutil.add_weight(telephone_graph, 1.1) | pynutil.add_weight(electonic_graph, 1.1) + | pynutil.add_weight(serial_graph, 1.2) | pynutil.add_weight(word_graph, 100) ) - if not deterministic: - serial_graph = SerialFst(cardinal, deterministic=deterministic).fst - classify |= pynutil.add_weight(serial_graph, 1.1) - classify = classify.optimize() + # if not deterministic: + # serial_graph = SerialFst(cardinal=cardinal, deterministic=deterministic).fst + # classify |= pynutil.add_weight(serial_graph, 1.1) + # classify = classify.optimize() punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }") token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") diff --git a/nemo_text_processing/text_normalization/taggers/whitelist.py b/nemo_text_processing/text_normalization/taggers/whitelist.py index 59c0725f32a5..50c883f80358 100644 --- a/nemo_text_processing/text_normalization/taggers/whitelist.py +++ b/nemo_text_processing/text_normalization/taggers/whitelist.py @@ -44,7 +44,7 @@ class WhiteListFst(GraphFst): def __init__(self, input_case: str, deterministic: bool = True): super().__init__(name="whitelist", kind="classify") - def _get_whitelist_graph(file="data/whitelist.tsv"): + def 
_get_whitelist_graph(input_case, file="data/whitelist.tsv"): whitelist = load_labels(get_abs_path(file)) if input_case == "lower_cased": whitelist = [(x.lower(), y) for x, y in whitelist] @@ -53,9 +53,16 @@ def _get_whitelist_graph(file="data/whitelist.tsv"): graph = pynini.string_map(whitelist) return graph - graph = _get_whitelist_graph() + def _get_whitelist_non_deterministic_graph(file="data/whitelist_alternatives.tsv"): + whitelist = load_labels(get_abs_path(file)) + whitelist_lower = [(x.lower(), y.lower()) for x, y in whitelist] + whitelist_cased = [(x, y) for x, y in whitelist] + graph = pynini.string_map(whitelist_lower + whitelist_cased) + return graph + + graph = _get_whitelist_graph(input_case) if not deterministic: - graph |= _get_whitelist_graph("data/whitelist_alternatives.tsv") + graph |= _get_whitelist_graph("lower_cased") | _get_whitelist_non_deterministic_graph() graph = pynutil.insert("name: \"") + convert_space(graph) + pynutil.insert("\"") self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/verbalizers/measure.py b/nemo_text_processing/text_normalization/verbalizers/measure.py index 78cf0eb9ddde..dbbc73c79445 100644 --- a/nemo_text_processing/text_normalization/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/verbalizers/measure.py @@ -43,7 +43,7 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst, deterministic: bool = unit = pynutil.insert(" ") + pynini.closure(NEMO_CHAR - " ", 1) unit = pynutil.delete("units: \"") + unit + pynutil.delete("\"") + delete_space - graph_decimal = ( + self.graph_decimal = ( pynutil.delete("decimal {") + delete_space + optional_sign @@ -61,6 +61,6 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst, deterministic: bool = + delete_space + pynutil.delete("}") ) - graph = (self.graph_cardinal | graph_decimal) + delete_space + unit + graph = (self.graph_cardinal | self.graph_decimal) + delete_space + unit delete_tokens = self.delete_tokens(graph) 
self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/verbalizers/serial.py b/nemo_text_processing/text_normalization/verbalizers/serial.py index 66361a8b2342..cf328035bba9 100644 --- a/nemo_text_processing/text_normalization/verbalizers/serial.py +++ b/nemo_text_processing/text_normalization/verbalizers/serial.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo_text_processing.text_normalization.graph_utils import GraphFst, delete_space +from nemo_text_processing.text_normalization.graph_utils import NEMO_NOT_SPACE, GraphFst, delete_space try: import pynini @@ -38,7 +38,9 @@ class SerialFst(GraphFst): def __init__(self, measure: GraphFst, deterministic: bool = False): super().__init__(name="serial", kind="verbalize", deterministic=deterministic) - serial = pynutil.delete("units: \"") + pynini.cross("serial", "") + pynutil.delete("\"") + delete_space - graph = measure.graph_cardinal + delete_space + serial + serial = ( + pynini.cross("units: \"serial", "") + pynini.closure(NEMO_NOT_SPACE) + pynutil.delete("\"") + delete_space + ) + graph = measure.graph_cardinal + pynini.closure(delete_space) + serial delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/verbalizers/verbalize.py index 92ff74cabcd1..651e0b9f7cb1 100644 --- a/nemo_text_processing/text_normalization/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/verbalizers/verbalize.py @@ -54,6 +54,7 @@ def __init__(self, deterministic: bool = True): date_graph = DateFst(ordinal=ordinal, deterministic=deterministic).fst money_graph = MoneyFst(decimal=decimal, deterministic=deterministic).fst whitelist_graph = WhiteListFst(deterministic=deterministic).fst + serial_graph = SerialFst(measure=measure, deterministic=deterministic).fst graph 
= ( time_graph | date_graph @@ -64,10 +65,11 @@ def __init__(self, deterministic: bool = True): | cardinal_graph | telephone_graph | electronic_graph + | serial_graph | whitelist_graph ) - if not deterministic: - serial_graph = SerialFst(measure, deterministic=deterministic).fst - graph |= serial_graph + # if not deterministic: + # serial_graph = SerialFst(measure=measure, deterministic=deterministic).fst + # graph |= serial_graph self.fst = graph diff --git a/tests/nemo_text_processing/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_cardinal.txt index 5b9100f6e949..1495a7c046b3 100644 --- a/tests/nemo_text_processing/data_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/data_text_normalization/test_cases_cardinal.txt @@ -7,3 +7,4 @@ 13,000~thirteen thousand 123,123,000~one hundred twenty three million one hundred twenty three thousand 123,000,012~one hundred twenty three million twelve +a 4-kilogram bag~a four kilogram bag diff --git a/tests/nemo_text_processing/data_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_decimal.txt index 1b839abc2052..d9634ffc815c 100644 --- a/tests/nemo_text_processing/data_text_normalization/test_cases_decimal.txt +++ b/tests/nemo_text_processing/data_text_normalization/test_cases_decimal.txt @@ -6,3 +6,4 @@ 0.1 billion~zero point one billion .1 trillion~point one trillion -0.1~minus zero point one +7.2-millimeter bullet~seven point two millimeter bullet diff --git a/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt index bfaee65065c5..7dadcf864fb8 100644 --- a/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt +++ b/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt @@ -32,6 +32,7 @@ one pound 
one point o o pounds ~t-0t25d12-f t zero t twenty five d twelve f +t zero t two five d one two f ~133-A one hundred thirty three A ~B2A23C @@ -46,4 +47,11 @@ It seemed to her that the jacket Oswald wore was darker than Commission Exhibit It seemed to her that the jacket Oswald wore was darker than Commission Exhibit number one hundred and sixty two. It seemed to her that the jacket Oswald wore was darker than Commission Exhibit number one six two. ~"Father, let this cup pass." He prayed--was heard. What cup was it that passed away from him? Sure not the death-cup, now filled to the brim! There was no quailing in the awful word; He still was king of kings, of lords the lord:-- He feared lest, in the suffering waste and grim, His faith might grow too faint and sickly dim." -"Father, let this cup pass." He prayed--was heard. What cup was it that passed away from him? Sure not the death-cup, now filled to the brim! There was no quailing in the awful word; He still was king of kings, of lords the lord:-- He feared lest, in the suffering waste and grim, His faith might grow too faint and sickly dim." \ No newline at end of file +"Father, let this cup pass." He prayed--was heard. What cup was it that passed away from him? Sure not the death-cup, now filled to the brim! There was no quailing in the awful word; He still was king of kings, of lords the lord:-- He feared lest, in the suffering waste and grim, His faith might grow too faint and sickly dim." +~The box was 5 x 7 m. +The box was five by seven meters. 
+~25-30 +twenty five to thirty +twenty five thirty +~W2s +W twos \ No newline at end of file diff --git a/tests/nemo_text_processing/data_text_normalization/test_cases_serial.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_serial.txt new file mode 100644 index 000000000000..6df8c17d5af2 --- /dev/null +++ b/tests/nemo_text_processing/data_text_normalization/test_cases_serial.txt @@ -0,0 +1,6 @@ +t-0t25d12-f~t oh t two five d one two f +133-A~one three three A +B2A23C~B two A two three C +25d08A~two five d oh eight A +C24~C two four +W2s~W two s diff --git a/tests/nemo_text_processing/test_boundary.py b/tests/nemo_text_processing/test_boundary.py index cad8b2715854..547a7589bf6d 100644 --- a/tests/nemo_text_processing/test_boundary.py +++ b/tests/nemo_text_processing/test_boundary.py @@ -14,6 +14,7 @@ import pytest from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio from parameterized import parameterized from utils import PYNINI_AVAILABLE, parse_test_case_file @@ -21,6 +22,7 @@ class TestBoundary: normalizer = Normalizer(input_case='lower_cased') if PYNINI_AVAILABLE else None + normalizer_with_audio = NormalizerWithAudio(input_case='cased') if PYNINI_AVAILABLE else None @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_boundary.txt')) @pytest.mark.skipif( @@ -31,3 +33,5 @@ class TestBoundary: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input) + assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_cardinal.py b/tests/nemo_text_processing/test_cardinal.py index a8baa5859dd7..3e4c3f6fdf33 100644 --- a/tests/nemo_text_processing/test_cardinal.py +++ b/tests/nemo_text_processing/test_cardinal.py @@ -15,6 +15,7 @@ import pytest 
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio from parameterized import parameterized from utils import PYNINI_AVAILABLE, parse_test_case_file @@ -33,6 +34,7 @@ def test_denorm(self, test_input, expected): assert pred == expected normalizer = Normalizer(input_case='lower_cased') if PYNINI_AVAILABLE else None + normalizer_with_audio = NormalizerWithAudio(input_case='cased') if PYNINI_AVAILABLE else None @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_cardinal.txt')) @pytest.mark.skipif( @@ -43,3 +45,5 @@ def test_denorm(self, test_input, expected): def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input) + assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_date.py b/tests/nemo_text_processing/test_date.py index 4d796152447b..54b9296b0334 100644 --- a/tests/nemo_text_processing/test_date.py +++ b/tests/nemo_text_processing/test_date.py @@ -15,6 +15,7 @@ import pytest from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio from parameterized import parameterized from utils import PYNINI_AVAILABLE, parse_test_case_file @@ -33,6 +34,7 @@ def test_denorm(self, test_input, expected): assert pred == expected normalizer = Normalizer(input_case='lower_cased') if PYNINI_AVAILABLE else None + normalizer_with_audio = NormalizerWithAudio(input_case='cased') if PYNINI_AVAILABLE else None 
@parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_date.txt')) @pytest.mark.skipif( @@ -43,6 +45,8 @@ def test_denorm(self, test_input, expected): def test_norm_uncased(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, verbose=False) + assert expected in pred_non_deterministic normalizer_uppercased = Normalizer(input_case='cased') if PYNINI_AVAILABLE else None cases_uppercased = {"Aug. 8": "august eighth", "8 Aug.": "the eighth of august", "aug. 8": "august eighth"} @@ -56,3 +60,5 @@ def test_norm_uncased(self, test_input, expected): def test_norm_cased(self, test_input, expected): pred = self.normalizer_uppercased.normalize(test_input, verbose=False) assert pred == expected + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input) + assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_decimal.py b/tests/nemo_text_processing/test_decimal.py index 778476dfd790..59fece29438c 100644 --- a/tests/nemo_text_processing/test_decimal.py +++ b/tests/nemo_text_processing/test_decimal.py @@ -15,6 +15,7 @@ import pytest from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio from parameterized import parameterized from utils import PYNINI_AVAILABLE, parse_test_case_file @@ -33,6 +34,7 @@ def test_denorm(self, test_input, expected): assert pred == expected normalizer = Normalizer(input_case="lower_cased") if PYNINI_AVAILABLE else None + normalizer_with_audio = NormalizerWithAudio(input_case='cased') if PYNINI_AVAILABLE else None @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_decimal.txt')) @pytest.mark.skipif( @@ -43,3 +45,20 @@ 
def test_denorm(self, test_input, expected): def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, verbose=False) + assert expected in pred_non_deterministic + + +if __name__ == '__main__': + test_cases = parse_test_case_file('data_text_normalization/test_cases_decimal.txt') + normalizer_with_audio = NormalizerWithAudio(input_case='cased') if PYNINI_AVAILABLE else None + + for test_input, expected in test_cases: + import pdb + + pdb.set_trace() + pred_non_deterministic = normalizer_with_audio.normalize(test_input) + print('input:', test_input) + for p in pred_non_deterministic: + print(p) + print('=' * 30) diff --git a/tests/nemo_text_processing/test_electronic.py b/tests/nemo_text_processing/test_electronic.py index 190dd379993d..a4238be40dec 100644 --- a/tests/nemo_text_processing/test_electronic.py +++ b/tests/nemo_text_processing/test_electronic.py @@ -15,6 +15,7 @@ import pytest from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio from parameterized import parameterized from utils import PYNINI_AVAILABLE, parse_test_case_file @@ -33,6 +34,7 @@ def test_denorm(self, test_input, expected): assert pred == expected normalizer = Normalizer(input_case="lower_cased") if PYNINI_AVAILABLE else None + normalizer_with_audio = NormalizerWithAudio(input_case='cased') if PYNINI_AVAILABLE else None @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_electronic.txt')) @pytest.mark.skipif( @@ -42,4 +44,6 @@ def test_denorm(self, test_input, expected): @pytest.mark.unit def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) - assert pred == expected, 
f'|{pred}| {len(pred)}- |{expected}| {len(expected)}' + assert pred == expected + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, verbose=False) + assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_measure.py b/tests/nemo_text_processing/test_measure.py index 5bd191bf5d60..ec61ec4112a6 100644 --- a/tests/nemo_text_processing/test_measure.py +++ b/tests/nemo_text_processing/test_measure.py @@ -16,6 +16,7 @@ import pytest from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio from parameterized import parameterized from utils import PYNINI_AVAILABLE, parse_test_case_file @@ -34,6 +35,7 @@ def test_denorm(self, test_input, expected): assert pred == expected normalizer = Normalizer(input_case="lower_cased") if PYNINI_AVAILABLE else None + normalizer_with_audio = NormalizerWithAudio(input_case='cased') if PYNINI_AVAILABLE else None @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_measure.txt')) @pytest.mark.skipif( @@ -44,3 +46,5 @@ def test_denorm(self, test_input, expected): def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input) + assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_money.py b/tests/nemo_text_processing/test_money.py index 86ed5659be0f..04ce35ec0b8a 100644 --- a/tests/nemo_text_processing/test_money.py +++ b/tests/nemo_text_processing/test_money.py @@ -16,6 +16,7 @@ import pytest from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer +from 
nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio from parameterized import parameterized from utils import PYNINI_AVAILABLE, parse_test_case_file @@ -34,6 +35,7 @@ def test_denorm(self, test_input, expected): assert pred == expected normalizer = Normalizer(input_case='lower_cased') if PYNINI_AVAILABLE else None + normalizer_with_audio = NormalizerWithAudio(input_case='cased') if PYNINI_AVAILABLE else None @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_money.txt')) @pytest.mark.skipif( @@ -44,3 +46,5 @@ def test_denorm(self, test_input, expected): def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input) + assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_normalization_with_audio.py b/tests/nemo_text_processing/test_normalization_with_audio.py index d30d4837f53d..290267471a21 100644 --- a/tests/nemo_text_processing/test_normalization_with_audio.py +++ b/tests/nemo_text_processing/test_normalization_with_audio.py @@ -29,5 +29,5 @@ class TestNormalizeWithAudio: @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_norm(self, test_input, expected): - pred = self.normalizer.normalize_with_audio(test_input) + pred = self.normalizer.normalize(test_input) assert len(set(pred).intersection(set(expected))) == len(expected) diff --git a/tests/nemo_text_processing/test_ordinal.py b/tests/nemo_text_processing/test_ordinal.py index 5cb766d3b185..4cb47b2a40cf 100644 --- a/tests/nemo_text_processing/test_ordinal.py +++ b/tests/nemo_text_processing/test_ordinal.py @@ -16,6 +16,7 @@ import pytest from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio 
import NormalizerWithAudio from parameterized import parameterized from utils import PYNINI_AVAILABLE, parse_test_case_file @@ -34,6 +35,7 @@ def test_denorm(self, test_input, expected): assert pred == expected normalizer = Normalizer(input_case='lower_cased') if PYNINI_AVAILABLE else None + normalizer_with_audio = NormalizerWithAudio(input_case='cased') if PYNINI_AVAILABLE else None @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_ordinal.txt')) @pytest.mark.skipif( @@ -44,3 +46,5 @@ def test_denorm(self, test_input, expected): def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input) + assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_serial.py b/tests/nemo_text_processing/test_serial.py new file mode 100644 index 000000000000..0b858fd663eb --- /dev/null +++ b/tests/nemo_text_processing/test_serial.py @@ -0,0 +1,37 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pytest +from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio +from parameterized import parameterized +from utils import PYNINI_AVAILABLE, parse_test_case_file + + +class TestSerial: + normalizer = Normalizer(input_case='cased') if PYNINI_AVAILABLE else None + normalizer_with_audio = NormalizerWithAudio(input_case='cased') if PYNINI_AVAILABLE else None + + @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_serial.txt')) + @pytest.mark.skipif( + not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh" + ) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred == expected + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input) + assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_telephone.py b/tests/nemo_text_processing/test_telephone.py index a59414c181bb..0c13f72605c5 100644 --- a/tests/nemo_text_processing/test_telephone.py +++ b/tests/nemo_text_processing/test_telephone.py @@ -16,6 +16,7 @@ import pytest from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio from parameterized import parameterized from utils import PYNINI_AVAILABLE, parse_test_case_file @@ -34,6 +35,7 @@ def test_denorm(self, test_input, expected): assert pred == expected normalizer = Normalizer(input_case='lower_cased') if PYNINI_AVAILABLE else None + normalizer_with_audio = NormalizerWithAudio(input_case='cased') if PYNINI_AVAILABLE else None 
@parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_telephone.txt')) @pytest.mark.skipif( @@ -44,3 +46,5 @@ def test_denorm(self, test_input, expected): def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input) + assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_time.py b/tests/nemo_text_processing/test_time.py index 6a5b9e342f93..6add727d936c 100644 --- a/tests/nemo_text_processing/test_time.py +++ b/tests/nemo_text_processing/test_time.py @@ -15,6 +15,7 @@ import pytest from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio from parameterized import parameterized from utils import PYNINI_AVAILABLE, parse_test_case_file @@ -33,6 +34,7 @@ def test_denorm(self, test_input, expected): assert pred == expected normalizer = Normalizer(input_case='lower_cased') if PYNINI_AVAILABLE else None + normalizer_with_audio = NormalizerWithAudio(input_case='cased') if PYNINI_AVAILABLE else None @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_time.txt')) @pytest.mark.skipif( @@ -43,3 +45,5 @@ def test_denorm(self, test_input, expected): def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input) + assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_whitelist.py b/tests/nemo_text_processing/test_whitelist.py index 4146d91dcc31..0dfc67266f66 100644 --- a/tests/nemo_text_processing/test_whitelist.py +++ b/tests/nemo_text_processing/test_whitelist.py @@ -16,6 +16,7 @@ 
import pytest from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio from parameterized import parameterized from utils import PYNINI_AVAILABLE, parse_test_case_file @@ -34,6 +35,7 @@ def test_denorm(self, test_input, expected): assert pred == expected normalizer = Normalizer(input_case='lower_cased') if PYNINI_AVAILABLE else None + normalizer_with_audio = NormalizerWithAudio(input_case='cased') if PYNINI_AVAILABLE else None @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_whitelist.txt')) @pytest.mark.skipif( @@ -44,6 +46,8 @@ def test_denorm(self, test_input, expected): def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, verbose=False) + assert expected in pred_non_deterministic normalizer_uppercased = Normalizer(input_case='cased') if PYNINI_AVAILABLE else None cases_uppercased = {"Dr. Evil": "doctor Evil", "No. 4": "number four", "dr. Evil": "dr. Evil", "no. 4": "no. 
four"} @@ -57,3 +61,5 @@ def test_norm(self, test_input, expected): def test_norm_cased(self, test_input, expected): pred = self.normalizer_uppercased.normalize(test_input, verbose=False) assert pred == expected + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input) + assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_word.py b/tests/nemo_text_processing/test_word.py index 9d1eb9849afc..eea3c7802177 100644 --- a/tests/nemo_text_processing/test_word.py +++ b/tests/nemo_text_processing/test_word.py @@ -16,6 +16,7 @@ import pytest from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio from parameterized import parameterized from utils import PYNINI_AVAILABLE, parse_test_case_file @@ -34,6 +35,7 @@ def test_denorm(self, test_input, expected): assert pred == expected normalizer = Normalizer(input_case='lower_cased') if PYNINI_AVAILABLE else None + normalizer_with_audio = NormalizerWithAudio(input_case='cased') if PYNINI_AVAILABLE else None @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_word.txt')) @pytest.mark.skipif( @@ -44,3 +46,5 @@ def test_denorm(self, test_input, expected): def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input) + assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/utils.py b/tests/nemo_text_processing/utils.py index 34bbe935fdf5..d2bb3554ef65 100644 --- a/tests/nemo_text_processing/utils.py +++ b/tests/nemo_text_processing/utils.py @@ -31,7 +31,6 @@ def parse_test_case_file(file_name: str): for line in f: spoken, written = line.split('~') test_pairs.append((spoken, 
written.strip("\n"))) - print(test_pairs) return test_pairs From 1b4b9930025ddca78b987882bfb7e936a8bf75bb Mon Sep 17 00:00:00 2001 From: ekmb Date: Fri, 28 May 2021 16:15:58 -0700 Subject: [PATCH 05/17] manifest test added Signed-off-by: ekmb --- Jenkinsfile | 8 ++++++++ .../data/whitelist_alternatives.tsv | 5 ++++- .../text_normalization/normalize_with_audio.py | 4 ---- .../taggers/tokenize_and_classify.py | 5 ----- .../text_normalization/verbalizers/measure.py | 4 ++-- .../text_normalization/verbalizers/verbalize.py | 3 --- tests/nemo_text_processing/test_decimal.py | 15 --------------- 7 files changed, 14 insertions(+), 30 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 3daec034db46..8cdf884d4b9b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -185,6 +185,14 @@ pipeline { sh 'rm -rf /home/TestData/nlp/text_norm/audio_based/output/out_file.txt' } } + stage('L2: TN with Audio (manifest)') { + steps { + sh 'cd nemo_text_processing/text_normalization && \ + python normalize_with_audio.py --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json && \ + cmp --silent /home/TestData/nlp/text_norm/audio_based/manifest_normalized.json /home/TestData/nlp/text_norm/audio_based/manifest_result.json || exit 1' + sh 'rm -rf /home/TestData/nlp/text_norm/audio_based/manifest_normalized.json' + } + } } } diff --git a/nemo_text_processing/text_normalization/data/whitelist_alternatives.tsv b/nemo_text_processing/text_normalization/data/whitelist_alternatives.tsv index 32f896aa8d95..d3c878e34b43 100644 --- a/nemo_text_processing/text_normalization/data/whitelist_alternatives.tsv +++ b/nemo_text_processing/text_normalization/data/whitelist_alternatives.tsv @@ -8,4 +8,7 @@ Dr. Drive Dr. Doctor Mr. Mister Mrs. Misses -Ms. Miss \ No newline at end of file +Ms. 
Miss +Mr Mister +Mrs Misses +Ms Miss \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/normalize_with_audio.py b/nemo_text_processing/text_normalization/normalize_with_audio.py index f3ae0e48f04e..2bab55d04402 100644 --- a/nemo_text_processing/text_normalization/normalize_with_audio.py +++ b/nemo_text_processing/text_normalization/normalize_with_audio.py @@ -102,12 +102,10 @@ def normalize(self, text: str, verbose: bool = False) -> str: normalized_texts = [] for tagged_text in tagged_texts: - print(tagged_text) self.parser(tagged_text) tokens = self.parser.parse() tags_reordered = self.generate_permutations(tokens) for tagged_text_reordered in tags_reordered: - print(tagged_text_reordered) tagged_text_reordered = pynini.escape(tagged_text_reordered) try: verbalized = rewrite.rewrites(tagged_text_reordered, self.verbalizer.fst) @@ -179,12 +177,10 @@ def pre_process(text: str) -> str: Returns: text with spaces around punctuation marks """ - print(text) text = text.replace('--', '-') space_right = '!?:;,.-()*+-/<=>@^_' space_both = '-()*+-/<=>@^_' - # TODO for punct in space_right: text = text.replace(punct, punct + ' ') for punct in space_both: diff --git a/nemo_text_processing/text_normalization/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/taggers/tokenize_and_classify.py index 9b3a2b9a5434..55b9c7ad60f4 100644 --- a/nemo_text_processing/text_normalization/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/taggers/tokenize_and_classify.py @@ -88,11 +88,6 @@ def __init__(self, input_case: str, deterministic: bool = True): | pynutil.add_weight(word_graph, 100) ) - # if not deterministic: - # serial_graph = SerialFst(cardinal=cardinal, deterministic=deterministic).fst - # classify |= pynutil.add_weight(serial_graph, 1.1) - # classify = classify.optimize() - punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }") token = 
pynutil.insert("tokens { ") + classify + pynutil.insert(" }") token_plus_punct = ( diff --git a/nemo_text_processing/text_normalization/verbalizers/measure.py b/nemo_text_processing/text_normalization/verbalizers/measure.py index dbbc73c79445..78cf0eb9ddde 100644 --- a/nemo_text_processing/text_normalization/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/verbalizers/measure.py @@ -43,7 +43,7 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst, deterministic: bool = unit = pynutil.insert(" ") + pynini.closure(NEMO_CHAR - " ", 1) unit = pynutil.delete("units: \"") + unit + pynutil.delete("\"") + delete_space - self.graph_decimal = ( + graph_decimal = ( pynutil.delete("decimal {") + delete_space + optional_sign @@ -61,6 +61,6 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst, deterministic: bool = + delete_space + pynutil.delete("}") ) - graph = (self.graph_cardinal | self.graph_decimal) + delete_space + unit + graph = (self.graph_cardinal | graph_decimal) + delete_space + unit delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/verbalizers/verbalize.py index 651e0b9f7cb1..9081bc47239d 100644 --- a/nemo_text_processing/text_normalization/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/verbalizers/verbalize.py @@ -69,7 +69,4 @@ def __init__(self, deterministic: bool = True): | whitelist_graph ) - # if not deterministic: - # serial_graph = SerialFst(measure=measure, deterministic=deterministic).fst - # graph |= serial_graph self.fst = graph diff --git a/tests/nemo_text_processing/test_decimal.py b/tests/nemo_text_processing/test_decimal.py index 59fece29438c..a4d2f488e46b 100644 --- a/tests/nemo_text_processing/test_decimal.py +++ b/tests/nemo_text_processing/test_decimal.py @@ -47,18 +47,3 @@ def test_norm(self, test_input, expected): assert pred == expected 
pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, verbose=False) assert expected in pred_non_deterministic - - -if __name__ == '__main__': - test_cases = parse_test_case_file('data_text_normalization/test_cases_decimal.txt') - normalizer_with_audio = NormalizerWithAudio(input_case='cased') if PYNINI_AVAILABLE else None - - for test_input, expected in test_cases: - import pdb - - pdb.set_trace() - pred_non_deterministic = normalizer_with_audio.normalize(test_input) - print('input:', test_input) - for p in pred_non_deterministic: - print(p) - print('=' * 30) From 4f726c0a3ae2a3f4836197e45dc21383cfb7bd90 Mon Sep 17 00:00:00 2001 From: ekmb Date: Tue, 1 Jun 2021 13:18:33 -0700 Subject: [PATCH 06/17] expose more params, new test cases Signed-off-by: ekmb --- Jenkinsfile | 6 +- .../data/letter_pronunciation.tsv | 2 + .../normalize_with_audio.py | 74 ++++++++++++++----- .../text_normalization/taggers/cardinal.py | 43 +++++------ .../text_normalization/taggers/measure.py | 2 +- .../text_normalization/taggers/serial.py | 28 ++++--- .../text_normalization/verbalizers/money.py | 25 +++++-- .../test_cases_measure.txt | 5 +- .../test_cases_normalize_with_audio.txt | 57 ++++++++++++-- .../test_cases_serial.txt | 2 +- tests/nemo_text_processing/test_boundary.py | 4 +- tests/nemo_text_processing/test_cardinal.py | 2 +- tests/nemo_text_processing/test_date.py | 4 +- tests/nemo_text_processing/test_decimal.py | 2 +- tests/nemo_text_processing/test_electronic.py | 4 +- tests/nemo_text_processing/test_measure.py | 2 +- tests/nemo_text_processing/test_money.py | 2 +- .../test_normalization_with_audio.py | 2 +- tests/nemo_text_processing/test_ordinal.py | 2 +- tests/nemo_text_processing/test_serial.py | 2 +- tests/nemo_text_processing/test_telephone.py | 2 +- tests/nemo_text_processing/test_time.py | 2 +- tests/nemo_text_processing/test_whitelist.py | 4 +- tests/nemo_text_processing/test_word.py | 4 +- 24 files changed, 188 insertions(+), 94 deletions(-) create 
mode 100644 nemo_text_processing/text_normalization/data/letter_pronunciation.tsv diff --git a/Jenkinsfile b/Jenkinsfile index 8cdf884d4b9b..7450d6ad63f4 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -170,7 +170,7 @@ pipeline { stage('L2: TN with Audio (audio and raw text)') { steps { sh 'cd nemo_text_processing/text_normalization && \ - python python normalize_with_audio.py --text "The total amounts to \$4.76." \ + python normalize_with_audio.py --text "The total amounts to \$4.76." \ --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n 1 > /home/TestData/nlp/text_norm/audio_based/output/out_raw.txt 2>&1 && \ cmp --silent /home/TestData/nlp/text_norm/audio_based/output/out_raw.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1' sh 'rm -rf /home/TestData/nlp/text_norm/audio_based/output/out_raw.txt' @@ -179,7 +179,7 @@ pipeline { stage('L2: TN with Audio (audio and text file)') { steps { sh 'cd nemo_text_processing/text_normalization && \ - python python normalize_with_audio.py --text /home/TestData/nlp/text_norm/audio_based/text.txt \ + python normalize_with_audio.py --text /home/TestData/nlp/text_norm/audio_based/text.txt \ --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n 1 > /home/TestData/nlp/text_norm/audio_based/output/out_file.txt 2>&1 && \ cmp --silent /home/TestData/nlp/text_norm/audio_based/output/out_file.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1' sh 'rm -rf /home/TestData/nlp/text_norm/audio_based/output/out_file.txt' @@ -188,7 +188,7 @@ pipeline { stage('L2: TN with Audio (manifest)') { steps { sh 'cd nemo_text_processing/text_normalization && \ - python python normalize_with_audio.py --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json \ + python normalize_with_audio.py --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 \ cmp --silent /home/TestData/nlp/text_norm/audio_based/manifest_normalized.json 
/home/TestData/nlp/text_norm/audio_based/manifest_result.json || exit 1' sh 'rm -rf /home/TestData/nlp/text_norm/audio_based/manifest_result.json' } diff --git a/nemo_text_processing/text_normalization/data/letter_pronunciation.tsv b/nemo_text_processing/text_normalization/data/letter_pronunciation.tsv new file mode 100644 index 000000000000..3ec49acb9fa1 --- /dev/null +++ b/nemo_text_processing/text_normalization/data/letter_pronunciation.tsv @@ -0,0 +1,2 @@ +k kay +j jay \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/normalize_with_audio.py b/nemo_text_processing/text_normalization/normalize_with_audio.py index 2bab55d04402..c64c9c933af0 100644 --- a/nemo_text_processing/text_normalization/normalize_with_audio.py +++ b/nemo_text_processing/text_normalization/normalize_with_audio.py @@ -78,13 +78,15 @@ def __init__(self, input_case: str): self.tagger = ClassifyFst(input_case=input_case, deterministic=False) self.verbalizer = VerbalizeFinalFst(deterministic=False) - def normalize(self, text: str, verbose: bool = False) -> str: + def normalize(self, text: str, n_tagged: int, punct_post_process: bool = True, verbose: bool = False) -> str: """ Main function. Normalizes tokens from written to spoken form e.g. 12 kg -> twelve kilograms Args: text: string that may include semiotic classes + n_tagged: number of tagged options to consider, -1 - to get all possible tagged options + punct_post_process: whether to normalize punctuation verbose: whether to print intermediate meta information Returns: @@ -97,28 +99,47 @@ def normalize(self, text: str, verbose: bool = False) -> str: return text text = pynini.escape(text) - # TODO add preprocess? 
- tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=100) - normalized_texts = [] + if n_tagged == -1: + tagged_texts = rewrite.rewrites(text, self.tagger.fst) + else: + tagged_texts = rewrite.top_rewrites(text, self.tagger.fst, nshortest=n_tagged) + normalized_texts = [] for tagged_text in tagged_texts: + self._verbalize(tagged_text, normalized_texts) + + if len(normalized_texts) == 0: + raise ValueError() + if punct_post_process: + normalized_texts = [post_process(t) for t in normalized_texts] + normalized_texts = set(normalized_texts) + return normalized_texts + + def _verbalize(self, tagged_text: str, normalized_texts: List[str]): + """ + Verbalizes tagged text + + Args: + tagged_text: text with tags + normalized_texts: list of possible normalization options + """ + + def get_verbalized_text(tagged_text): + tagged_text = pynini.escape(tagged_text) + return rewrite.rewrites(tagged_text, self.verbalizer.fst) + + try: + normalized_texts.extend(get_verbalized_text(tagged_text)) + except pynini.lib.rewrite.Error: self.parser(tagged_text) tokens = self.parser.parse() tags_reordered = self.generate_permutations(tokens) for tagged_text_reordered in tags_reordered: - tagged_text_reordered = pynini.escape(tagged_text_reordered) try: - verbalized = rewrite.rewrites(tagged_text_reordered, self.verbalizer.fst) - normalized_texts.extend(verbalized) + normalized_texts.extend(get_verbalized_text(tagged_text_reordered)) except pynini.lib.rewrite.Error: continue - if len(normalized_texts) == 0: - raise ValueError() - normalized_texts = [post_process(t) for t in normalized_texts] + normalized_texts - normalized_texts = set(normalized_texts) - return normalized_texts - def select_best_match( self, normalized_texts: List[str], transcript: str, verbose: bool = False, remove_punct: bool = False ): @@ -250,8 +271,17 @@ def parse_args(): parser.add_argument( '--model', type=str, default='QuartzNet15x5Base-En', help='Pre-trained model name or path to model checkpoint' 
) - parser.add_argument("--verbose", help="print info for debugging", action='store_true') - parser.add_argument("--remove_punct", help="remove punctuation before calculating cer", action='store_true') + parser.add_argument( + "--n_tagged", + type=int, + default=1000, + help="number of tagged options to consider, -1 - return all possible tagged options", + ) + parser.add_argument("--verbose", help="print info for debugging", action="store_true") + parser.add_argument("--remove_punct", help="remove punctuation before calculating cer", action="store_true") + parser.add_argument( + "--no_punct_post_process", help="set to True to disable punctuation post processing", action="store_true" + ) return parser.parse_args() @@ -274,7 +304,12 @@ def normalize_manifest(args): if asr_model is None: asr_model = get_asr_model(args.model) transcript = asr_model.transcribe([audio])[0] - normalized_texts = normalizer.normalize(line['text'], args.verbose) + normalized_texts = normalizer.normalize( + text=line['text'], + verbose=args.verbose, + n_tagged=args.n_tagged, + punct_post_process=not args.no_punct_post_process, + ) normalized_text, cer = normalizer.select_best_match( normalized_texts, transcript, args.verbose, args.remove_punct ) @@ -293,7 +328,12 @@ def normalize_manifest(args): if os.path.exists(args.text): with open(args.text, 'r') as f: args.text = f.read().strip() - normalized_texts = normalizer.normalize(args.text, args.verbose) + normalized_texts = normalizer.normalize( + text=args.text, + verbose=args.verbose, + n_tagged=args.n_tagged, + punct_post_process=not args.no_punct_post_process, + ) if args.audio_data: asr_model = get_asr_model(args.model) transcript = asr_model.transcribe([args.audio_data])[0] diff --git a/nemo_text_processing/text_normalization/taggers/cardinal.py b/nemo_text_processing/text_normalization/taggers/cardinal.py index 24f8bf1ac95a..e3ef80682ae4 100644 --- a/nemo_text_processing/text_normalization/taggers/cardinal.py +++ 
b/nemo_text_processing/text_normalization/taggers/cardinal.py @@ -55,37 +55,28 @@ def __init__(self, deterministic: bool = True): self.single_digits_graph = single_digits_graph + pynini.closure(pynutil.insert(" ") + single_digits_graph) if not deterministic: - single_digits_graph_with_commas = ( - pynini.closure(self.single_digits_graph, 1, 3) + single_digits_graph_with_commas = pynini.closure( + self.single_digits_graph + pynutil.insert(" "), 1, 3 + ) + pynini.closure( + pynutil.delete(",") + + single_digits_graph + pynutil.insert(" ") - + pynini.closure( - pynutil.delete(",") - + pynutil.insert(" ") - + single_digits_graph - + pynutil.insert(" ") - + single_digits_graph - + pynutil.insert(" ") - + single_digits_graph, - 1, - ) + + single_digits_graph + + pynutil.insert(" ") + + single_digits_graph, + 1, ) + + self.graph |= self.single_digits_graph | get_hundreds_graph() | single_digits_graph_with_commas range_graph = ( - self.graph - + ( - pynini.cross("-", " to ") - | pynini.cross("-", " ") - | pynini.cross("x", " by ") - | pynini.cross(" x ", " by ") - ) + pynini.closure(pynutil.insert("from "), 0, 1) + + self.graph + + (pynini.cross("-", " to ") | pynini.cross("-", " ")) + self.graph ) - self.graph = ( - self.graph - | self.single_digits_graph - | get_hundreds_graph() - | single_digits_graph_with_commas - | range_graph - ) + + range_graph |= self.graph + (pynini.cross("x", " by ") | pynini.cross(" x ", " by ")) + self.graph + self.graph |= range_graph optional_minus_graph = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1) final_graph = optional_minus_graph + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"") diff --git a/nemo_text_processing/text_normalization/taggers/measure.py b/nemo_text_processing/text_normalization/taggers/measure.py index d9fbf26552e4..dfa4ba2723a6 100644 --- a/nemo_text_processing/text_normalization/taggers/measure.py +++ b/nemo_text_processing/text_normalization/taggers/measure.py @@ 
-105,7 +105,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = subgraph_cardinal_dash = ( pynutil.insert("cardinal { integer: \"") - + cardinal.single_digits_graph + + cardinal_graph + pynini.cross('-', '') + pynutil.insert("\" } units: \"") + pynini.closure(NEMO_NOT_SPACE, 1) diff --git a/nemo_text_processing/text_normalization/taggers/serial.py b/nemo_text_processing/text_normalization/taggers/serial.py index 39e3a1117504..20f6ea4658e1 100644 --- a/nemo_text_processing/text_normalization/taggers/serial.py +++ b/nemo_text_processing/text_normalization/taggers/serial.py @@ -13,7 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo_text_processing.text_normalization.graph_utils import NEMO_ALPHA, NEMO_NOT_SPACE, GraphFst, delete_space +from nemo_text_processing.text_normalization.data_loader_utils import get_abs_path, load_labels +from nemo_text_processing.text_normalization.graph_utils import NEMO_ALPHA, GraphFst try: import pynini @@ -47,29 +48,26 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): num_graph = cardinal.single_digits_graph else: num_graph = cardinal.graph - serial_graph_cardinal_start = ( - pynini.closure((NEMO_ALPHA + pynutil.insert(" ")) | (NEMO_ALPHA + pynini.cross('-', ' ')), 1) + num_graph - ) - serial_end = pynini.closure(pynutil.insert(" ") + NEMO_ALPHA + pynini.closure(pynutil.insert(" ") + num_graph)) - serial_graph_cardinal_end = num_graph + ( - pynini.closure(pynutil.insert(" ") + NEMO_ALPHA) | (pynini.cross('-', ' ') + NEMO_ALPHA) - ) + alpha = NEMO_ALPHA + if not deterministic: + letter_pronunciation = pynini.string_map(load_labels(get_abs_path("data/letter_pronunciation.tsv"))) + alpha |= letter_pronunciation + letter_num = pynini.closure((alpha + pynutil.insert(" ")) | (alpha + pynini.cross('-', ' ')), 1) + num_graph + serial_end = pynini.closure(pynutil.insert(" ") + alpha + pynini.closure(pynutil.insert(" ") + 
num_graph)) + + num_letter = num_graph + (pynini.closure((pynutil.insert(" ") + alpha) | (pynini.cross('-', ' ') + alpha), 1)) serial_end2 = pynini.closure( - pynutil.insert(" ") - + num_graph - + pynini.closure((pynutil.insert(" ") | pynini.cross("-", " ")) + NEMO_ALPHA) + pynutil.insert(" ") + num_graph + pynini.closure((pynutil.insert(" ") | pynini.cross("-", " ")) + alpha) ) - serial_graph = (serial_graph_cardinal_start | serial_graph_cardinal_end) + pynini.closure( - serial_end | serial_end2 - ) + serial_graph = (letter_num | num_letter) + pynini.closure(serial_end | serial_end2) graph = pynutil.insert("cardinal { integer: \"") + serial_graph if not deterministic: - graph += pynini.closure(pynini.accep("s")) + graph += pynini.closure(pynini.accep("s") | pynini.cross("s", "es"), 0, 1) graph += pynutil.insert("\" } units: \"serial\"") diff --git a/nemo_text_processing/text_normalization/verbalizers/money.py b/nemo_text_processing/text_normalization/verbalizers/money.py index 80705faa3a35..97b95080bfb8 100644 --- a/nemo_text_processing/text_normalization/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/verbalizers/money.py @@ -50,7 +50,7 @@ def _get_minor_currencies(file): with open(get_abs_path(file), 'r') as f: for line in f: min_cur = line.strip() - minor_currencies.append(pynini.closure(pynutil.insert(min_cur), 0, 1)) + minor_currencies.append(pynutil.insert(min_cur)) return minor_currencies unit = ( @@ -64,9 +64,19 @@ def _get_minor_currencies(file): if not deterministic: minor_currencies_singular = _get_minor_currencies("data/currency/currency_minor_one.tsv") - minor_currencies_singular = pynini.closure( - pynini.cross("one", "one") + insert_space + pynini.union(*minor_currencies_singular), 0, 1 + minor_currencies_singular = pynini.union(*minor_currencies_singular) + minor_currencies_singular = ( + pynini.closure(NEMO_NOT_QUOTE) + + ( + pynini.accep("one") + | pynini.cross("zero one", "one") + | pynini.cross("oh one", "one") + | 
pynini.cross(" o one", " one") + ) + + insert_space + + minor_currencies_singular ) + minor_currencies_plural = _get_minor_currencies("data/currency/currency_minor.tsv") minor_currencies_plural = insert_space + pynini.union(*minor_currencies_plural) @@ -74,11 +84,11 @@ def _get_minor_currencies(file): pynutil.delete("fractional_part:") + delete_space + pynutil.delete("\"") - + pynini.closure(NEMO_NOT_QUOTE, 1) - + minor_currencies_singular + + ((pynini.closure(NEMO_NOT_QUOTE, 1) + minor_currencies_plural) | minor_currencies_singular) + pynutil.delete("\"") ) + # $2.00 {two zero zero dollars} -> two dollars fractional_with_zeros = ( pynutil.delete("fractional_part:") + delete_space @@ -92,7 +102,7 @@ def _get_minor_currencies(file): fractional = fractional_with_zeros | fractional_default - graph = ( + graph |= ( decimal.integer + delete_space + insert_space @@ -101,8 +111,7 @@ def _get_minor_currencies(file): + insert_space + pynini.closure(pynutil.insert("and "), 0, 1) + fractional - + minor_currencies_plural - ) | graph + ) delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/tests/nemo_text_processing/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_measure.txt index cf9e6087fde2..9ce6ac83ddbf 100644 --- a/tests/nemo_text_processing/data_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/data_text_normalization/test_cases_measure.txt @@ -4,4 +4,7 @@ 1 mbps~one megabit per second 3 mbps~three megabits per second 3 cc/s~three c c per second -100 million kg~one hundred million kilograms \ No newline at end of file +100 million kg~one hundred million kilograms +a 4-kilogram bag~a four kilogram bag +55-millimeters long~fifty five millimeters long +a 123.2-millimeters long~a one hundred twenty three point two millimeters long \ No newline at end of file diff --git 
a/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt index 7dadcf864fb8..493af042af9f 100644 --- a/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt +++ b/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt @@ -31,8 +31,14 @@ one twenty one dollars one pound one point o o pounds ~t-0t25d12-f +t zero t twenty five d one two f t zero t twenty five d twelve f t zero t two five d one two f +t zero t two five d twelve f +t oh t two five d one two f +t oh t twenty five d one two f +t oh t twenty five d twelve f +t oh t two five d twelve f ~133-A one hundred thirty three A ~B2A23C @@ -48,10 +54,49 @@ It seemed to her that the jacket Oswald wore was darker than Commission Exhibit It seemed to her that the jacket Oswald wore was darker than Commission Exhibit number one six two. ~"Father, let this cup pass." He prayed--was heard. What cup was it that passed away from him? Sure not the death-cup, now filled to the brim! There was no quailing in the awful word; He still was king of kings, of lords the lord:-- He feared lest, in the suffering waste and grim, His faith might grow too faint and sickly dim." "Father, let this cup pass." He prayed--was heard. What cup was it that passed away from him? Sure not the death-cup, now filled to the brim! There was no quailing in the awful word; He still was king of kings, of lords the lord:-- He feared lest, in the suffering waste and grim, His faith might grow too faint and sickly dim." -~The box was 5 x 7 m. -The box was five by seven meters. 
-~25-30 -twenty five to thirty -twenty five thirty +~1970-2010 +nineteen seventy to twenty ten +one thousand nine seventy to two thousand ten +one thousand nine hundred and seventy to two thousand ten +one thousand nine hundred and seventy to twenty ten +nineteen seventy twenty ten +one thousand nine seventy two thousand ten +one thousand nine hundred and seventy two thousand ten +one thousand nine hundred and seventy twenty ten +from nineteen seventy to twenty ten +from one thousand nine seventy to two thousand ten +from one thousand nine hundred and seventy to two thousand ten +from one thousand nine hundred and seventy to twenty ten +from nineteen seventy twenty ten +from one thousand nine seventy two thousand ten +from one thousand nine hundred and seventy two thousand ten +from one thousand nine hundred and seventy twenty ten ~W2s -W twos \ No newline at end of file +W twos +~W26s +W twenty sixes +~401-ks +four oh one kays +four oh one k s +four hundred one k s +~The box was 25 x 7 m. +The box was twenty five by seven meters. +The box was two five by seven meters. +The box was two five by seven m. +The box was twenty five by seven m. +~4567 +four thousand five hundred sixty seven +four five six seven +four thousand five sixty seven +forty five sixty seven +four thousand five hundred and sixty seven +~This example number 15,000 can be a very long one, and can fail to produce valid normalization for a such an easy number like 10,125 or dollar value $5349.01, and can fail to terminate, and can fail to terminate, and can fail to terminate, and can fail to terminate, and can fail to terminate, 452. 
+This example number fifteen thousand can be a very long one, and can fail to produce valid normalization for a such an easy number like ten thousand one hundred twenty five or dollar value five thousand three hundred forty nine dollars and one cent, and can fail to terminate, and can fail to terminate, and can fail to terminate, and can fail to terminate, and can fail to terminate, four five two. +~$1.01 +one dollar and one cent +one dollar one cent +one dollar and zero one cents +~$17.31 +seventeen dollars and thirty one cent +seventeen dollars and thirty one cents +seventeen point three one dollars \ No newline at end of file diff --git a/tests/nemo_text_processing/data_text_normalization/test_cases_serial.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_serial.txt index 6df8c17d5af2..2a103f962070 100644 --- a/tests/nemo_text_processing/data_text_normalization/test_cases_serial.txt +++ b/tests/nemo_text_processing/data_text_normalization/test_cases_serial.txt @@ -1,5 +1,5 @@ t-0t25d12-f~t oh t two five d one two f -133-A~one three three A +133-ABC~one hundred thirty three ABC B2A23C~B two A two three C 25d08A~two five d oh eight A C24~C two four diff --git a/tests/nemo_text_processing/test_boundary.py b/tests/nemo_text_processing/test_boundary.py index 547a7589bf6d..d210443c46d8 100644 --- a/tests/nemo_text_processing/test_boundary.py +++ b/tests/nemo_text_processing/test_boundary.py @@ -33,5 +33,7 @@ class TestBoundary: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected - pred_non_deterministic = self.normalizer_with_audio.normalize(test_input) + pred_non_deterministic = self.normalizer_with_audio.normalize( + test_input, n_tagged=100, punct_post_process=False + ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_cardinal.py b/tests/nemo_text_processing/test_cardinal.py index 3e4c3f6fdf33..26d59d08589f 100644 --- 
a/tests/nemo_text_processing/test_cardinal.py +++ b/tests/nemo_text_processing/test_cardinal.py @@ -45,5 +45,5 @@ def test_denorm(self, test_input, expected): def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected - pred_non_deterministic = self.normalizer_with_audio.normalize(test_input) + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, n_tagged=100) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_date.py b/tests/nemo_text_processing/test_date.py index 54b9296b0334..3003aa4919a4 100644 --- a/tests/nemo_text_processing/test_date.py +++ b/tests/nemo_text_processing/test_date.py @@ -45,7 +45,7 @@ def test_denorm(self, test_input, expected): def test_norm_uncased(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected - pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, verbose=False) + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, n_tagged=100) assert expected in pred_non_deterministic normalizer_uppercased = Normalizer(input_case='cased') if PYNINI_AVAILABLE else None @@ -60,5 +60,5 @@ def test_norm_uncased(self, test_input, expected): def test_norm_cased(self, test_input, expected): pred = self.normalizer_uppercased.normalize(test_input, verbose=False) assert pred == expected - pred_non_deterministic = self.normalizer_with_audio.normalize(test_input) + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, n_tagged=100) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_decimal.py b/tests/nemo_text_processing/test_decimal.py index a4d2f488e46b..82a2a561bdaf 100644 --- a/tests/nemo_text_processing/test_decimal.py +++ b/tests/nemo_text_processing/test_decimal.py @@ -45,5 +45,5 @@ def test_denorm(self, test_input, expected): def test_norm(self, test_input, expected): pred = 
self.normalizer.normalize(test_input, verbose=False) assert pred == expected - pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, verbose=False) + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, n_tagged=100) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_electronic.py b/tests/nemo_text_processing/test_electronic.py index a4238be40dec..8c9398618af6 100644 --- a/tests/nemo_text_processing/test_electronic.py +++ b/tests/nemo_text_processing/test_electronic.py @@ -45,5 +45,7 @@ def test_denorm(self, test_input, expected): def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected - pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, verbose=False) + pred_non_deterministic = self.normalizer_with_audio.normalize( + test_input, n_tagged=100, punct_post_process=False + ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_measure.py b/tests/nemo_text_processing/test_measure.py index ec61ec4112a6..695dc0687274 100644 --- a/tests/nemo_text_processing/test_measure.py +++ b/tests/nemo_text_processing/test_measure.py @@ -46,5 +46,5 @@ def test_denorm(self, test_input, expected): def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected - pred_non_deterministic = self.normalizer_with_audio.normalize(test_input) + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, n_tagged=100) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_money.py b/tests/nemo_text_processing/test_money.py index 04ce35ec0b8a..e4376a810706 100644 --- a/tests/nemo_text_processing/test_money.py +++ b/tests/nemo_text_processing/test_money.py @@ -46,5 +46,5 @@ def test_denorm(self, test_input, expected): def test_norm(self, test_input, expected): pred = 
self.normalizer.normalize(test_input, verbose=False) assert pred == expected - pred_non_deterministic = self.normalizer_with_audio.normalize(test_input) + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, n_tagged=100) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_normalization_with_audio.py b/tests/nemo_text_processing/test_normalization_with_audio.py index 290267471a21..1e9c6ecea83e 100644 --- a/tests/nemo_text_processing/test_normalization_with_audio.py +++ b/tests/nemo_text_processing/test_normalization_with_audio.py @@ -29,5 +29,5 @@ class TestNormalizeWithAudio: @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_norm(self, test_input, expected): - pred = self.normalizer.normalize(test_input) + pred = self.normalizer.normalize(test_input, n_tagged=300) assert len(set(pred).intersection(set(expected))) == len(expected) diff --git a/tests/nemo_text_processing/test_ordinal.py b/tests/nemo_text_processing/test_ordinal.py index 4cb47b2a40cf..64a29c767c46 100644 --- a/tests/nemo_text_processing/test_ordinal.py +++ b/tests/nemo_text_processing/test_ordinal.py @@ -46,5 +46,5 @@ def test_denorm(self, test_input, expected): def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected - pred_non_deterministic = self.normalizer_with_audio.normalize(test_input) + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, n_tagged=100) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_serial.py b/tests/nemo_text_processing/test_serial.py index 0b858fd663eb..81b19599de0a 100644 --- a/tests/nemo_text_processing/test_serial.py +++ b/tests/nemo_text_processing/test_serial.py @@ -33,5 +33,5 @@ class TestSerial: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected - pred_non_deterministic = 
self.normalizer_with_audio.normalize(test_input) + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, n_tagged=100) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_telephone.py b/tests/nemo_text_processing/test_telephone.py index 0c13f72605c5..4b869c4bee63 100644 --- a/tests/nemo_text_processing/test_telephone.py +++ b/tests/nemo_text_processing/test_telephone.py @@ -46,5 +46,5 @@ def test_denorm(self, test_input, expected): def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected - pred_non_deterministic = self.normalizer_with_audio.normalize(test_input) + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, n_tagged=100) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_time.py b/tests/nemo_text_processing/test_time.py index 6add727d936c..bb83136c6388 100644 --- a/tests/nemo_text_processing/test_time.py +++ b/tests/nemo_text_processing/test_time.py @@ -45,5 +45,5 @@ def test_denorm(self, test_input, expected): def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected - pred_non_deterministic = self.normalizer_with_audio.normalize(test_input) + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, n_tagged=100) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_whitelist.py b/tests/nemo_text_processing/test_whitelist.py index 0dfc67266f66..8b3daf2b67c7 100644 --- a/tests/nemo_text_processing/test_whitelist.py +++ b/tests/nemo_text_processing/test_whitelist.py @@ -46,7 +46,7 @@ def test_denorm(self, test_input, expected): def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected - pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, verbose=False) + 
pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, n_tagged=100) assert expected in pred_non_deterministic normalizer_uppercased = Normalizer(input_case='cased') if PYNINI_AVAILABLE else None @@ -61,5 +61,5 @@ def test_norm(self, test_input, expected): def test_norm_cased(self, test_input, expected): pred = self.normalizer_uppercased.normalize(test_input, verbose=False) assert pred == expected - pred_non_deterministic = self.normalizer_with_audio.normalize(test_input) + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, n_tagged=100) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_word.py b/tests/nemo_text_processing/test_word.py index eea3c7802177..50faa138fa3d 100644 --- a/tests/nemo_text_processing/test_word.py +++ b/tests/nemo_text_processing/test_word.py @@ -46,5 +46,7 @@ def test_denorm(self, test_input, expected): def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected - pred_non_deterministic = self.normalizer_with_audio.normalize(test_input) + pred_non_deterministic = self.normalizer_with_audio.normalize( + test_input, n_tagged=100, punct_post_process=False + ) assert expected in pred_non_deterministic From b6c563fd502086510dd114618dbbada32ade2a73 Mon Sep 17 00:00:00 2001 From: ekmb Date: Tue, 1 Jun 2021 14:38:14 -0700 Subject: [PATCH 07/17] fix jenkins, serial clean, exclude range from cardinal Signed-off-by: ekmb --- Jenkinsfile | 2 +- .../text_normalization/normalize_with_audio.py | 2 +- .../text_normalization/taggers/cardinal.py | 17 ++++++++++++----- .../text_normalization/taggers/measure.py | 3 +++ .../text_normalization/taggers/serial.py | 14 ++++++-------- .../test_cases_normalize_with_audio.txt | 10 ++++------ 6 files changed, 27 insertions(+), 21 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 7450d6ad63f4..59ee11e8573c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -188,7 +188,7 @@ 
pipeline { stage('L2: TN with Audio (manifest)') { steps { sh 'cd nemo_text_processing/text_normalization && \ - python normalize_with_audio.py --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 \ + python normalize_with_audio.py --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 && \ cmp --silent /home/TestData/nlp/text_norm/audio_based/manifest_normalized.json /home/TestData/nlp/text_norm/audio_based/manifest_result.json || exit 1' sh 'rm -rf /home/TestData/nlp/text_norm/audio_based/manifest_result.json' } diff --git a/nemo_text_processing/text_normalization/normalize_with_audio.py b/nemo_text_processing/text_normalization/normalize_with_audio.py index c64c9c933af0..84c648401dd1 100644 --- a/nemo_text_processing/text_normalization/normalize_with_audio.py +++ b/nemo_text_processing/text_normalization/normalize_with_audio.py @@ -274,7 +274,7 @@ def parse_args(): parser.add_argument( "--n_tagged", type=int, - default=1000, + default=300, help="number of tagged options to consider, -1 - return all possible tagged options", ) parser.add_argument("--verbose", help="print info for debugging", action="store_true") diff --git a/nemo_text_processing/text_normalization/taggers/cardinal.py b/nemo_text_processing/text_normalization/taggers/cardinal.py index e3ef80682ae4..00a542e4acc6 100644 --- a/nemo_text_processing/text_normalization/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/taggers/cardinal.py @@ -66,19 +66,26 @@ def __init__(self, deterministic: bool = True): + single_digits_graph, 1, ) - self.graph |= self.single_digits_graph | get_hundreds_graph() | single_digits_graph_with_commas - range_graph = ( + self.range_graph = ( pynini.closure(pynutil.insert("from "), 0, 1) + self.graph + (pynini.cross("-", " to ") | pynini.cross("-", " ")) + self.graph ) - range_graph |= self.graph + (pynini.cross("x", " by ") | pynini.cross(" x ", " by ")) + self.graph - self.graph |= range_graph + 
self.range_graph |= self.graph + (pynini.cross("x", " by ") | pynini.cross(" x ", " by ")) + self.graph + self.range_graph = self.range_graph.optimize() optional_minus_graph = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1) - final_graph = optional_minus_graph + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"") + if deterministic: + final_graph = optional_minus_graph + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"") + else: + final_graph = ( + optional_minus_graph + + pynutil.insert("integer: \"") + + (self.graph | self.range_graph) + + pynutil.insert("\"") + ) final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/taggers/measure.py b/nemo_text_processing/text_normalization/taggers/measure.py index dfa4ba2723a6..cd81fa8d2f77 100644 --- a/nemo_text_processing/text_normalization/taggers/measure.py +++ b/nemo_text_processing/text_normalization/taggers/measure.py @@ -51,6 +51,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = super().__init__(name="measure", kind="classify", deterministic=deterministic) cardinal_graph = cardinal.graph + if not deterministic: + cardinal_graph |= cardinal.range_graph + graph_unit = pynini.string_file(get_abs_path("data/measurements.tsv")) graph_unit_plural = convert_space(graph_unit @ SINGULAR_TO_PLURAL) graph_unit = convert_space(graph_unit) diff --git a/nemo_text_processing/text_normalization/taggers/serial.py b/nemo_text_processing/text_normalization/taggers/serial.py index 20f6ea4658e1..b3b25f61c27b 100644 --- a/nemo_text_processing/text_normalization/taggers/serial.py +++ b/nemo_text_processing/text_normalization/taggers/serial.py @@ -14,7 +14,7 @@ # limitations under the License. 
from nemo_text_processing.text_normalization.data_loader_utils import get_abs_path, load_labels -from nemo_text_processing.text_normalization.graph_utils import NEMO_ALPHA, GraphFst +from nemo_text_processing.text_normalization.graph_utils import NEMO_ALPHA, GraphFst, insert_space try: import pynini @@ -53,17 +53,15 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): if not deterministic: letter_pronunciation = pynini.string_map(load_labels(get_abs_path("data/letter_pronunciation.tsv"))) alpha |= letter_pronunciation - letter_num = pynini.closure((alpha + pynutil.insert(" ")) | (alpha + pynini.cross('-', ' ')), 1) + num_graph - serial_end = pynini.closure(pynutil.insert(" ") + alpha + pynini.closure(pynutil.insert(" ") + num_graph)) - num_letter = num_graph + (pynini.closure((pynutil.insert(" ") + alpha) | (pynini.cross('-', ' ') + alpha), 1)) + delimiter = insert_space | pynini.cross("-", " ") - serial_end2 = pynini.closure( - pynutil.insert(" ") + num_graph + pynini.closure((pynutil.insert(" ") | pynini.cross("-", " ")) + alpha) - ) + letter_num = pynini.closure(alpha + delimiter, 1) + num_graph + num_letter = num_graph + pynini.closure(delimiter + alpha, 1) - serial_graph = (letter_num | num_letter) + pynini.closure(serial_end | serial_end2) + next_alpha_or_num = pynini.closure(delimiter + (alpha | num_graph)) + serial_graph = (letter_num | num_letter) + next_alpha_or_num graph = pynutil.insert("cardinal { integer: \"") + serial_graph if not deterministic: diff --git a/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt index 493af042af9f..800a46038d0f 100644 --- a/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt +++ b/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt @@ -1,23 +1,19 @@ ~123 -123 one twenty three one two three one hundred twenty 
three one hundred and twenty three ~$123 -$123 one twenty three dollars one two three dollars one hundred twenty three dollars one hundred and twenty three dollars ~$123.2 -$123.2 one twenty three dollars and two cents one two three dollars two cents one hundred twenty three dollars and two cents one hundred and twenty three point two dollars ~1.24 -1.24 one point two four one two four one point twenty four @@ -71,10 +67,12 @@ from nineteen seventy twenty ten from one thousand nine seventy two thousand ten from one thousand nine hundred and seventy two thousand ten from one thousand nine hundred and seventy twenty ten -~W2s -W twos ~W26s W twenty sixes +W twenty six s +~5-3-Ws +five three W s +five three Ws ~401-ks four oh one kays four oh one k s From e5ee241ea3ec83827779beb913dfe6d43bda9d35 Mon Sep 17 00:00:00 2001 From: ekmb Date: Tue, 1 Jun 2021 15:24:12 -0700 Subject: [PATCH 08/17] jenkins Signed-off-by: ekmb --- Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 59ee11e8573c..90bfe395ec36 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -171,7 +171,7 @@ pipeline { steps { sh 'cd nemo_text_processing/text_normalization && \ python normalize_with_audio.py --text "The total amounts to \$4.76." 
\ - --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n 1 > /home/TestData/nlp/text_norm/audio_based/output/out_raw.txt 2>&1 && \ + --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /home/TestData/nlp/text_norm/audio_based/output/out_raw.txt 2>&1 && \ cmp --silent /home/TestData/nlp/text_norm/audio_based/output/out_raw.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1' sh 'rm -rf /home/TestData/nlp/text_norm/audio_based/output/out_raw.txt' } @@ -180,7 +180,7 @@ pipeline { steps { sh 'cd nemo_text_processing/text_normalization && \ python normalize_with_audio.py --text /home/TestData/nlp/text_norm/audio_based/text.txt \ - --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n 1 > /home/TestData/nlp/text_norm/audio_based/output/out_file.txt 2>&1 && \ + --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /home/TestData/nlp/text_norm/audio_based/output/out_file.txt 2>&1 && \ cmp --silent /home/TestData/nlp/text_norm/audio_based/output/out_file.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1' sh 'rm -rf /home/TestData/nlp/text_norm/audio_based/output/out_file.txt' } From 1d06576c37296babdd999aac7e3a3a1a708a429d Mon Sep 17 00:00:00 2001 From: ekmb Date: Tue, 1 Jun 2021 19:03:40 -0700 Subject: [PATCH 09/17] jenkins dollar sign format Signed-off-by: ekmb --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 9f26ca7bc15a..1fad8e5faf4a 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -170,7 +170,7 @@ pipeline { stage('L2: TN with Audio (audio and raw text)') { steps { sh 'cd nemo_text_processing/text_normalization && \ - python normalize_with_audio.py --text "The total amounts to \$4.76." \ + python normalize_with_audio.py --text "The total amounts to $4.76." 
\ --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /home/TestData/nlp/text_norm/audio_based/output/out_raw.txt 2>&1 && \ cmp --silent /home/TestData/nlp/text_norm/audio_based/output/out_raw.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1' sh 'rm -rf /home/TestData/nlp/text_norm/audio_based/output/out_raw.txt' From 9d6711b326d7bcff9b1e5825a7869ab97241b50c Mon Sep 17 00:00:00 2001 From: ekmb Date: Tue, 1 Jun 2021 19:59:27 -0700 Subject: [PATCH 10/17] jenkins Signed-off-by: ekmb --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 1fad8e5faf4a..8395591a3d13 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -190,7 +190,7 @@ pipeline { sh 'cd nemo_text_processing/text_normalization && \ python normalize_with_audio.py --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 && \ cmp --silent /home/TestData/nlp/text_norm/audio_based/manifest_normalized.json /home/TestData/nlp/text_norm/audio_based/manifest_result.json || exit 1' - sh 'rm -rf /home/TestData/nlp/text_norm/audio_based/manifest_result.json' + sh 'rm -rf /home/TestData/nlp/text_norm/audio_based/manifest_normalized.json' } } } From aa9cd111474073e3a486524dada4236c5ff1eb69 Mon Sep 17 00:00:00 2001 From: ekmb Date: Tue, 1 Jun 2021 21:36:43 -0700 Subject: [PATCH 11/17] jenkins dollar sign format Signed-off-by: ekmb --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 8395591a3d13..71059265b6df 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -170,7 +170,7 @@ pipeline { stage('L2: TN with Audio (audio and raw text)') { steps { sh 'cd nemo_text_processing/text_normalization && \ - python normalize_with_audio.py --text "The total amounts to $4.76." \ + python normalize_with_audio.py --text "The total amounts to \\$4.76." 
\ --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /home/TestData/nlp/text_norm/audio_based/output/out_raw.txt 2>&1 && \ cmp --silent /home/TestData/nlp/text_norm/audio_based/output/out_raw.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1' sh 'rm -rf /home/TestData/nlp/text_norm/audio_based/output/out_raw.txt' From c9c8aca50c7dda7c21c83d3bb7dc7e4608e228c6 Mon Sep 17 00:00:00 2001 From: ekmb Date: Wed, 2 Jun 2021 15:48:28 -0700 Subject: [PATCH 12/17] addressed review comments Signed-off-by: ekmb --- .../text_normalization/data/suppletive.tsv | 3 +-- .../text_normalization/taggers/serial.py | 2 +- .../text_normalization/verbalizers/measure.py | 4 ++-- .../text_normalization/verbalizers/serial.py | 17 +++++++++++------ .../text_normalization/verbalizers/verbalize.py | 2 +- 5 files changed, 16 insertions(+), 12 deletions(-) diff --git a/nemo_text_processing/text_normalization/data/suppletive.tsv b/nemo_text_processing/text_normalization/data/suppletive.tsv index be59872c28d9..dea620f37ab3 100644 --- a/nemo_text_processing/text_normalization/data/suppletive.tsv +++ b/nemo_text_processing/text_normalization/data/suppletive.tsv @@ -34,5 +34,4 @@ revolution per minute revolutions per minute mile per hour miles per hour megabit per second megabits per second square foot square feet -kilobit per second kilobits per second -lb \ No newline at end of file +kilobit per second kilobits per second \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/taggers/serial.py b/nemo_text_processing/text_normalization/taggers/serial.py index b3b25f61c27b..695df9dee3f5 100644 --- a/nemo_text_processing/text_normalization/taggers/serial.py +++ b/nemo_text_processing/text_normalization/taggers/serial.py @@ -67,7 +67,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): if not deterministic: graph += pynini.closure(pynini.accep("s") | pynini.cross("s", "es"), 0, 1) - graph += 
pynutil.insert("\" } units: \"serial\"") + graph += pynutil.insert("\" }") graph = self.add_tokens(graph) self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/verbalizers/measure.py b/nemo_text_processing/text_normalization/verbalizers/measure.py index 78cf0eb9ddde..033c45c9738f 100644 --- a/nemo_text_processing/text_normalization/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/verbalizers/measure.py @@ -52,7 +52,7 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst, deterministic: bool = + delete_space + pynutil.delete("}") ) - self.graph_cardinal = ( + graph_cardinal = ( pynutil.delete("cardinal {") + delete_space + optional_sign @@ -61,6 +61,6 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst, deterministic: bool = + delete_space + pynutil.delete("}") ) - graph = (self.graph_cardinal | graph_decimal) + delete_space + unit + graph = (graph_cardinal | graph_decimal) + delete_space + unit delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/verbalizers/serial.py b/nemo_text_processing/text_normalization/verbalizers/serial.py index cf328035bba9..3e9717499b0b 100644 --- a/nemo_text_processing/text_normalization/verbalizers/serial.py +++ b/nemo_text_processing/text_normalization/verbalizers/serial.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from nemo_text_processing.text_normalization.graph_utils import NEMO_NOT_SPACE, GraphFst, delete_space +from nemo_text_processing.text_normalization.graph_utils import GraphFst, delete_space try: import pynini @@ -30,17 +30,22 @@ class SerialFst(GraphFst): tokens { serial { value: "c thirty two five" } } -> c thirty two five Args: - measure: MeasureFst + cardinal: CardinalFst deterministic: if True will provide a single transduction option, for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self, measure: GraphFst, deterministic: bool = False): + def __init__(self, cardinal: GraphFst, deterministic: bool = False): super().__init__(name="serial", kind="verbalize", deterministic=deterministic) - serial = ( - pynini.cross("units: \"serial", "") + pynini.closure(NEMO_NOT_SPACE) + pynutil.delete("\"") + delete_space + graph_cardinal = ( + pynutil.delete("cardinal {") + + delete_space + + delete_space + + cardinal.numbers + + delete_space + + pynutil.delete("}") ) - graph = measure.graph_cardinal + pynini.closure(delete_space) + serial + graph = graph_cardinal + pynini.closure(delete_space) delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/verbalizers/verbalize.py index 9081bc47239d..08d9cb00c441 100644 --- a/nemo_text_processing/text_normalization/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/verbalizers/verbalize.py @@ -54,7 +54,7 @@ def __init__(self, deterministic: bool = True): date_graph = DateFst(ordinal=ordinal, deterministic=deterministic).fst money_graph = MoneyFst(decimal=decimal, deterministic=deterministic).fst whitelist_graph = WhiteListFst(deterministic=deterministic).fst - serial_graph = SerialFst(measure=measure, deterministic=deterministic).fst + serial_graph = SerialFst(cardinal=cardinal, deterministic=deterministic).fst graph = ( 
time_graph | date_graph From 760e3d5b0467e9d7045611b5878d86bb91cc7e27 Mon Sep 17 00:00:00 2001 From: ekmb Date: Mon, 7 Jun 2021 11:33:58 -0700 Subject: [PATCH 13/17] fix decimal in measure Signed-off-by: ekmb --- nemo_text_processing/text_normalization/taggers/measure.py | 2 +- .../text_normalization/verbalizers/measure.py | 2 +- .../data_text_normalization/test_cases_cardinal.txt | 2 -- .../data_text_normalization/test_cases_decimal.txt | 1 - .../data_text_normalization/test_cases_measure.txt | 5 ++++- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/nemo_text_processing/text_normalization/taggers/measure.py b/nemo_text_processing/text_normalization/taggers/measure.py index c4a0524a8758..e90f4ee3f130 100644 --- a/nemo_text_processing/text_normalization/taggers/measure.py +++ b/nemo_text_processing/text_normalization/taggers/measure.py @@ -139,7 +139,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = pynutil.insert("decimal { ") + decimal.final_graph_wo_negative + pynini.cross('-', '') - + pynutil.insert("\" } units: \"") + + pynutil.insert(" } units: \"") + pynini.closure(NEMO_ALPHA, 1) + pynutil.insert("\"") ) diff --git a/nemo_text_processing/text_normalization/verbalizers/measure.py b/nemo_text_processing/text_normalization/verbalizers/measure.py index 82b7e1ec27bb..4bb4405bfb78 100644 --- a/nemo_text_processing/text_normalization/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/verbalizers/measure.py @@ -72,6 +72,6 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst, deterministic: bool = # SH adds "preserve_order: true" by default preserve_order = pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space - graph |= unit + insert_space + graph_cardinal + delete_space + preserve_order + preserve_order + graph |= unit + insert_space + (graph_cardinal | graph_decimal) + delete_space + pynini.closure(preserve_order) delete_tokens = self.delete_tokens(graph) 
self.fst = delete_tokens.optimize() diff --git a/tests/nemo_text_processing/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_cardinal.txt index 2a79ee3b902f..5b9100f6e949 100644 --- a/tests/nemo_text_processing/data_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/data_text_normalization/test_cases_cardinal.txt @@ -7,5 +7,3 @@ 13,000~thirteen thousand 123,123,000~one hundred twenty three million one hundred twenty three thousand 123,000,012~one hundred twenty three million twelve -a 4-kilogram bag~a four kilogram bag -covid-19~covid nineteen diff --git a/tests/nemo_text_processing/data_text_normalization/test_cases_decimal.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_decimal.txt index d9634ffc815c..1b839abc2052 100644 --- a/tests/nemo_text_processing/data_text_normalization/test_cases_decimal.txt +++ b/tests/nemo_text_processing/data_text_normalization/test_cases_decimal.txt @@ -6,4 +6,3 @@ 0.1 billion~zero point one billion .1 trillion~point one trillion -0.1~minus zero point one -7.2-millimeter bullet~seven point two millimeter bullet diff --git a/tests/nemo_text_processing/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_measure.txt index eee68ec60747..6f395da8679f 100644 --- a/tests/nemo_text_processing/data_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/data_text_normalization/test_cases_measure.txt @@ -8,4 +8,7 @@ a 4-kilogram bag~a four kilogram bag 55-millimeters long~fifty five millimeters long a 123.2-millimeters long~a one hundred twenty three point two millimeters long -covid-19.5~covid nineteen point five \ No newline at end of file +covid-19.5~covid nineteen point five +covid-19~covid nineteen +a 4-kilogram bag~a four kilogram bag +7.2-millimeter bullet~seven point two millimeter bullet From a62fb9d403c9a16579f688009ba6006a538c35c2 Mon Sep 17 00:00:00 
2001 From: ekmb Date: Mon, 7 Jun 2021 14:11:21 -0700 Subject: [PATCH 14/17] move serial in cardinal Signed-off-by: ekmb --- .../text_normalization/taggers/cardinal.py | 44 ++++++++--- .../text_normalization/taggers/serial.py | 73 ------------------- .../taggers/tokenize_and_classify.py | 3 - .../text_normalization/verbalizers/measure.py | 9 +-- .../text_normalization/verbalizers/serial.py | 51 ------------- .../verbalizers/verbalize.py | 3 - .../test_cases_cardinal.txt | 8 ++ .../test_cases_serial.txt | 6 -- .../test_cases_word.txt | 1 + tests/nemo_text_processing/test_serial.py | 37 ---------- 10 files changed, 43 insertions(+), 192 deletions(-) delete mode 100644 nemo_text_processing/text_normalization/taggers/serial.py delete mode 100644 nemo_text_processing/text_normalization/verbalizers/serial.py delete mode 100644 tests/nemo_text_processing/data_text_normalization/test_cases_serial.txt delete mode 100644 tests/nemo_text_processing/test_serial.py diff --git a/nemo_text_processing/text_normalization/taggers/cardinal.py b/nemo_text_processing/text_normalization/taggers/cardinal.py index 00a542e4acc6..65e52e3f5b79 100644 --- a/nemo_text_processing/text_normalization/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/taggers/cardinal.py @@ -14,8 +14,8 @@ # limitations under the License. 
-from nemo_text_processing.text_normalization.data_loader_utils import get_abs_path -from nemo_text_processing.text_normalization.graph_utils import NEMO_DIGIT, GraphFst +from nemo_text_processing.text_normalization.data_loader_utils import get_abs_path, load_labels +from nemo_text_processing.text_normalization.graph_utils import NEMO_ALPHA, NEMO_DIGIT, GraphFst, insert_space from nemo_text_processing.text_normalization.taggers.date import get_hundreds_graph try: @@ -78,14 +78,36 @@ def __init__(self, deterministic: bool = True): self.range_graph = self.range_graph.optimize() optional_minus_graph = pynini.closure(pynutil.insert("negative: ") + pynini.cross("-", "\"true\" "), 0, 1) - if deterministic: - final_graph = optional_minus_graph + pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"") - else: - final_graph = ( - optional_minus_graph - + pynutil.insert("integer: \"") - + (self.graph | self.range_graph) - + pynutil.insert("\"") - ) + final_graph = self.graph | pynutil.add_weight(self.get_serial_graph(), 1.2) + + if not deterministic: + final_graph |= self.range_graph + + final_graph = optional_minus_graph + pynutil.insert("integer: \"") + final_graph + pynutil.insert("\"") final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() + + def get_serial_graph(self): + """ + Finite state transducer for classifying serial. 
+ The serial is a combination of digits, letters and dashes, e.g.: + c325-b -> tokens { serial { value: "c three two five b" } } + """ + alpha = NEMO_ALPHA + + if self.deterministic: + num_graph = self.single_digits_graph + else: + num_graph = self.graph + letter_pronunciation = pynini.string_map(load_labels(get_abs_path("data/letter_pronunciation.tsv"))) + alpha |= letter_pronunciation + + delimiter = insert_space | pynini.cross("-", " ") + letter_num = pynini.closure(alpha + delimiter, 1) + num_graph + num_letter = pynini.closure(num_graph + delimiter, 1) + alpha + next_alpha_or_num = pynini.closure(delimiter + (alpha | num_graph)) + serial_graph = (letter_num | num_letter) + next_alpha_or_num + + if not self.deterministic: + serial_graph += pynini.closure(pynini.accep("s") | pynini.cross("s", "es"), 0, 1) + return serial_graph diff --git a/nemo_text_processing/text_normalization/taggers/serial.py b/nemo_text_processing/text_normalization/taggers/serial.py deleted file mode 100644 index 695df9dee3f5..000000000000 --- a/nemo_text_processing/text_normalization/taggers/serial.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# Copyright 2015 and onwards Google, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from nemo_text_processing.text_normalization.data_loader_utils import get_abs_path, load_labels -from nemo_text_processing.text_normalization.graph_utils import NEMO_ALPHA, GraphFst, insert_space - -try: - import pynini - from pynini.lib import pynutil - - PYNINI_AVAILABLE = True -except (ModuleNotFoundError, ImportError): - PYNINI_AVAILABLE = False - - -class SerialFst(GraphFst): - """ - Finite state transducer for classifying serial. - The serial is a combination of digits, letters and dashes, e.g.: - c325 -> - tokens { serial { value: "c three hundred twenty five" } } - tokens { serial { value: "c three two five" } } - tokens { serial { value: "c thirty two five" } } - tokens { serial { value: "c three twenty five" } } - - Args: - cardinal: CardinalFst - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) - """ - - def __init__(self, cardinal: GraphFst, deterministic: bool = True): - super().__init__(name="serial", kind="classify", deterministic=deterministic) - - if deterministic: - num_graph = cardinal.single_digits_graph - else: - num_graph = cardinal.graph - - alpha = NEMO_ALPHA - if not deterministic: - letter_pronunciation = pynini.string_map(load_labels(get_abs_path("data/letter_pronunciation.tsv"))) - alpha |= letter_pronunciation - - delimiter = insert_space | pynini.cross("-", " ") - - letter_num = pynini.closure(alpha + delimiter, 1) + num_graph - num_letter = num_graph + pynini.closure(delimiter + alpha, 1) - - next_alpha_or_num = pynini.closure(delimiter + (alpha | num_graph)) - - serial_graph = (letter_num | num_letter) + next_alpha_or_num - graph = pynutil.insert("cardinal { integer: \"") + serial_graph - - if not deterministic: - graph += pynini.closure(pynini.accep("s") | pynini.cross("s", "es"), 0, 1) - - graph += pynutil.insert("\" }") - - graph = self.add_tokens(graph) - self.fst = graph.optimize() diff --git 
a/nemo_text_processing/text_normalization/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/taggers/tokenize_and_classify.py index 55b9c7ad60f4..9e15f12ad937 100644 --- a/nemo_text_processing/text_normalization/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/taggers/tokenize_and_classify.py @@ -22,7 +22,6 @@ from nemo_text_processing.text_normalization.taggers.money import MoneyFst from nemo_text_processing.text_normalization.taggers.ordinal import OrdinalFst from nemo_text_processing.text_normalization.taggers.punctuation import PunctuationFst -from nemo_text_processing.text_normalization.taggers.serial import SerialFst from nemo_text_processing.text_normalization.taggers.telephone import TelephoneFst from nemo_text_processing.text_normalization.taggers.time import TimeFst from nemo_text_processing.text_normalization.taggers.whitelist import WhiteListFst @@ -71,7 +70,6 @@ def __init__(self, input_case: str, deterministic: bool = True): money_graph = MoneyFst(cardinal=cardinal, decimal=decimal, deterministic=deterministic).fst whitelist_graph = WhiteListFst(input_case=input_case, deterministic=deterministic).fst punct_graph = PunctuationFst(deterministic=deterministic).fst - serial_graph = SerialFst(cardinal=cardinal, deterministic=deterministic).fst classify = ( pynutil.add_weight(whitelist_graph, 1.01) @@ -84,7 +82,6 @@ def __init__(self, input_case: str, deterministic: bool = True): | pynutil.add_weight(money_graph, 1.1) | pynutil.add_weight(telephone_graph, 1.1) | pynutil.add_weight(electonic_graph, 1.1) - | pynutil.add_weight(serial_graph, 1.2) | pynutil.add_weight(word_graph, 100) ) diff --git a/nemo_text_processing/text_normalization/verbalizers/measure.py b/nemo_text_processing/text_normalization/verbalizers/measure.py index 4bb4405bfb78..0718f42374b8 100644 --- a/nemo_text_processing/text_normalization/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/verbalizers/measure.py @@ 
-13,14 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo_text_processing.text_normalization.graph_utils import ( - NEMO_ALNUM, - NEMO_CHAR, - NEMO_PUNCT, - GraphFst, - delete_space, - insert_space, -) +from nemo_text_processing.text_normalization.graph_utils import NEMO_CHAR, GraphFst, delete_space, insert_space try: import pynini diff --git a/nemo_text_processing/text_normalization/verbalizers/serial.py b/nemo_text_processing/text_normalization/verbalizers/serial.py deleted file mode 100644 index 3e9717499b0b..000000000000 --- a/nemo_text_processing/text_normalization/verbalizers/serial.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# Copyright 2015 and onwards Google, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo_text_processing.text_normalization.graph_utils import GraphFst, delete_space - -try: - import pynini - from pynini.lib import pynutil - - PYNINI_AVAILABLE = True -except (ModuleNotFoundError, ImportError): - PYNINI_AVAILABLE = False - - -class SerialFst(GraphFst): - """ - Finite state transducer for verbalizing serial, e.g. 
- tokens { serial { value: "c thirty two five" } } -> c thirty two five - - Args: - cardinal: CardinalFst - deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) - """ - - def __init__(self, cardinal: GraphFst, deterministic: bool = False): - super().__init__(name="serial", kind="verbalize", deterministic=deterministic) - - graph_cardinal = ( - pynutil.delete("cardinal {") - + delete_space - + delete_space - + cardinal.numbers - + delete_space - + pynutil.delete("}") - ) - graph = graph_cardinal + pynini.closure(delete_space) - delete_tokens = self.delete_tokens(graph) - self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/verbalizers/verbalize.py index 08d9cb00c441..04e01e8a0d5a 100644 --- a/nemo_text_processing/text_normalization/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/verbalizers/verbalize.py @@ -21,7 +21,6 @@ from nemo_text_processing.text_normalization.verbalizers.measure import MeasureFst from nemo_text_processing.text_normalization.verbalizers.money import MoneyFst from nemo_text_processing.text_normalization.verbalizers.ordinal import OrdinalFst -from nemo_text_processing.text_normalization.verbalizers.serial import SerialFst from nemo_text_processing.text_normalization.verbalizers.telephone import TelephoneFst from nemo_text_processing.text_normalization.verbalizers.time import TimeFst from nemo_text_processing.text_normalization.verbalizers.whitelist import WhiteListFst @@ -54,7 +53,6 @@ def __init__(self, deterministic: bool = True): date_graph = DateFst(ordinal=ordinal, deterministic=deterministic).fst money_graph = MoneyFst(decimal=decimal, deterministic=deterministic).fst whitelist_graph = WhiteListFst(deterministic=deterministic).fst - serial_graph = SerialFst(cardinal=cardinal, deterministic=deterministic).fst graph = ( 
time_graph | date_graph @@ -65,7 +63,6 @@ def __init__(self, deterministic: bool = True): | cardinal_graph | telephone_graph | electronic_graph - | serial_graph | whitelist_graph ) diff --git a/tests/nemo_text_processing/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_cardinal.txt index 5b9100f6e949..9dcb6805db0f 100644 --- a/tests/nemo_text_processing/data_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/data_text_normalization/test_cases_cardinal.txt @@ -7,3 +7,11 @@ 13,000~thirteen thousand 123,123,000~one hundred twenty three million one hundred twenty three thousand 123,000,012~one hundred twenty three million twelve +t-0t25d12-f~t oh t two five d one two f +133-ABC~one hundred thirty three ABC +B2A23C~B two A two three C +25d08A~two five d oh eight A +C24~C two four +W2s~W two s +1-4-a-b-1-5~one four a b one five +b-c-1-5-b-s-b~b c one five b s b diff --git a/tests/nemo_text_processing/data_text_normalization/test_cases_serial.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_serial.txt deleted file mode 100644 index 2a103f962070..000000000000 --- a/tests/nemo_text_processing/data_text_normalization/test_cases_serial.txt +++ /dev/null @@ -1,6 +0,0 @@ -t-0t25d12-f~t oh t two five d one two f -133-ABC~one hundred thirty three ABC -B2A23C~B two A two three C -25d08A~two five d oh eight A -C24~C two four -W2s~W two s diff --git a/tests/nemo_text_processing/data_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_word.txt index 4016d48840e5..e10df681956e 100644 --- a/tests/nemo_text_processing/data_text_normalization/test_cases_word.txt +++ b/tests/nemo_text_processing/data_text_normalization/test_cases_word.txt @@ -20,3 +20,4 @@ aaghart's~aaghart's aalem~aalem a'ali~a'ali aaliyan's~aaliyan's +mother-in-law~mother-in-law diff --git a/tests/nemo_text_processing/test_serial.py 
b/tests/nemo_text_processing/test_serial.py deleted file mode 100644 index 81b19599de0a..000000000000 --- a/tests/nemo_text_processing/test_serial.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer -from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from parameterized import parameterized -from utils import PYNINI_AVAILABLE, parse_test_case_file - - -class TestSerial: - normalizer = Normalizer(input_case='cased') if PYNINI_AVAILABLE else None - normalizer_with_audio = NormalizerWithAudio(input_case='cased') if PYNINI_AVAILABLE else None - - @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_serial.txt')) - @pytest.mark.skipif( - not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh" - ) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_norm(self, test_input, expected): - pred = self.normalizer.normalize(test_input, verbose=False) - assert pred == expected - pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, n_tagged=100) - assert expected in pred_non_deterministic From 4c475b5efcc930a4790fe9acaee2ea893131c593 Mon Sep 17 00:00:00 2001 From: ekmb Date: Mon, 7 Jun 2021 15:10:50 -0700 Subject: [PATCH 15/17] clean up Signed-off-by: 
ekmb --- .../text_normalization/taggers/measure.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/nemo_text_processing/text_normalization/taggers/measure.py b/nemo_text_processing/text_normalization/taggers/measure.py index e90f4ee3f130..b32fd27dc554 100644 --- a/nemo_text_processing/text_normalization/taggers/measure.py +++ b/nemo_text_processing/text_normalization/taggers/measure.py @@ -115,16 +115,6 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = + pynutil.insert("\"") ) - # alpha_dash_cardinal = ( - # pynutil.insert("units: \"") - # + pynini.closure(NEMO_ALPHA, 1) - # + pynini.cross('-', '') - # + pynutil.insert("\" preserve_order: true") - # + pynutil.insert(" cardinal { integer: \"") - # + cardinal_graph - # + pynutil.insert("\" }") - # ) - alpha_dash_cardinal = ( pynutil.insert("units: \"") + pynini.closure(NEMO_ALPHA, 1) @@ -132,7 +122,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = + pynutil.insert("\"") + pynutil.insert(" cardinal { integer: \"") + cardinal_graph - + pynutil.insert("\" }" + pynutil.insert(" preserve_order: true")) + + pynutil.insert("\" } preserve_order: true") ) decimal_dash_alpha = ( @@ -151,7 +141,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = + pynutil.insert("\"") + pynutil.insert(" decimal { ") + decimal.final_graph_wo_negative - + pynutil.insert(" } ") + + pynutil.insert(" } preserve_order: true") ) final_graph = ( From 505409089fabb939fc61abe04dd3f16a0449a213 Mon Sep 17 00:00:00 2001 From: ekmb Date: Mon, 7 Jun 2021 15:15:51 -0700 Subject: [PATCH 16/17] update for SH zero -> oh Signed-off-by: ekmb --- nemo_text_processing/text_normalization/taggers/cardinal.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo_text_processing/text_normalization/taggers/cardinal.py b/nemo_text_processing/text_normalization/taggers/cardinal.py index 65e52e3f5b79..e578b0d31752 100644 
--- a/nemo_text_processing/text_normalization/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/taggers/cardinal.py @@ -51,7 +51,9 @@ def __init__(self, deterministic: bool = True): graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) - single_digits_graph = pynini.invert(graph_digit | graph_zero) | pynini.cross("0", "oh") + single_digits_graph = pynutil.add_weight(pynini.invert(graph_digit | graph_zero), 1.2) | pynutil.add_weight( + pynini.cross("0", "oh"), 1.1 + ) self.single_digits_graph = single_digits_graph + pynini.closure(pynutil.insert(" ") + single_digits_graph) if not deterministic: From 61a488a9012abf2f022a1df08d19429d4bbead2f Mon Sep 17 00:00:00 2001 From: ekmb Date: Tue, 8 Jun 2021 13:40:43 -0700 Subject: [PATCH 17/17] change n_tagged default Signed-off-by: ekmb --- .../text_normalization/normalize_with_audio.py | 2 +- tests/nemo_text_processing/test_normalization_with_audio.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo_text_processing/text_normalization/normalize_with_audio.py b/nemo_text_processing/text_normalization/normalize_with_audio.py index 84c648401dd1..c64c9c933af0 100644 --- a/nemo_text_processing/text_normalization/normalize_with_audio.py +++ b/nemo_text_processing/text_normalization/normalize_with_audio.py @@ -274,7 +274,7 @@ def parse_args(): parser.add_argument( "--n_tagged", type=int, - default=300, + default=1000, help="number of tagged options to consider, -1 - return all possible tagged options", ) parser.add_argument("--verbose", help="print info for debugging", action="store_true") diff --git a/tests/nemo_text_processing/test_normalization_with_audio.py b/tests/nemo_text_processing/test_normalization_with_audio.py index 1e9c6ecea83e..00345837b3c2 100644 --- a/tests/nemo_text_processing/test_normalization_with_audio.py +++ b/tests/nemo_text_processing/test_normalization_with_audio.py @@ -29,5
+29,5 @@ class TestNormalizeWithAudio: @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_norm(self, test_input, expected): - pred = self.normalizer.normalize(test_input, n_tagged=300) - assert len(set(pred).intersection(set(expected))) == len(expected) + pred = self.normalizer.normalize(test_input, n_tagged=700) + assert len(set(pred).intersection(set(expected))) == len(expected), f'pred: {pred}'