From 102e8997220bd077386b9fbe721a7e027cb60423 Mon Sep 17 00:00:00 2001 From: Evelina <10428420+ekmb@users.noreply.github.com> Date: Mon, 14 Jun 2021 18:26:22 -0700 Subject: [PATCH] Text Normalization Update (#2356) * upper cased date support Signed-off-by: ekmb * update whitelist, change roman weights Signed-off-by: ekmb * docstrings, space fix, init file Signed-off-by: ekmb * lgtm Signed-off-by: ekmb * fraction with measure class Signed-off-by: ekmb Signed-off-by: mchrzanowski --- .../text_normalization/data/months/abbr.tsv | 4 +- .../text_normalization/data/roman/__init__.py | 13 +++ .../data/roman/digit_teen.tsv | 49 ++++++++++ .../data/roman/hundreds.tsv | 9 ++ .../text_normalization/data/roman/ties.tsv | 5 + .../text_normalization/data/whitelist.tsv | 3 + .../data/whitelist_alternatives.tsv | 5 +- .../text_normalization/data_loader_utils.py | 21 ++++ .../text_normalization/normalize.py | 23 ++++- .../normalize_with_audio.py | 98 +++++++++---------- .../text_normalization/taggers/cardinal.py | 2 +- .../text_normalization/taggers/date.py | 18 +++- .../text_normalization/taggers/fraction.py | 24 ++++- .../text_normalization/taggers/measure.py | 8 +- .../text_normalization/taggers/ordinal.py | 5 +- .../text_normalization/taggers/roman.py | 61 ++++++++++++ .../taggers/tokenize_and_classify.py | 12 ++- .../text_normalization/verbalizers/date.py | 16 +-- .../verbalizers/fraction.py | 45 ++++++++- .../text_normalization/verbalizers/measure.py | 10 +- .../text_normalization/verbalizers/roman.py | 46 +++++++++ .../verbalizers/verbalize.py | 12 ++- .../test_cases_cardinal.txt | 1 + .../test_cases_date.txt | 2 + .../test_cases_fraction.txt | 11 +++ .../test_cases_measure.txt | 1 + .../test_cases_normalize_with_audio.txt | 11 ++- tests/nemo_text_processing/test_boundary.py | 2 +- tests/nemo_text_processing/test_fraction.py | 38 +++++++ 29 files changed, 474 insertions(+), 81 deletions(-) create mode 100644 nemo_text_processing/text_normalization/data/roman/__init__.py 
create mode 100644 nemo_text_processing/text_normalization/data/roman/digit_teen.tsv create mode 100644 nemo_text_processing/text_normalization/data/roman/hundreds.tsv create mode 100644 nemo_text_processing/text_normalization/data/roman/ties.tsv create mode 100644 nemo_text_processing/text_normalization/taggers/roman.py create mode 100644 nemo_text_processing/text_normalization/verbalizers/roman.py create mode 100644 tests/nemo_text_processing/data_text_normalization/test_cases_fraction.txt create mode 100644 tests/nemo_text_processing/test_fraction.py diff --git a/nemo_text_processing/text_normalization/data/months/abbr.tsv b/nemo_text_processing/text_normalization/data/months/abbr.tsv index fb1f5c70309a..5609e211d60a 100644 --- a/nemo_text_processing/text_normalization/data/months/abbr.tsv +++ b/nemo_text_processing/text_normalization/data/months/abbr.tsv @@ -2,11 +2,11 @@ jan january feb february mar march apr april -jun june +jun june jul july aug august sep september sept september oct october nov november -dec december +dec december \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/data/roman/__init__.py b/nemo_text_processing/text_normalization/data/roman/__init__.py new file mode 100644 index 000000000000..bc443be41c4c --- /dev/null +++ b/nemo_text_processing/text_normalization/data/roman/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nemo_text_processing/text_normalization/data/roman/digit_teen.tsv b/nemo_text_processing/text_normalization/data/roman/digit_teen.tsv new file mode 100644 index 000000000000..cd0991331f38 --- /dev/null +++ b/nemo_text_processing/text_normalization/data/roman/digit_teen.tsv @@ -0,0 +1,49 @@ +i 1 +ii 2 +iii 3 +iv 4 +v 5 +vi 6 +vii 7 +viii 8 +ix 9 +x 10 +xi 11 +xii 12 +xiii 13 +xiv 14 +xv 15 +xvi 16 +xvii 17 +xviii 18 +xix 19 +xx 20 +xxi 21 +xxii 22 +xxiii 23 +xxiv 24 +xxv 25 +xxvi 26 +xxvii 27 +xxviii 28 +xxix 29 +xxx 30 +xxxi 31 +xxxii 32 +xxxiii 33 +xxxiv 34 +xxxv 35 +xxxvi 36 +xxxvii 37 +xxxviii 38 +xxxix 39 +xl 40 +xli 41 +xlii 42 +xliii 43 +xliv 44 +xlv 45 +xlvi 46 +xlvii 47 +xlviii 48 +xlix 49 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/data/roman/hundreds.tsv b/nemo_text_processing/text_normalization/data/roman/hundreds.tsv new file mode 100644 index 000000000000..0aafad3049cd --- /dev/null +++ b/nemo_text_processing/text_normalization/data/roman/hundreds.tsv @@ -0,0 +1,9 @@ +c 100 +cc 200 +ccc 300 +cd 400 +d 500 +dc 600 +dcc 700 +dccc 800 +cm 900 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/data/roman/ties.tsv b/nemo_text_processing/text_normalization/data/roman/ties.tsv new file mode 100644 index 000000000000..5516676f4be3 --- /dev/null +++ b/nemo_text_processing/text_normalization/data/roman/ties.tsv @@ -0,0 +1,5 @@ +l 50 +lx 60 +lxx 70 +lxxx 80 +xc 90 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/data/whitelist.tsv b/nemo_text_processing/text_normalization/data/whitelist.tsv index 641a15560079..853993190452 100644 --- a/nemo_text_processing/text_normalization/data/whitelist.tsv +++ b/nemo_text_processing/text_normalization/data/whitelist.tsv @@ -1,6 +1,9 @@ Ph.D. p h d Hon. honorable & and +Mt. Mount +Maj. Major +Rev. Reverend # hash Gov. 
governor 7-eleven seven eleven diff --git a/nemo_text_processing/text_normalization/data/whitelist_alternatives.tsv b/nemo_text_processing/text_normalization/data/whitelist_alternatives.tsv index d3c878e34b43..8cea774c58f0 100644 --- a/nemo_text_processing/text_normalization/data/whitelist_alternatives.tsv +++ b/nemo_text_processing/text_normalization/data/whitelist_alternatives.tsv @@ -11,4 +11,7 @@ Mrs. Misses Ms. Miss Mr Mister Mrs Misses -Ms Miss \ No newline at end of file +Ms Miss +&Co. and Co. +§ section += equals \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/data_loader_utils.py b/nemo_text_processing/text_normalization/data_loader_utils.py index b361ab7d729e..1aaac7a76246 100644 --- a/nemo_text_processing/text_normalization/data_loader_utils.py +++ b/nemo_text_processing/text_normalization/data_loader_utils.py @@ -16,6 +16,7 @@ import csv import json import os +import re from collections import defaultdict, namedtuple from typing import Dict, List, Optional, Set, Tuple @@ -241,8 +242,28 @@ def post_process_punctuation(text: str) -> str: .replace('“', '"') .replace("‘", "'") .replace('`', "'") + .replace('- -', "--") ) for punct in "!,.:;?": text = text.replace(f' {punct}', punct) return text.strip() + + +def pre_process(text: str) -> str: + """ + Adds space around punctuation marks + + Args: + text: string that may include semiotic classes + + Returns: text with spaces around punctuation marks + """ + space_both = '*<=>^[]{}' + for punct in space_both: + text = text.replace(punct, ' ' + punct + ' ') + + text = text.replace('--', ' ' + '--' + ' ') + # remove extra space + text = re.sub(r' +', ' ', text) + return text diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py index 1ff591c3876b..353025a3a989 100644 --- a/nemo_text_processing/text_normalization/normalize.py +++ b/nemo_text_processing/text_normalization/normalize.py @@ -17,7 +17,7 @@ from 
collections import OrderedDict from typing import List -from nemo_text_processing.text_normalization.data_loader_utils import post_process_punctuation +from nemo_text_processing.text_normalization.data_loader_utils import post_process_punctuation, pre_process from nemo_text_processing.text_normalization.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.text_normalization.token_parser import PRESERVE_ORDER_KEY, TokenParser from nemo_text_processing.text_normalization.verbalizers.verbalize_final import VerbalizeFinalFst @@ -67,7 +67,9 @@ def normalize_list(self, texts: List[str], verbose=False) -> List[str]: res.append(text) return res - def normalize(self, text: str, verbose: bool, punct_post_process: bool = False) -> str: + def normalize( + self, text: str, verbose: bool, punct_pre_process: bool = False, punct_post_process: bool = False + ) -> str: """ Main function. Normalizes tokens from written to spoken form e.g. 12 kg -> twelve kilograms @@ -75,10 +77,13 @@ def normalize(self, text: str, verbose: bool, punct_post_process: bool = False) Args: text: string that may include semiotic classes verbose: whether to print intermediate meta information - punct_post_process: set to True to normalize punctuation + punct_pre_process: whether to perform punctuation pre-processing, for example, [25] -> [ 25 ] + punct_post_process: whether to normalize punctuation Returns: spoken form """ + if punct_pre_process: + text = pre_process(text) text = text.strip() if not text: if verbose: @@ -222,10 +227,20 @@ def parse_args(): parser.add_argument( "--punct_post_process", help="set to True to enable punctuation post processing", action="store_true" ) + parser.add_argument( + "--punct_pre_process", help="set to True to enable punctuation pre processing", action="store_true" + ) return parser.parse_args() if __name__ == "__main__": args = parse_args() normalizer = Normalizer(input_case=args.input_case) - print(normalizer.normalize(args.input_string, 
verbose=args.verbose, punct_post_process=args.punct_post_process)) + print( + normalizer.normalize( + args.input_string, + verbose=args.verbose, + punct_pre_process=args.punct_pre_process, + punct_post_process=args.punct_post_process, + ) + ) diff --git a/nemo_text_processing/text_normalization/normalize_with_audio.py b/nemo_text_processing/text_normalization/normalize_with_audio.py index 46d34afc1ee9..4df248f2d401 100644 --- a/nemo_text_processing/text_normalization/normalize_with_audio.py +++ b/nemo_text_processing/text_normalization/normalize_with_audio.py @@ -14,16 +14,15 @@ import json import os -import re import time from argparse import ArgumentParser from typing import List, Tuple -from nemo_text_processing.text_normalization.data_loader_utils import post_process_punctuation +from joblib import Parallel, delayed +from nemo_text_processing.text_normalization.data_loader_utils import post_process_punctuation, pre_process from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.text_normalization.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.text_normalization.verbalizers.verbalize_final import VerbalizeFinalFst -from tqdm import tqdm from nemo.collections.asr.metrics.wer import word_error_rate from nemo.collections.asr.models import ASRModel @@ -79,7 +78,14 @@ def __init__(self, input_case: str): self.tagger = ClassifyFst(input_case=input_case, deterministic=False) self.verbalizer = VerbalizeFinalFst(deterministic=False) - def normalize(self, text: str, n_tagged: int, punct_post_process: bool = True, verbose: bool = False) -> str: + def normalize( + self, + text: str, + n_tagged: int, + punct_pre_process: bool = True, + punct_post_process: bool = True, + verbose: bool = False, + ) -> str: """ Main function. Normalizes tokens from written to spoken form e.g. 
12 kg -> twelve kilograms @@ -87,12 +93,15 @@ def normalize(self, text: str, n_tagged: int, punct_post_process: bool = True, v Args: text: string that may include semiotic classes n_tagged: number of tagged options to consider, -1 - to get all possible tagged options + punct_pre_process: whether to perform punctuation pre-processing, for example, [25] -> [ 25 ] punct_post_process: whether to normalize punctuation verbose: whether to print intermediate meta information Returns: normalized text options (usually there are multiple ways of normalizing a given semiotic class) """ + if punct_pre_process: + text = pre_process(text) text = text.strip() if not text: if verbose: @@ -108,7 +117,6 @@ def normalize(self, text: str, n_tagged: int, punct_post_process: bool = True, v normalized_texts = [] for tagged_text in tagged_texts: self._verbalize(tagged_text, normalized_texts) - if len(normalized_texts) == 0: raise ValueError() if punct_post_process: @@ -183,36 +191,12 @@ def calculate_cer(normalized_texts: List[str], transcript: str, remove_punct=Fal text_clean = text.replace('-', ' ').lower() if remove_punct: for punct in "!?:;,.-()*+-/<=>@^_": - text_clean = text_clean.replace(punct, " ") - text_clean = re.sub(r' +', ' ', text_clean) + text_clean = text_clean.replace(punct, "") cer = round(word_error_rate([transcript], [text_clean], use_cer=True) * 100, 2) normalized_options.append((text, cer)) return normalized_options -def pre_process(text: str) -> str: - """ - Adds space around punctuation marks - - Args: - text: string that may include semiotic classes - - Returns: text with spaces around punctuation marks - """ - text = text.replace('--', '-') - space_right = '!?:;,.-()*+-/<=>@^_' - space_both = '-()*+-/<=>@^_' - - for punct in space_right: - text = text.replace(punct, punct + ' ') - for punct in space_both: - text = text.replace(punct, ' ' + punct + ' ') - - # remove extra space - text = re.sub(r' +', ' ', text) - return text - - def get_asr_model(asr_model: 
ASRModel): """ Returns ASR Model @@ -249,12 +233,36 @@ def parse_args(): ) parser.add_argument("--verbose", help="print info for debugging", action="store_true") parser.add_argument("--remove_punct", help="remove punctuation before calculating cer", action="store_true") + parser.add_argument( + "--no_punct_pre_process", help="set to True to disable punctuation pre processing", action="store_true" + ) parser.add_argument( "--no_punct_post_process", help="set to True to disable punctuation post processing", action="store_true" ) return parser.parse_args() +def _normalize_line(normalizer: NormalizerWithAudio, line: str, asr_model: ASRModel = None): + line = json.loads(line) + audio = line['audio_filepath'] + if 'transcript' in line: + transcript = line['transcript'] + else: + transcript = asr_model.transcribe([audio])[0] + + normalized_texts = normalizer.normalize( + text=line['text'], + verbose=args.verbose, + n_tagged=args.n_tagged, + punct_pre_process=not args.no_punct_pre_process, + punct_post_process=not args.no_punct_post_process, + ) + normalized_text, cer = normalizer.select_best_match(normalized_texts, transcript, args.verbose, args.remove_punct) + line['nemo_normalized'] = normalized_text + line['CER_nemo_normalized'] = cer + return line + + def normalize_manifest(args): """ Args: @@ -265,26 +273,15 @@ def normalize_manifest(args): asr_model = None with open(args.audio_data, 'r') as f: with open(manifest_out, 'w') as f_out: - for line in tqdm(f): - line = json.loads(line) - audio = line['audio_filepath'] - if 'transcript' in line: - transcript = line['transcript'] - else: - if asr_model is None: - asr_model = get_asr_model(args.model) - transcript = asr_model.transcribe([audio])[0] - normalized_texts = normalizer.normalize( - text=line['text'], - verbose=args.verbose, - n_tagged=args.n_tagged, - punct_post_process=not args.no_punct_post_process, - ) - normalized_text, cer = normalizer.select_best_match( - normalized_texts, transcript, args.verbose, 
args.remove_punct - ) - line['nemo_normalized'] = normalized_text - line['CER_nemo_normalized'] = cer + lines = f.readlines() + first_line = json.loads(lines[0]) + if 'transcript' not in first_line: + asr_model = get_asr_model(args.model) + normalized_lines = Parallel(n_jobs=-1)( + delayed(_normalize_line)(normalizer, line, asr_model) for line in lines + ) + + for line in normalized_lines: f_out.write(json.dumps(line, ensure_ascii=False) + '\n') print(f'Normalized version saved at {manifest_out}') @@ -302,6 +299,7 @@ def normalize_manifest(args): text=args.text, verbose=args.verbose, n_tagged=args.n_tagged, + punct_pre_process=not args.no_punct_pre_process, punct_post_process=not args.no_punct_post_process, ) if args.audio_data: diff --git a/nemo_text_processing/text_normalization/taggers/cardinal.py b/nemo_text_processing/text_normalization/taggers/cardinal.py index e578b0d31752..2d01a49357e3 100644 --- a/nemo_text_processing/text_normalization/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/taggers/cardinal.py @@ -104,7 +104,7 @@ def get_serial_graph(self): letter_pronunciation = pynini.string_map(load_labels(get_abs_path("data/letter_pronunciation.tsv"))) alpha |= letter_pronunciation - delimiter = insert_space | pynini.cross("-", " ") + delimiter = insert_space | pynini.cross("-", " ") | pynini.cross("/", " ") letter_num = pynini.closure(alpha + delimiter, 1) + num_graph num_letter = pynini.closure(num_graph + delimiter, 1) + alpha next_alpha_or_num = pynini.closure(delimiter + (alpha | num_graph)) diff --git a/nemo_text_processing/text_normalization/taggers/date.py b/nemo_text_processing/text_normalization/taggers/date.py index 96bba2c25b66..8c7c6a713afe 100644 --- a/nemo_text_processing/text_normalization/taggers/date.py +++ b/nemo_text_processing/text_normalization/taggers/date.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from nemo_text_processing.text_normalization.data_loader_utils import get_abs_path +from nemo_text_processing.text_normalization.data_loader_utils import get_abs_path, load_labels from nemo_text_processing.text_normalization.graph_utils import ( NEMO_CHAR, NEMO_DIGIT, @@ -96,7 +96,13 @@ def _get_year_graph(deterministic: bool = True): 2000 - 2009 will be verbalized as two thousand. """ graph = get_hundreds_graph(deterministic) - graph = (pynini.union("1", "2") + NEMO_DIGIT + NEMO_DIGIT + NEMO_DIGIT + pynini.closure("s", 0, 1)) @ graph + graph = ( + pynini.union("1", "2") + + NEMO_DIGIT + + NEMO_DIGIT + + NEMO_DIGIT + + pynini.closure(pynini.cross(" s", "s") | "s", 0, 1) + ) @ graph return graph @@ -128,8 +134,14 @@ def __init__(self, cardinal: GraphFst, deterministic: bool): ) + pynini.closure(pynutil.delete("."), 0, 1) month_graph |= month_abbr_graph - month_numbers_graph = pynini.string_file(get_abs_path("data/months/numbers.tsv")).optimize() + # to support all caps names + names_all_caps = [[x[0].upper()] for x in load_labels(get_abs_path("data/months/names.tsv"))] + abbr_all_caps = [(x.upper(), y) for x, y in load_labels(get_abs_path("data/months/abbr.tsv"))] + month_graph |= pynini.string_map(names_all_caps) | ( + pynini.string_map(abbr_all_caps) + pynini.closure(pynutil.delete("."), 0, 1) + ) + month_numbers_graph = pynini.string_file(get_abs_path("data/months/numbers.tsv")).optimize() cardinal_graph = cardinal.graph_hundred_component_at_least_one_none_zero_digit year_graph = _get_year_graph(deterministic) diff --git a/nemo_text_processing/text_normalization/taggers/fraction.py b/nemo_text_processing/text_normalization/taggers/fraction.py index 9524d74228b9..0feb5ce1dcb9 100644 --- a/nemo_text_processing/text_normalization/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/taggers/fraction.py @@ -15,16 +15,36 @@ from nemo_text_processing.text_normalization.graph_utils import GraphFst +try: + import pynini + from pynini.lib import pynutil + + 
PYNINI_AVAILABLE = True +except (ModuleNotFoundError, ImportError): + PYNINI_AVAILABLE = False + class FractionFst(GraphFst): """ Finite state transducer for classifying fraction + "23 4/5" -> + tokens { fraction { integer: "twenty three" numerator: "four" denominator: "five" } } Args: deterministic: if True will provide a single transduction option, for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self, deterministic: bool = True): + def __init__(self, cardinal, deterministic: bool = True): super().__init__(name="fraction", kind="classify", deterministic=deterministic) - # integer_part # numerator # denominator + cardinal_graph = cardinal.graph + + integer = pynutil.insert("integer: \"") + cardinal_graph + pynutil.insert("\"") + pynini.accep(" ") + numerator = ( + pynutil.insert("numerator: \"") + cardinal_graph + (pynini.cross("/", "\" ") | pynini.cross(" / ", "\" ")) + ) + denominator = pynutil.insert("denominator: \"") + cardinal_graph + pynutil.insert("\"") + + self.graph = pynini.closure(integer, 0, 1) + numerator + denominator + final_graph = self.add_tokens(self.graph) + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/taggers/measure.py b/nemo_text_processing/text_normalization/taggers/measure.py index b32fd27dc554..a0475c5b10cb 100644 --- a/nemo_text_processing/text_normalization/taggers/measure.py +++ b/nemo_text_processing/text_normalization/taggers/measure.py @@ -43,11 +43,12 @@ class MeasureFst(GraphFst): Args: cardinal: CardinalFst decimal: DecimalFst + fraction: FractionFst deterministic: if True will provide a single transduction option, for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = True): + def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst, deterministic: bool = True): super().__init__(name="measure", 
kind="classify", deterministic=deterministic) cardinal_graph = cardinal.graph @@ -144,6 +145,10 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = + pynutil.insert(" } preserve_order: true") ) + subgraph_fraction = ( + pynutil.insert("fraction { ") + fraction.graph + delete_space + pynutil.insert(" } ") + unit_plural + ) + final_graph = ( subgraph_decimal | subgraph_cardinal @@ -151,6 +156,7 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst, deterministic: bool = | alpha_dash_cardinal | decimal_dash_alpha | alpha_dash_decimal + | subgraph_fraction ) final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() diff --git a/nemo_text_processing/text_normalization/taggers/ordinal.py b/nemo_text_processing/text_normalization/taggers/ordinal.py index 2df87d3b5c90..49e452729dcc 100644 --- a/nemo_text_processing/text_normalization/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/taggers/ordinal.py @@ -39,9 +39,10 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): super().__init__(name="ordinal", kind="classify", deterministic=deterministic) cardinal_graph = cardinal.graph + endings = ["rd", "th", "st", "nd"] + endings += [x.upper() for x in endings] self.graph = ( - (pynini.closure(NEMO_DIGIT | pynini.accep(",")) + pynutil.delete(pynini.union("rd", "th", "st", "nd"))) - @ cardinal_graph + (pynini.closure(NEMO_DIGIT | pynini.accep(",")) + pynutil.delete(pynini.union(*endings))) @ cardinal_graph ).optimize() final_graph = pynutil.insert("integer: \"") + self.graph + pynutil.insert("\"") final_graph = self.add_tokens(final_graph) diff --git a/nemo_text_processing/text_normalization/taggers/roman.py b/nemo_text_processing/text_normalization/taggers/roman.py new file mode 100644 index 000000000000..136d0fb52063 --- /dev/null +++ b/nemo_text_processing/text_normalization/taggers/roman.py @@ -0,0 +1,61 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from nemo_text_processing.text_normalization.data_loader_utils import load_labels +from nemo_text_processing.text_normalization.graph_utils import GraphFst, get_abs_path, insert_space +from nemo_text_processing.text_normalization.taggers.cardinal import CardinalFst + +try: + import pynini + from pynini.lib import pynutil + + PYNINI_AVAILABLE = True +except (ModuleNotFoundError, ImportError): + PYNINI_AVAILABLE = False + + +class RomanFst(GraphFst): + """ + Finite state transducer for classifying roman numerals + e.g. 
xvii -> tokens { roman { integer: "seventeen" } } + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="roman", kind="classify", deterministic=deterministic) + + def _load_roman(file: str): + roman = load_labels(get_abs_path(file)) + roman_numerals = [(x, y) for x, y in roman] + [(x.upper(), y) for x, y in roman] + return pynini.string_map(roman_numerals) + + cardinal_graph = CardinalFst(deterministic=True).graph + digit_teen = _load_roman("data/roman/digit_teen.tsv") @ cardinal_graph + ties = _load_roman("data/roman/ties.tsv") @ cardinal_graph + hundreds = _load_roman("data/roman/hundreds.tsv") @ cardinal_graph + + graph = ( + (ties | digit_teen | hundreds) + | (ties + insert_space + digit_teen) + | (hundreds + pynini.closure(insert_space + ties, 0, 1) + pynini.closure(insert_space + digit_teen, 0, 1)) + ).optimize() + + graph = pynutil.insert("integer: \"") + graph + pynutil.insert("\"") + graph = self.add_tokens(graph) + self.fst = graph.optimize() diff --git a/nemo_text_processing/text_normalization/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/taggers/tokenize_and_classify.py index 9e15f12ad937..1bef666a11f5 100644 --- a/nemo_text_processing/text_normalization/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/taggers/tokenize_and_classify.py @@ -18,10 +18,12 @@ from nemo_text_processing.text_normalization.taggers.date import DateFst from nemo_text_processing.text_normalization.taggers.decimal import DecimalFst from nemo_text_processing.text_normalization.taggers.electronic import ElectronicFst +from nemo_text_processing.text_normalization.taggers.fraction import FractionFst from nemo_text_processing.text_normalization.taggers.measure import MeasureFst from 
nemo_text_processing.text_normalization.taggers.money import MoneyFst from nemo_text_processing.text_normalization.taggers.ordinal import OrdinalFst from nemo_text_processing.text_normalization.taggers.punctuation import PunctuationFst +from nemo_text_processing.text_normalization.taggers.roman import RomanFst from nemo_text_processing.text_normalization.taggers.telephone import TelephoneFst from nemo_text_processing.text_normalization.taggers.time import TimeFst from nemo_text_processing.text_normalization.taggers.whitelist import WhiteListFst @@ -59,8 +61,10 @@ def __init__(self, input_case: str, deterministic: bool = True): decimal = DecimalFst(cardinal=cardinal, deterministic=deterministic) decimal_graph = decimal.fst + fraction = FractionFst(deterministic=deterministic, cardinal=cardinal) + fraction_graph = fraction.fst - measure = MeasureFst(cardinal=cardinal, decimal=decimal, deterministic=deterministic) + measure = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction, deterministic=deterministic) measure_graph = measure.fst date_graph = DateFst(cardinal=cardinal, deterministic=deterministic).fst word_graph = WordFst(deterministic=deterministic).fst @@ -82,9 +86,15 @@ def __init__(self, input_case: str, deterministic: bool = True): | pynutil.add_weight(money_graph, 1.1) | pynutil.add_weight(telephone_graph, 1.1) | pynutil.add_weight(electonic_graph, 1.1) + | pynutil.add_weight(fraction_graph, 1.1) | pynutil.add_weight(word_graph, 100) ) + if not deterministic: + roman_graph = RomanFst(deterministic=deterministic).fst + # the weight matches the word_graph weight for "I" cases in long sentences with multiple semiotic tokens + classify |= pynutil.add_weight(roman_graph, 100) + punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }") token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") token_plus_punct = ( diff --git a/nemo_text_processing/text_normalization/verbalizers/date.py 
b/nemo_text_processing/text_normalization/verbalizers/date.py index 7d7f0479f929..08195927df5e 100644 --- a/nemo_text_processing/text_normalization/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/verbalizers/date.py @@ -45,19 +45,21 @@ def __init__(self, ordinal: GraphFst, deterministic: bool = True): super().__init__(name="date", kind="verbalize", deterministic=deterministic) month = pynini.closure(NEMO_NOT_QUOTE, 1) + day_cardinal = ( + pynutil.delete("day:") + + delete_space + + pynutil.delete("\"") + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete("\"") + ) + day = day_cardinal @ ordinal.suffix if not deterministic: month |= pynutil.insert(" of ") + month + day |= day_cardinal month = pynutil.delete("month:") + delete_space + pynutil.delete("\"") + month + pynutil.delete("\"") - day = ( - pynutil.delete("day:") - + delete_space - + pynutil.delete("\"") - + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete("\"") - ) @ ordinal.suffix year = ( pynutil.delete("year:") + delete_space diff --git a/nemo_text_processing/text_normalization/verbalizers/fraction.py b/nemo_text_processing/text_normalization/verbalizers/fraction.py index 74a1a844d12b..d49bd4bf5fc0 100644 --- a/nemo_text_processing/text_normalization/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/verbalizers/fraction.py @@ -13,12 +13,23 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo_text_processing.text_normalization.graph_utils import GraphFst +from nemo_text_processing.text_normalization.graph_utils import NEMO_NOT_QUOTE, GraphFst, insert_space +from nemo_text_processing.text_normalization.verbalizers.ordinal import OrdinalFst + +try: + import pynini + from pynini.lib import pynutil + + PYNINI_AVAILABLE = True +except (ModuleNotFoundError, ImportError): + PYNINI_AVAILABLE = False class FractionFst(GraphFst): """ Finite state transducer for verbalizing fraction + e.g. 
tokens { fraction { integer: "twenty three" numerator: "four" denominator: "five" } } -> + twenty three and four fifths Args: deterministic: if True will provide a single transduction option, @@ -27,3 +38,35 @@ class FractionFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="fraction", kind="verbalize", deterministic=deterministic) + suffix = OrdinalFst().suffix + + integer = pynutil.delete("integer: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\" ") + numerator = pynutil.delete("numerator: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\" ") + numerator_one = pynutil.delete("numerator: \"") + pynini.accep("one") + pynutil.delete("\" ") + denominator = pynutil.delete("denominator: \"") + ( + pynini.closure(NEMO_NOT_QUOTE) @ suffix | pynini.cross('four', 'quarter') + ) + conjunction = pynutil.insert("and ") + if not deterministic: + conjunction = pynini.closure(conjunction, 0, 1) + + integer = pynini.closure(integer + insert_space + conjunction, 0, 1) + + denominator_half = pynini.cross("numerator: \"one\" denominator: \"two\"", "a half") + denominator_one_two = pynini.cross("denominator: \"one\"", "over one") | pynini.cross( + "denominator: \"two\"", "halves" + ) + fraction_default = pynutil.add_weight( + numerator + insert_space + denominator + pynutil.insert("s") + pynutil.delete("\""), 0.001 + ) + fraction_with_one = pynutil.add_weight( + numerator_one + insert_space + denominator + pynutil.delete("\""), 0.0001 + ) + + graph = integer + denominator_half | (fraction_with_one | fraction_default) + graph |= pynini.cross("numerator: \"one\" denominator: \"two\"", "one half") + graph |= (numerator | numerator_one) + insert_space + denominator_one_two + + self.graph = graph + delete_tokens = self.delete_tokens(self.graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/verbalizers/measure.py b/nemo_text_processing/text_normalization/verbalizers/measure.py index 
0718f42374b8..3937efe26c48 100644 --- a/nemo_text_processing/text_normalization/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/verbalizers/measure.py @@ -34,11 +34,12 @@ class MeasureFst(GraphFst): Args: decimal: DecimalFst cardinal: CardinalFst + fraction: FractionFst deterministic: if True will provide a single transduction option, for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self, decimal: GraphFst, cardinal: GraphFst, deterministic: bool = True): + def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst, deterministic: bool = True): super().__init__(name="measure", kind="verbalize", deterministic=deterministic) optional_sign = cardinal.optional_sign unit = pynutil.delete("units: \"") + pynini.closure(NEMO_CHAR - " ", 1) + pynutil.delete("\"") + delete_space @@ -61,7 +62,12 @@ def __init__(self, decimal: GraphFst, cardinal: GraphFst, deterministic: bool = + delete_space + pynutil.delete("}") ) - graph = (graph_cardinal | graph_decimal) + delete_space + insert_space + unit + + graph_fraction = ( + pynutil.delete("fraction {") + delete_space + fraction.graph + delete_space + pynutil.delete("}") + ) + + graph = (graph_cardinal | graph_decimal | graph_fraction) + delete_space + insert_space + unit # SH adds "preserve_order: true" by default preserve_order = pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space diff --git a/nemo_text_processing/text_normalization/verbalizers/roman.py b/nemo_text_processing/text_normalization/verbalizers/roman.py new file mode 100644 index 000000000000..bb42f3c52294 --- /dev/null +++ b/nemo_text_processing/text_normalization/verbalizers/roman.py @@ -0,0 +1,46 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.text_normalization.graph_utils import NEMO_NOT_QUOTE, GraphFst +from nemo_text_processing.text_normalization.verbalizers.ordinal import OrdinalFst + +try: + import pynini + from pynini.lib import pynutil + + PYNINI_AVAILABLE = True +except (ModuleNotFoundError, ImportError): + PYNINI_AVAILABLE = False + + +class RomanFst(GraphFst): + """ + Finite state transducer for verbalizing roman numerals + e.g. tokens { roman { integer: "one" } } -> one + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="roman", kind="verbalize", deterministic=deterministic) + suffix = OrdinalFst().suffix + + integer = pynini.closure(NEMO_NOT_QUOTE) + integer |= pynini.closure(pynutil.insert("the "), 0, 1) + integer @ suffix + graph = pynutil.delete("integer: \"") + integer + pynutil.delete("\"") + delete_tokens = self.delete_tokens(graph) + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/verbalizers/verbalize.py index 04e01e8a0d5a..b14468a8f088 100644 --- a/nemo_text_processing/text_normalization/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/verbalizers/verbalize.py @@ -18,9 +18,11 @@ from nemo_text_processing.text_normalization.verbalizers.date import DateFst from nemo_text_processing.text_normalization.verbalizers.decimal import 
DecimalFst from nemo_text_processing.text_normalization.verbalizers.electronic import ElectronicFst +from nemo_text_processing.text_normalization.verbalizers.fraction import FractionFst from nemo_text_processing.text_normalization.verbalizers.measure import MeasureFst from nemo_text_processing.text_normalization.verbalizers.money import MoneyFst from nemo_text_processing.text_normalization.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.text_normalization.verbalizers.roman import RomanFst from nemo_text_processing.text_normalization.verbalizers.telephone import TelephoneFst from nemo_text_processing.text_normalization.verbalizers.time import TimeFst from nemo_text_processing.text_normalization.verbalizers.whitelist import WhiteListFst @@ -45,14 +47,17 @@ def __init__(self, deterministic: bool = True): decimal_graph = decimal.fst ordinal = OrdinalFst(deterministic=deterministic) ordinal_graph = ordinal.fst + fraction = FractionFst(deterministic=deterministic) + fraction_graph = fraction.fst telephone_graph = TelephoneFst(deterministic=deterministic).fst electronic_graph = ElectronicFst(deterministic=deterministic).fst - measure = MeasureFst(decimal=decimal, cardinal=cardinal, deterministic=deterministic) + measure = MeasureFst(decimal=decimal, cardinal=cardinal, fraction=fraction, deterministic=deterministic) measure_graph = measure.fst time_graph = TimeFst(deterministic=deterministic).fst date_graph = DateFst(ordinal=ordinal, deterministic=deterministic).fst money_graph = MoneyFst(decimal=decimal, deterministic=deterministic).fst whitelist_graph = WhiteListFst(deterministic=deterministic).fst + graph = ( time_graph | date_graph @@ -63,7 +68,12 @@ def __init__(self, deterministic: bool = True): | cardinal_graph | telephone_graph | electronic_graph + | fraction_graph | whitelist_graph ) + if not deterministic: + roman_graph = RomanFst(deterministic=deterministic).fst + graph |= roman_graph + self.fst = graph diff --git 
a/tests/nemo_text_processing/data_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_cardinal.txt index 9dcb6805db0f..3d505b2ce447 100644 --- a/tests/nemo_text_processing/data_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/data_text_normalization/test_cases_cardinal.txt @@ -15,3 +15,4 @@ C24~C two four W2s~W two s 1-4-a-b-1-5~one four a b one five b-c-1-5-b-s-b~b c one five b s b +1/f-4s~one f four s diff --git a/tests/nemo_text_processing/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_date.txt index 35b33cd1a04d..b5f95f466959 100644 --- a/tests/nemo_text_processing/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/data_text_normalization/test_cases_date.txt @@ -1,6 +1,7 @@ july 25 2012~july twenty fifth twenty twelve jul 25 2012~july twenty fifth twenty twelve 1980s~nineteen eighties +1980 s~nineteen eighties 25 july 2012~the twenty fifth of july twenty twelve 25 jul 2012~the twenty fifth of july twenty twelve 22 july 2012~the twenty second of july twenty twelve @@ -28,3 +29,4 @@ august 23, 2002~august twenty third two thousand two 1910s~nineteen tens 25 sept.~the twenty fifth of september 1000~one thousand +SEPT. 
15TH~september fifteenth diff --git a/tests/nemo_text_processing/data_text_normalization/test_cases_fraction.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_fraction.txt new file mode 100644 index 000000000000..15d32caa9b55 --- /dev/null +++ b/tests/nemo_text_processing/data_text_normalization/test_cases_fraction.txt @@ -0,0 +1,11 @@ +1/2007~one two thousand seventh +12639/12640~twelve thousand six hundred thirty nine twelve thousand six hundred fortieths +2/4~two quarters +1/4~one quarter +31/32~thirty one thirty seconds +22/3~twenty two thirds +1/3~one third +142/1~one hundred forty two over one +1/2~one half +2 1/2~two and a half +1795 / 1805~one thousand seven hundred ninety five one thousand eight hundred fifths \ No newline at end of file diff --git a/tests/nemo_text_processing/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_measure.txt index 6f395da8679f..3279e6863cc0 100644 --- a/tests/nemo_text_processing/data_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/data_text_normalization/test_cases_measure.txt @@ -12,3 +12,4 @@ covid-19.5~covid nineteen point five covid-19~covid nineteen a 4-kilogram bag~a four kilogram bag 7.2-millimeter bullet~seven point two millimeter bullet +4 1/2 lbs~four and a half pounds diff --git a/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt index 800a46038d0f..436bff19205f 100644 --- a/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt +++ b/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt @@ -49,7 +49,7 @@ It seemed to her that the jacket Oswald wore was darker than Commission Exhibit It seemed to her that the jacket Oswald wore was darker than Commission Exhibit number one hundred and sixty two. 
It seemed to her that the jacket Oswald wore was darker than Commission Exhibit number one six two. ~"Father, let this cup pass." He prayed--was heard. What cup was it that passed away from him? Sure not the death-cup, now filled to the brim! There was no quailing in the awful word; He still was king of kings, of lords the lord:-- He feared lest, in the suffering waste and grim, His faith might grow too faint and sickly dim." -"Father, let this cup pass." He prayed--was heard. What cup was it that passed away from him? Sure not the death-cup, now filled to the brim! There was no quailing in the awful word; He still was king of kings, of lords the lord:-- He feared lest, in the suffering waste and grim, His faith might grow too faint and sickly dim." +"Father, let this cup pass." He prayed -- was heard. What cup was it that passed away from him? Sure not the death-cup, now filled to the brim! There was no quailing in the awful word; He still was king of kings, of lords the lord: -- He feared lest, in the suffering waste and grim, His faith might grow too faint and sickly dim." ~1970-2010 nineteen seventy to twenty ten one thousand nine seventy to two thousand ten @@ -97,4 +97,11 @@ one dollar and zero one cents ~$17.31 seventeen dollars and thirty one cent seventeen dollars and thirty one cents -seventeen point three one dollars \ No newline at end of file +seventeen point three one dollars +~25.] +two five.] +twenty five.] 
+~Francis I--test +Francis the first -- test +Francis one -- test +Francis first -- test \ No newline at end of file diff --git a/tests/nemo_text_processing/test_boundary.py b/tests/nemo_text_processing/test_boundary.py index 75760fd9ebcf..907423005091 100644 --- a/tests/nemo_text_processing/test_boundary.py +++ b/tests/nemo_text_processing/test_boundary.py @@ -34,6 +34,6 @@ def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred == expected pred_non_deterministic = self.normalizer_with_audio.normalize( - test_input, n_tagged=100, punct_post_process=False + test_input, n_tagged=100, punct_pre_process=False, punct_post_process=False ) assert expected in pred_non_deterministic diff --git a/tests/nemo_text_processing/test_fraction.py b/tests/nemo_text_processing/test_fraction.py new file mode 100644 index 000000000000..24006c43cce5 --- /dev/null +++ b/tests/nemo_text_processing/test_fraction.py @@ -0,0 +1,38 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio +from parameterized import parameterized +from utils import PYNINI_AVAILABLE, parse_test_case_file + + +class TestFraction: + normalizer = Normalizer(input_case="cased") if PYNINI_AVAILABLE else None + normalizer_with_audio = NormalizerWithAudio(input_case='cased') if PYNINI_AVAILABLE else None + + @parameterized.expand(parse_test_case_file('data_text_normalization/test_cases_fraction.txt')) + @pytest.mark.skipif( + not PYNINI_AVAILABLE, reason="`pynini` not installed, please install via nemo_text_processing/setup.sh" + ) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False) + assert pred == expected + pred_non_deterministic = self.normalizer_with_audio.normalize(test_input, n_tagged=100) + assert expected in pred_non_deterministic