Text Normalization Update #2356

Merged: 8 commits, Jun 15, 2021
4 changes: 2 additions & 2 deletions nemo_text_processing/text_normalization/data/months/abbr.tsv
@@ -2,11 +2,11 @@ jan january
feb february
mar march
apr april
jun june
jul july
aug august
sep september
sept september
oct october
nov november
dec december
13 changes: 13 additions & 0 deletions nemo_text_processing/text_normalization/data/roman/__init__.py
@@ -0,0 +1,13 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
49 changes: 49 additions & 0 deletions nemo_text_processing/text_normalization/data/roman/digit_teen.tsv
@@ -0,0 +1,49 @@
i 1
ii 2
iii 3
iv 4
v 5
vi 6
vii 7
viii 8
ix 9
x 10
xi 11
xii 12
xiii 13
xiv 14
xv 15
xvi 16
xvii 17
xviii 18
xix 19
xx 20
xxi 21
xxii 22
xxiii 23
xxiv 24
xxv 25
xxvi 26
xxvii 27
xxviii 28
xxix 29
xxx 30
xxxi 31
xxxii 32
xxxiii 33
xxxiv 34
xxxv 35
xxxvi 36
xxxvii 37
xxxviii 38
xxxix 39
xl 40
xli 41
xlii 42
xliii 43
xliv 44
xlv 45
xlvi 46
xlvii 47
xlviii 48
xlix 49
@@ -0,0 +1,9 @@
c 100
cc 200
ccc 300
cd 400
d 500
dc 600
dcc 700
dccc 800
cm 900
5 changes: 5 additions & 0 deletions nemo_text_processing/text_normalization/data/roman/ties.tsv
@@ -0,0 +1,5 @@
l 50
lx 60
lxx 70
lxxx 80
xc 90
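
For context, a minimal sketch (not part of this PR) of how TSV mappings like the roman-numeral files above are typically consumed; pynini.string_file compiles each tab-separated pair into an input/output arc, and the composition pattern here is an assumption based on how other NeMo grammars load labels:

import pynini

# Compile the roman-numeral mapping into an FST: "xiv" -> "14", etc.
digit_teen = pynini.string_file('nemo_text_processing/text_normalization/data/roman/digit_teen.tsv')
print(pynini.shortestpath(pynini.accep('xiv') @ digit_teen).string())  # 14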
3 changes: 3 additions & 0 deletions nemo_text_processing/text_normalization/data/whitelist.tsv
@@ -1,6 +1,9 @@
Ph.D. p h d
Hon. honorable
& and
Mt. Mount
Maj. Major
Rev. Reverend
# hash
Gov. governor
7-eleven seven eleven
@@ -11,4 +11,7 @@ Mrs. Misses
Ms. Miss
Mr Mister
Mrs Misses
Ms Miss
&Co. and Co.
§ section
= equals
21 changes: 21 additions & 0 deletions nemo_text_processing/text_normalization/data_loader_utils.py
@@ -16,6 +16,7 @@
import csv
import json
import os
import re
from collections import defaultdict, namedtuple
from typing import Dict, List, Optional, Set, Tuple

@@ -241,8 +242,28 @@ def post_process_punctuation(text: str) -> str:
        .replace('“', '"')
        .replace("‘", "'")
        .replace('`', "'")
        .replace('- -', "--")
    )

    for punct in "!,.:;?":
        text = text.replace(f' {punct}', punct)
    return text.strip()
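
A quick sketch of the post-processing behavior above, relying only on the code shown in this hunk; the input string is illustrative:

from nemo_text_processing.text_normalization.data_loader_utils import post_process_punctuation

# Detached punctuation is re-attached to the preceding token
print(post_process_punctuation('twenty five ! and more'))  # twenty five! and more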


def pre_process(text: str) -> str:
    """
    Adds space around punctuation marks

    Args:
        text: string that may include semiotic classes

    Returns: text with spaces around punctuation marks
    """
    space_both = '*<=>^[]{}'
    for punct in space_both:
        text = text.replace(punct, ' ' + punct + ' ')

    text = text.replace('--', ' ' + '--' + ' ')
    # remove extra space
    text = re.sub(r' +', ' ', text)
    return text
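
A quick sketch of what the new pre_process produces, assuming the implementation above; the inputs are illustrative:

from nemo_text_processing.text_normalization.data_loader_utils import pre_process

# Brackets, comparison and math symbols are padded with spaces so downstream
# taggers see them as separate tokens; '--' is padded as a unit
print(repr(pre_process('[25]')))       # ' [ 25 ] '
print(repr(pre_process('5<10--fast'))) # '5 < 10 -- fast'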
23 changes: 19 additions & 4 deletions nemo_text_processing/text_normalization/normalize.py
@@ -17,7 +17,7 @@
from collections import OrderedDict
from typing import List

from nemo_text_processing.text_normalization.data_loader_utils import post_process_punctuation
from nemo_text_processing.text_normalization.data_loader_utils import post_process_punctuation, pre_process
from nemo_text_processing.text_normalization.taggers.tokenize_and_classify import ClassifyFst
from nemo_text_processing.text_normalization.token_parser import PRESERVE_ORDER_KEY, TokenParser
from nemo_text_processing.text_normalization.verbalizers.verbalize_final import VerbalizeFinalFst
@@ -67,18 +67,23 @@ def normalize_list(self, texts: List[str], verbose=False) -> List[str]:
            res.append(text)
        return res

    def normalize(self, text: str, verbose: bool, punct_post_process: bool = False) -> str:
    def normalize(
        self, text: str, verbose: bool, punct_pre_process: bool = False, punct_post_process: bool = False
    ) -> str:
        """
        Main function. Normalizes tokens from written to spoken form
        e.g. 12 kg -> twelve kilograms

        Args:
            text: string that may include semiotic classes
            verbose: whether to print intermediate meta information
            punct_post_process: set to True to normalize punctuation
            punct_pre_process: whether to perform punctuation pre-processing, for example, [25] -> [ 25 ]
            punct_post_process: whether to normalize punctuation

        Returns: spoken form
        """
        if punct_pre_process:
            text = pre_process(text)
        text = text.strip()
        if not text:
            if verbose:
@@ -222,10 +227,20 @@ def parse_args():
    parser.add_argument(
        "--punct_post_process", help="set to True to enable punctuation post processing", action="store_true"
    )
    parser.add_argument(
        "--punct_pre_process", help="set to True to enable punctuation pre processing", action="store_true"
    )
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    normalizer = Normalizer(input_case=args.input_case)
    print(normalizer.normalize(args.input_string, verbose=args.verbose, punct_post_process=args.punct_post_process))
    print(
        normalizer.normalize(
            args.input_string,
            verbose=args.verbose,
            punct_pre_process=args.punct_pre_process,
            punct_post_process=args.punct_post_process,
        )
    )
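
A hedged usage sketch of the extended normalize() API above; the input_case value and the sample string are assumptions, and the exact spoken form depends on the grammars:

from nemo_text_processing.text_normalization.normalize import Normalizer

normalizer = Normalizer(input_case='cased')
# punct_pre_process pads '[25]' to ' [ 25 ] ' so the cardinal grammar can reach
# the number; punct_post_process then re-attaches punctuation to the result
print(normalizer.normalize('[25]', verbose=False, punct_pre_process=True, punct_post_process=True))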
98 changes: 48 additions & 50 deletions nemo_text_processing/text_normalization/normalize_with_audio.py
@@ -14,16 +14,15 @@

import json
import os
import re
import time
from argparse import ArgumentParser
from typing import List, Tuple

from nemo_text_processing.text_normalization.data_loader_utils import post_process_punctuation
from joblib import Parallel, delayed
from nemo_text_processing.text_normalization.data_loader_utils import post_process_punctuation, pre_process
from nemo_text_processing.text_normalization.normalize import Normalizer
from nemo_text_processing.text_normalization.taggers.tokenize_and_classify import ClassifyFst
from nemo_text_processing.text_normalization.verbalizers.verbalize_final import VerbalizeFinalFst
from tqdm import tqdm

from nemo.collections.asr.metrics.wer import word_error_rate
from nemo.collections.asr.models import ASRModel
@@ -79,20 +78,30 @@ def __init__(self, input_case: str):
        self.tagger = ClassifyFst(input_case=input_case, deterministic=False)
        self.verbalizer = VerbalizeFinalFst(deterministic=False)

    def normalize(self, text: str, n_tagged: int, punct_post_process: bool = True, verbose: bool = False) -> str:
    def normalize(
        self,
        text: str,
        n_tagged: int,
        punct_pre_process: bool = True,
        punct_post_process: bool = True,
        verbose: bool = False,
    ) -> str:
        """
        Main function. Normalizes tokens from written to spoken form
        e.g. 12 kg -> twelve kilograms

        Args:
            text: string that may include semiotic classes
            n_tagged: number of tagged options to consider, -1 - to get all possible tagged options
            punct_pre_process: whether to perform punctuation pre-processing, for example, [25] -> [ 25 ]
            punct_post_process: whether to normalize punctuation
            verbose: whether to print intermediate meta information

        Returns:
            normalized text options (usually there are multiple ways of normalizing a given semiotic class)
        """
        if punct_pre_process:
            text = pre_process(text)
        text = text.strip()
        if not text:
            if verbose:
@@ -108,7 +117,6 @@ def normalize(self, text: str, n_tagged: int, punct_post_process: bool = True, v
        normalized_texts = []
        for tagged_text in tagged_texts:
            self._verbalize(tagged_text, normalized_texts)

        if len(normalized_texts) == 0:
            raise ValueError()
        if punct_post_process:
@@ -183,36 +191,12 @@ def calculate_cer(normalized_texts: List[str], transcript: str, remove_punct=Fal
        text_clean = text.replace('-', ' ').lower()
        if remove_punct:
            for punct in "!?:;,.-()*+-/<=>@^_":
                text_clean = text_clean.replace(punct, " ")
            text_clean = re.sub(r' +', ' ', text_clean)
                text_clean = text_clean.replace(punct, "")
        cer = round(word_error_rate([transcript], [text_clean], use_cer=True) * 100, 2)
        normalized_options.append((text, cer))
    return normalized_options
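
To make the selection criterion concrete, a small hedged sketch of the lowest-CER ranking that calculate_cer and select_best_match implement; the candidate strings are illustrative:

from nemo.collections.asr.metrics.wer import word_error_rate

# Score each normalization option against the ASR transcript and keep the
# option with the lowest character error rate
candidates = ['six hundred twenty seven', 'six two seven']
transcript = 'six two seven'
best = min(candidates, key=lambda c: word_error_rate([transcript], [c], use_cer=True))
print(best)  # six two seven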


def pre_process(text: str) -> str:
    """
    Adds space around punctuation marks

    Args:
        text: string that may include semiotic classes

    Returns: text with spaces around punctuation marks
    """
    text = text.replace('--', '-')
    space_right = '!?:;,.-()*+-/<=>@^_'
    space_both = '-()*+-/<=>@^_'

    for punct in space_right:
        text = text.replace(punct, punct + ' ')
    for punct in space_both:
        text = text.replace(punct, ' ' + punct + ' ')

    # remove extra space
    text = re.sub(r' +', ' ', text)
    return text


def get_asr_model(asr_model: ASRModel):
    """
    Returns ASR Model
@@ -249,12 +233,36 @@ def parse_args():
    )
    parser.add_argument("--verbose", help="print info for debugging", action="store_true")
    parser.add_argument("--remove_punct", help="remove punctuation before calculating cer", action="store_true")
    parser.add_argument(
        "--no_punct_pre_process", help="set to True to disable punctuation pre processing", action="store_true"
    )
    parser.add_argument(
        "--no_punct_post_process", help="set to True to disable punctuation post processing", action="store_true"
    )
    return parser.parse_args()


def _normalize_line(normalizer: NormalizerWithAudio, line: str, args, asr_model: ASRModel = None):
    # args is passed in explicitly so that joblib worker processes do not
    # depend on a module-level global that is only defined under __main__
    line = json.loads(line)
    audio = line['audio_filepath']
    if 'transcript' in line:
        transcript = line['transcript']
    else:
        transcript = asr_model.transcribe([audio])[0]

    normalized_texts = normalizer.normalize(
        text=line['text'],
        verbose=args.verbose,
        n_tagged=args.n_tagged,
        punct_pre_process=not args.no_punct_pre_process,
        punct_post_process=not args.no_punct_post_process,
    )
    normalized_text, cer = normalizer.select_best_match(normalized_texts, transcript, args.verbose, args.remove_punct)
    line['nemo_normalized'] = normalized_text
    line['CER_nemo_normalized'] = cer
    return line


def normalize_manifest(args):
    """
    Args:
@@ -265,26 +273,15 @@
    asr_model = None
    with open(args.audio_data, 'r') as f:
        with open(manifest_out, 'w') as f_out:
            for line in tqdm(f):
                line = json.loads(line)
                audio = line['audio_filepath']
                if 'transcript' in line:
                    transcript = line['transcript']
                else:
                    if asr_model is None:
                        asr_model = get_asr_model(args.model)
                    transcript = asr_model.transcribe([audio])[0]
                normalized_texts = normalizer.normalize(
                    text=line['text'],
                    verbose=args.verbose,
                    n_tagged=args.n_tagged,
                    punct_post_process=not args.no_punct_post_process,
                )
                normalized_text, cer = normalizer.select_best_match(
                    normalized_texts, transcript, args.verbose, args.remove_punct
                )
                line['nemo_normalized'] = normalized_text
                line['CER_nemo_normalized'] = cer
            lines = f.readlines()
            first_line = json.loads(lines[0])
            if 'transcript' not in first_line:
                asr_model = get_asr_model(args.model)
            normalized_lines = Parallel(n_jobs=-1)(
                delayed(_normalize_line)(normalizer, line, args, asr_model) for line in lines
            )

            for line in normalized_lines:
                f_out.write(json.dumps(line, ensure_ascii=False) + '\n')
    print(f'Normalized version saved at {manifest_out}')
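
Design note on the change above: manifest lines are now read up front and dispatched with joblib, so per-line transcription and normalization run in parallel, with n_jobs=-1 using all available cores. A hedged minimal sketch of the same fan-out pattern, with a stand-in function in place of _normalize_line:

from joblib import Parallel, delayed

def square(x):
    # stand-in for per-line normalization work
    return x * x

results = Parallel(n_jobs=-1)(delayed(square)(i) for i in range(4))
print(results)  # [0, 1, 4, 9]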

@@ -302,6 +299,7 @@
        text=args.text,
        verbose=args.verbose,
        n_tagged=args.n_tagged,
        punct_pre_process=not args.no_punct_pre_process,
        punct_post_process=not args.no_punct_post_process,
    )
    if args.audio_data:
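
A hedged end-to-end sketch of the updated NormalizerWithAudio flow, combining the new punct_pre_process flag with best-match selection; the input_case value, the sample text, the transcript, and the candidate outputs shown in the comment are all assumptions:

from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio

normalizer = NormalizerWithAudio(input_case='cased')
options = normalizer.normalize(text='627', n_tagged=-1, punct_pre_process=True, punct_post_process=True)
# options could include 'six hundred twenty seven', 'six two seven', etc.
best, cer = normalizer.select_best_match(options, 'six two seven', False, True)
print(best, cer)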
@@ -104,7 +104,7 @@ def get_serial_graph(self):
        letter_pronunciation = pynini.string_map(load_labels(get_abs_path("data/letter_pronunciation.tsv")))
        alpha |= letter_pronunciation

        delimiter = insert_space | pynini.cross("-", " ")
        delimiter = insert_space | pynini.cross("-", " ") | pynini.cross("/", " ")
        letter_num = pynini.closure(alpha + delimiter, 1) + num_graph
        num_letter = pynini.closure(num_graph + delimiter, 1) + alpha
        next_alpha_or_num = pynini.closure(delimiter + (alpha | num_graph))
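
A standalone pynini toy (an illustration, not the PR's actual serial grammar) showing the effect of the added pynini.cross("/", " ") alternative: '/' now behaves like '-' as a serial-number delimiter:

import pynini

# Illustrative alphabet and rewrite rule; the real grammar composes delimiter
# into the letter_num / num_letter closures instead
delimiter = pynini.cross("-", " ") | pynini.cross("/", " ")
sigma = pynini.closure(pynini.union(*"abcdefghijklmnopqrstuvwxyz0123456789-/ "))
rule = pynini.cdrewrite(delimiter, "", "", sigma)
print(pynini.shortestpath(pynini.accep("ab/12-cd") @ rule).string())  # ab 12 cd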