Fix TN bugs (#3580)

* added ssn support
  Signed-off-by: Yang Zhang <yangzhang@nvidia.com>
* added ssn
  Signed-off-by: Yang Zhang <yangzhang@nvidia.com>
* add more support for itn electronic
  Signed-off-by: Yang Zhang <yangzhang@nvidia.com>
* add work support to electronic
  Signed-off-by: Yang Zhang <yangzhang@nvidia.com>
* updated ci
  Signed-off-by: Yang Zhang <yangzhang@nvidia.com>
NVIDIA · Feb 2, 2022 · fb1c9d1 · fb1c9d1
1 parent 5ac16b5
commit fb1c9d1
Show file tree

Hide file tree

Showing 6 changed files with 50 additions and 45 deletions.
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -111,18 +111,18 @@ pipeline {
       parallel {
         stage('En TN grammars') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py "1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/01-27.1'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py "1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-1'
           }
         }
         stage('En ITN grammars') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en "twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/01-31'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en "twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-1'
           }
         }
         stage('Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/01-31'
-            sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/01-31'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-1'
+            sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-1'
           }
         }
       }
@@ -139,7 +139,7 @@ pipeline {
       parallel {
         stage('L2: Eng TN') {
           steps {
-            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/12-15 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
+            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-1 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
             sh 'cd nemo_text_processing/text_normalization/ &&  python run_predict.py --input=/home/TestData/nlp/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output=/home/TestData/nlp/text_norm/output/test.pynini.txt --verbose'
             sh 'cat /home/TestData/nlp/text_norm/output/test.pynini.txt'
             sh 'cmp --silent /home/TestData/nlp/text_norm/output/test.pynini.txt /home/TestData/nlp/text_norm/ci/test_goal_py_12-10.txt || exit 1'
@@ -149,7 +149,7 @@ pipeline {
 
         stage('L2: Eng ITN export') {
           steps {
-            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/01-31 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
+            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-1 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
             sh 'cd nemo_text_processing/inverse_text_normalization/ &&  python run_predict.py --input=/home/TestData/nlp/text_denorm/ci/test.txt --language=en --output=/home/TestData/nlp/text_denorm/output/test.pynini.txt --verbose'
             sh 'cmp --silent /home/TestData/nlp/text_denorm/output/test.pynini.txt /home/TestData/nlp/text_denorm/ci/test_goal_py.txt || exit 1'
             sh 'rm -rf /home/TestData/nlp/text_denorm/output/*'
@@ -158,23 +158,23 @@ pipeline {
         stage('L2: TN with Audio (audio and raw text)') {
           steps {
             sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/12-15 --text "The total amounts to \\$4.76." \
+            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-1 --text "The total amounts to \\$4.76." \
             --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_raw.txt 2>&1 && \
             cmp --silent /tmp/out_raw.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
           }
         }
         stage('L2: TN with Audio (audio and text file)') {
           steps {
             sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/12-15 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
+            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-1 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
             --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_file.txt 2>&1 && \
             cmp --silent /tmp/out_file.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
           }
         }
         stage('L2: TN with Audio (manifest)') {
           steps {
             sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/12-15'
+            python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-1'
           }
         }
       }

diff --git a/nemo_text_processing/inverse_text_normalization/en/data/whitelist.tsv b/nemo_text_processing/inverse_text_normalization/en/data/whitelist.tsv
@@ -7,3 +7,6 @@ st.	saint
 es3	e s three
 s&p	s and p
 ASAP	a s a p
+AT&T	a t and t
+LLP	l l p
+ATM	a t m
diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py b/nemo_text_processing/inverse_text_normalization/en/taggers/electronic.py
@@ -45,12 +45,10 @@ def __init__(self):
 
         accepted_username = alpha_num | symbols
         process_dot = pynini.cross("dot", ".")
-        username = (
-            pynutil.insert("username: \"")
-            + alpha_num
-            + pynini.closure(delete_extra_space + accepted_username)
-            + pynutil.insert("\"")
+        username = (alpha_num + pynini.closure(delete_extra_space + accepted_username)) | pynutil.add_weight(
+            pynini.closure(NEMO_ALPHA, 1), weight=0.0001
         )
+        username = pynutil.insert("username: \"") + username + pynutil.insert("\"")
         single_alphanum = pynini.closure(alpha_num + delete_extra_space) + alpha_num
         server = single_alphanum | pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
         domain = single_alphanum | pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
@@ -79,7 +77,11 @@ def __init__(self):
         )
 
         protocol_default = (
-            (pynini.closure(delete_extra_space + accepted_username, 1) | server) + pynini.closure(ending, 1)
+            (
+                (pynini.closure(delete_extra_space + accepted_username, 1) | server)
+                | pynutil.add_weight(pynini.closure(NEMO_ALPHA, 1), weight=0.0001)
+            )
+            + pynini.closure(ending, 1)
         ).optimize()
         protocol = (
             pynini.closure(protocol_start, 0, 1) + protocol_end + delete_extra_space + process_dot + protocol_default

diff --git a/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/en/taggers/telephone.py
@@ -46,7 +46,7 @@ def __init__(self, cardinal: GraphFst):
         digit_to_str = pynini.invert(
             pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
         ).optimize() | pynini.cross("0", pynini.union("o", "oh", "zero"))
-
+        str_to_digit = pynini.invert(digit_to_str)
         double_digit = pynini.union(
             *[
                 pynini.cross(
@@ -59,33 +59,26 @@ def __init__(self, cardinal: GraphFst):
             ]
         )
         double_digit.invert()
-        number_part = (
-            pynini.closure(digit_to_str + insert_space, 2, 2)
-            + digit_to_str
-            + pynutil.delete("-")
-            + insert_space
-            + pynini.closure(digit_to_str + insert_space, 2, 2)
-            + digit_to_str
-            + pynutil.delete("-")
-            + insert_space
-            + pynini.closure(digit_to_str + insert_space, 3, 3)
-            + digit_to_str
-        )
-        number_part = (
-            pynutil.insert("number_part: \"")
-            + pynini.cdrewrite(double_digit, "", "", NEMO_SIGMA) @ pynini.invert(number_part)
-            + pynutil.insert("\"")
-        )
 
-        str_to_digit = pynini.invert(digit_to_str)
         # to handle cases like "one twenty three"
         two_digit_cardinal = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT ** 2)
-        cardinal_option = (
-            (str_to_digit + pynutil.delete(" ") + two_digit_cardinal)
-            | two_digit_cardinal
-            | (two_digit_cardinal + pynutil.delete(" ") + str_to_digit)
+        double_digit_to_digit = (
+            pynini.compose(double_digit, str_to_digit + pynutil.delete(" ") + str_to_digit) | two_digit_cardinal
         )
 
+        single_or_double_digit = (double_digit_to_digit | str_to_digit).optimize()
+        single_or_double_digit = (
+            single_or_double_digit + pynini.closure(pynutil.delete(" ") + single_or_double_digit)
+        ).optimize()
+
+        number_part = pynini.compose(
+            single_or_double_digit,
+            NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 4,
+        ).optimize()
+        number_part = pynutil.insert("number_part: \"") + number_part.optimize() + pynutil.insert("\"")
+
+        cardinal_option = pynini.compose(single_or_double_digit, NEMO_DIGIT ** (2, 3))
+
         country_code = (
             pynutil.insert("country_code: \"")
             + pynini.closure(pynini.cross("plus ", "+"), 0, 1)
@@ -95,14 +88,17 @@ def __init__(self, cardinal: GraphFst):
         optional_country_code = pynini.closure(country_code + pynutil.delete(" ") + insert_space, 0, 1).optimize()
         graph = optional_country_code + number_part
 
-        # card number
-        double_digit_to_digit = pynini.compose(double_digit, str_to_digit + pynutil.delete(" ") + str_to_digit)
-        card_graph = (double_digit_to_digit | str_to_digit).optimize()
-        card_graph = (card_graph + pynini.closure(pynutil.delete(" ") + card_graph)).optimize()
-        # reformat card number, group by four
+        # credit card number
         space_four_digits = insert_space + NEMO_DIGIT ** 4
-        card_graph = pynini.compose(card_graph, NEMO_DIGIT ** 4 + space_four_digits ** 3).optimize()
-        graph |= pynutil.insert("number_part: \"") + card_graph.optimize() + pynutil.insert("\"")
+        credit_card_graph = pynini.compose(single_or_double_digit, NEMO_DIGIT ** 4 + space_four_digits ** 3).optimize()
+        graph |= pynutil.insert("number_part: \"") + credit_card_graph.optimize() + pynutil.insert("\"")
+
+        # SSN
+        ssn_graph = pynini.compose(
+            single_or_double_digit,
+            NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 2 + pynutil.insert("-") + NEMO_DIGIT ** 4,
+        ).optimize()
+        graph |= pynutil.insert("number_part: \"") + ssn_graph.optimize() + pynutil.insert("\"")
 
         # ip
         digit_or_double = pynini.closure(str_to_digit + pynutil.delete(" "), 0, 1) + double_digit_to_digit

diff --git a/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_electronic.txt
@@ -20,3 +20,6 @@ h t t p colon slash slash w w w dot c o m d a i l y n e w s dot a b slash s m~ht
 w w w dot c o m d a i l y n e w s dot a b slash s m~www.comdailynews.ab/sm
 c o m d a i l y n e w s dot a b slash s m~comdailynews.ab/sm
 n vidia dot com~nvidia.com
+abc at gmail dot com~abc@gmail.com
+athreed at gmail dot com~athreed@gmail.com
+kore dot ai~kore.ai
diff --git a/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/en/data_inverse_text_normalization/test_cases_telephone.txt
@@ -10,3 +10,4 @@ four three two double seven three two one four three two one four three double z
 one two three dot one two three dot o dot four o~123.123.0.40
 one twenty three dot one two three dot o dot four o~123.123.0.40
 two two five dot double five dot o dot four o~225.55.0.40
+ssn is seven double nine one two three double one three~ssn is 799-12-3113