Skip to content

Commit

Permalink
Fix tn bugs (#3580)
Browse files Browse the repository at this point in the history
* added ssn support

Signed-off-by: Yang Zhang <yangzhang@nvidia.com>

* added ssn

Signed-off-by: Yang Zhang <yangzhang@nvidia.com>

* add more support for itn electronic

Signed-off-by: Yang Zhang <yangzhang@nvidia.com>

* add word support to electronic

Signed-off-by: Yang Zhang <yangzhang@nvidia.com>

* updated ci

Signed-off-by: Yang Zhang <yangzhang@nvidia.com>
  • Loading branch information
yzhang123 authored and nithinraok committed Feb 2, 2022
1 parent 5ac16b5 commit fb1c9d1
Show file tree
Hide file tree
Showing 6 changed files with 50 additions and 45 deletions.
18 changes: 9 additions & 9 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -111,18 +111,18 @@ pipeline {
parallel {
stage('En TN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py "1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/01-27.1'
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py "1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-1'
}
}
stage('En ITN grammars') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en "twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/01-31'
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en "twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-1'
}
}
stage('Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') {
steps {
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/01-31'
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/01-31'
sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-1'
sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-1'
}
}
}
Expand All @@ -139,7 +139,7 @@ pipeline {
parallel {
stage('L2: Eng TN') {
steps {
sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/12-15 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-1 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
sh 'cd nemo_text_processing/text_normalization/ && python run_predict.py --input=/home/TestData/nlp/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output=/home/TestData/nlp/text_norm/output/test.pynini.txt --verbose'
sh 'cat /home/TestData/nlp/text_norm/output/test.pynini.txt'
sh 'cmp --silent /home/TestData/nlp/text_norm/output/test.pynini.txt /home/TestData/nlp/text_norm/ci/test_goal_py_12-10.txt || exit 1'
Expand All @@ -149,7 +149,7 @@ pipeline {

stage('L2: Eng ITN export') {
steps {
sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/01-31 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-1 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
sh 'cd nemo_text_processing/inverse_text_normalization/ && python run_predict.py --input=/home/TestData/nlp/text_denorm/ci/test.txt --language=en --output=/home/TestData/nlp/text_denorm/output/test.pynini.txt --verbose'
sh 'cmp --silent /home/TestData/nlp/text_denorm/output/test.pynini.txt /home/TestData/nlp/text_denorm/ci/test_goal_py.txt || exit 1'
sh 'rm -rf /home/TestData/nlp/text_denorm/output/*'
Expand All @@ -158,23 +158,23 @@ pipeline {
stage('L2: TN with Audio (audio and raw text)') {
steps {
sh 'cd nemo_text_processing/text_normalization && \
python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/12-15 --text "The total amounts to \\$4.76." \
python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-1 --text "The total amounts to \\$4.76." \
--audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_raw.txt 2>&1 && \
cmp --silent /tmp/out_raw.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
}
}
stage('L2: TN with Audio (audio and text file)') {
steps {
sh 'cd nemo_text_processing/text_normalization && \
python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/12-15 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-1 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
--audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_file.txt 2>&1 && \
cmp --silent /tmp/out_file.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
}
}
stage('L2: TN with Audio (manifest)') {
steps {
sh 'cd nemo_text_processing/text_normalization && \
python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/12-15'
python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/2-1'
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@ st. saint
es3 e s three
s&p s and p
ASAP a s a p
AT&T a t and t
LLP l l p
ATM a t m
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,10 @@ def __init__(self):

accepted_username = alpha_num | symbols
process_dot = pynini.cross("dot", ".")
username = (
pynutil.insert("username: \"")
+ alpha_num
+ pynini.closure(delete_extra_space + accepted_username)
+ pynutil.insert("\"")
username = (alpha_num + pynini.closure(delete_extra_space + accepted_username)) | pynutil.add_weight(
pynini.closure(NEMO_ALPHA, 1), weight=0.0001
)
username = pynutil.insert("username: \"") + username + pynutil.insert("\"")
single_alphanum = pynini.closure(alpha_num + delete_extra_space) + alpha_num
server = single_alphanum | pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
domain = single_alphanum | pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
Expand Down Expand Up @@ -79,7 +77,11 @@ def __init__(self):
)

protocol_default = (
(pynini.closure(delete_extra_space + accepted_username, 1) | server) + pynini.closure(ending, 1)
(
(pynini.closure(delete_extra_space + accepted_username, 1) | server)
| pynutil.add_weight(pynini.closure(NEMO_ALPHA, 1), weight=0.0001)
)
+ pynini.closure(ending, 1)
).optimize()
protocol = (
pynini.closure(protocol_start, 0, 1) + protocol_end + delete_extra_space + process_dot + protocol_default
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def __init__(self, cardinal: GraphFst):
digit_to_str = pynini.invert(
pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
).optimize() | pynini.cross("0", pynini.union("o", "oh", "zero"))

str_to_digit = pynini.invert(digit_to_str)
double_digit = pynini.union(
*[
pynini.cross(
Expand All @@ -59,33 +59,26 @@ def __init__(self, cardinal: GraphFst):
]
)
double_digit.invert()
number_part = (
pynini.closure(digit_to_str + insert_space, 2, 2)
+ digit_to_str
+ pynutil.delete("-")
+ insert_space
+ pynini.closure(digit_to_str + insert_space, 2, 2)
+ digit_to_str
+ pynutil.delete("-")
+ insert_space
+ pynini.closure(digit_to_str + insert_space, 3, 3)
+ digit_to_str
)
number_part = (
pynutil.insert("number_part: \"")
+ pynini.cdrewrite(double_digit, "", "", NEMO_SIGMA) @ pynini.invert(number_part)
+ pynutil.insert("\"")
)

str_to_digit = pynini.invert(digit_to_str)
# to handle cases like "one twenty three"
two_digit_cardinal = pynini.compose(cardinal.graph_no_exception, NEMO_DIGIT ** 2)
cardinal_option = (
(str_to_digit + pynutil.delete(" ") + two_digit_cardinal)
| two_digit_cardinal
| (two_digit_cardinal + pynutil.delete(" ") + str_to_digit)
double_digit_to_digit = (
pynini.compose(double_digit, str_to_digit + pynutil.delete(" ") + str_to_digit) | two_digit_cardinal
)

single_or_double_digit = (double_digit_to_digit | str_to_digit).optimize()
single_or_double_digit = (
single_or_double_digit + pynini.closure(pynutil.delete(" ") + single_or_double_digit)
).optimize()

number_part = pynini.compose(
single_or_double_digit,
NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 4,
).optimize()
number_part = pynutil.insert("number_part: \"") + number_part.optimize() + pynutil.insert("\"")

cardinal_option = pynini.compose(single_or_double_digit, NEMO_DIGIT ** (2, 3))

country_code = (
pynutil.insert("country_code: \"")
+ pynini.closure(pynini.cross("plus ", "+"), 0, 1)
Expand All @@ -95,14 +88,17 @@ def __init__(self, cardinal: GraphFst):
optional_country_code = pynini.closure(country_code + pynutil.delete(" ") + insert_space, 0, 1).optimize()
graph = optional_country_code + number_part

# card number
double_digit_to_digit = pynini.compose(double_digit, str_to_digit + pynutil.delete(" ") + str_to_digit)
card_graph = (double_digit_to_digit | str_to_digit).optimize()
card_graph = (card_graph + pynini.closure(pynutil.delete(" ") + card_graph)).optimize()
# reformat card number, group by four
# credit card number
space_four_digits = insert_space + NEMO_DIGIT ** 4
card_graph = pynini.compose(card_graph, NEMO_DIGIT ** 4 + space_four_digits ** 3).optimize()
graph |= pynutil.insert("number_part: \"") + card_graph.optimize() + pynutil.insert("\"")
credit_card_graph = pynini.compose(single_or_double_digit, NEMO_DIGIT ** 4 + space_four_digits ** 3).optimize()
graph |= pynutil.insert("number_part: \"") + credit_card_graph.optimize() + pynutil.insert("\"")

# SSN
ssn_graph = pynini.compose(
single_or_double_digit,
NEMO_DIGIT ** 3 + pynutil.insert("-") + NEMO_DIGIT ** 2 + pynutil.insert("-") + NEMO_DIGIT ** 4,
).optimize()
graph |= pynutil.insert("number_part: \"") + ssn_graph.optimize() + pynutil.insert("\"")

# ip
digit_or_double = pynini.closure(str_to_digit + pynutil.delete(" "), 0, 1) + double_digit_to_digit
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,6 @@ h t t p colon slash slash w w w dot c o m d a i l y n e w s dot a b slash s m~ht
w w w dot c o m d a i l y n e w s dot a b slash s m~www.comdailynews.ab/sm
c o m d a i l y n e w s dot a b slash s m~comdailynews.ab/sm
n vidia dot com~nvidia.com
abc at gmail dot com~abc@gmail.com
athreed at gmail dot com~athreed@gmail.com
kore dot ai~kore.ai
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ four three two double seven three two one four three two one four three double z
one two three dot one two three dot o dot four o~123.123.0.40
one twenty three dot one two three dot o dot four o~123.123.0.40
two two five dot double five dot o dot four o~225.55.0.40
ssn is seven double nine one two three double one three~ssn is 799-12-3113

0 comments on commit fb1c9d1

Please sign in to comment.