From 1fc3108f1e7592f710a9cbd43750c8c15ce410d4 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 27 Jul 2023 16:14:43 -0700 Subject: [PATCH 01/17] updates on itn grammar to pass sparrowhawk tests Signed-off-by: BuyuanCui --- .../zh/taggers/cardinal.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py index b29fc5fb3..28239786a 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py @@ -78,7 +78,7 @@ def __init__(self): | (graph_digits + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all) | (graph_digits + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits) ) - graph_ten_thousands = graph_ten_thousands_simple | graph_ten_thousands_complex | pynutil.insert("00000") + graph_ten_thousands = (pynutil.add_weight(graph_ten_thousands_simple, -1.0) | graph_ten_thousands_complex | pynutil.insert("00000")) # grammmar for hundred thousands 十万 graph_hundred_thousands_simple = graph_all + closure_ten_thousands @@ -88,8 +88,8 @@ def __init__(self): | (graph_all + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all) | (graph_all + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits) ) - graph_hundred_thousands = (graph_hundred_thousands_simple | graph_hundred_thousands_complex) | pynutil.insert( - "000000" + graph_hundred_thousands = (pynutil.add_weight(graph_hundred_thousands_simple, -1.0) | graph_hundred_thousands_complex | pynutil.insert( + "000000") ) # grammar for millions 百万 @@ -168,7 +168,7 @@ def __init__(self): | (graph_digits + delete_hundred_millions + pynini.cross(pynini.closure("零"), "0000000") + graph_digits) ) graph_hundred_millions = ( - graph_hundred_millions_simple | 
graph_hundred_millions_complex | pynutil.insert("000000000") + pynutil.add_weight(graph_hundred_millions_simple, -1.0) | graph_hundred_millions_complex | pynutil.insert("000000000") ) # grammar for billions 十亿 @@ -203,7 +203,7 @@ def __init__(self): | (graph_all + delete_hundred_millions + pynini.cross(pynini.closure("零"), "000000") + graph_all) | (graph_all + delete_hundred_millions + pynini.cross(pynini.closure("零"), "0000000") + graph_digits) ) - graph_billions = graph_billions_simple | graph_billions_complex | pynutil.insert("0000000000") + graph_billions = (pynutil.add_weight(graph_billions_simple, -1.0) | graph_billions_complex | pynutil.insert("0000000000")) # grammar for ten billions 百亿 graph_ten_billions_simple = graph_hundreds_complex + closure_hundred_millions @@ -252,7 +252,7 @@ def __init__(self): + graph_digits ) ) - graph_ten_billions = graph_ten_billions_simple | graph_ten_billions_complex | pynutil.insert("00000000000") + graph_ten_billions = (pynutil.add_weight(graph_ten_billions_simple, -1.0) | graph_ten_billions_complex | pynutil.insert("00000000000")) # grammar for hundred billions 千亿 graph_hundred_billions_simple = graph_thousands_complex + closure_hundred_millions @@ -301,7 +301,7 @@ def __init__(self): + graph_digits ) ) - graph_hundred_billions = graph_hundred_billions_simple | graph_hundred_billions_complex + graph_hundred_billions = (pynutil.add_weight(graph_hundred_billions_simple, -1.0) | graph_hundred_billions_complex) # combining grammar; output for cardinal grammar graph = pynini.union( From e66cc851b23f4e631a53e6172a3172dd28c2c7f1 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 27 Jul 2023 16:14:59 -0700 Subject: [PATCH 02/17] updates for sparrowhawk tests Signed-off-by: BuyuanCui --- .../zh/taggers/decimal.py | 33 ++++++------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py 
b/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py index f334f2675..1d6d2a9a4 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py @@ -63,40 +63,27 @@ def get_quantity(decimal, cardinal): return res - class DecimalFst(GraphFst): def __init__(self, cardinal: GraphFst): super().__init__(name="decimal", kind="classify") - cardinal_after_decimal = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) - cardinal_before_decimal = cardinal.just_cardinals | (pynini.closure(pynini.cross("零", "0"), 0, 1)) - - delete_decimal = pynutil.delete("点") | pynutil.delete( - "點" - ) # delete decimal character, 'point' in english in 'one point two for 1.2' + cardinal_after_decimal = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) | pynini.closure(pynini.cross("零", "0")) + cardinal_before_decimal = cardinal.just_cardinals | pynini.cross("零", "0") + + delete_decimal = pynutil.delete("点") | pynutil.delete("點") - # grammar for integer part graph_integer = ( pynutil.insert('integer_part: "') - + (cardinal_before_decimal | (pynini.closure(pynini.cross("零", "0"), 0, 1))) + + cardinal_before_decimal + pynutil.insert('" ') - ) # tokenization on just numbers - graph_integer_or_none = graph_integer | pynutil.insert('integer_part: "0" ', weight=0.01) # integer or zero + ) + + graph_string_of_cardinals = pynini.closure(cardinal_after_decimal, 1) + graph_fractional = pynutil.insert('fractional_part: "') + graph_string_of_cardinals + pynutil.insert('"') - # grammar for fractional part - delete_zero = pynini.closure(pynini.cross("零", "0")) - graph_string_of_cardinals = cardinal_after_decimal - graph_string_of_cardinals = pynini.closure( - (pynini.closure(graph_string_of_cardinals) + delete_zero + pynini.closure(graph_string_of_cardinals)), 1 - ) - graph_fractional = pynini.closure( - pynutil.insert('fractional_part: "') + graph_string_of_cardinals + 
pynutil.insert('"'), 1 - ) - # grammar for decimal: integer+delete character+part after decimal point - graph_decimal_no_sign = pynini.closure((graph_integer_or_none + delete_decimal + graph_fractional), 1) + graph_decimal_no_sign = pynini.closure((graph_integer + delete_decimal + graph_fractional), 1) - # New Grammar added for Money self.final_graph_wo_negative = graph_decimal_no_sign | get_quantity( graph_decimal_no_sign, cardinal.just_cardinals ) From a4a2ed19fdb2e903dd60b47cbf9be1e1236c4888 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 27 Jul 2023 16:15:15 -0700 Subject: [PATCH 03/17] updates for sparrowhawk tests Signed-off-by: BuyuanCui --- .../zh/verbalizers/decimal.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py index ea8fa4ab0..882363d2c 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py @@ -32,14 +32,6 @@ def __init__(self): # insert a "," for every three numbers before decimal point space_every_three_integer = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure() - # insert a "," for every three numbers after decimal point - space_every_three_decimal = ( - pynini.accep(".") + (exactly_three_digits + pynutil.insert(",")).closure() + at_most_three_digits - ) - - # combine both - group_by_threes = space_every_three_integer | space_every_three_decimal - self.group_by_threes = group_by_threes # removing tokenizations, 'negative: ' optional_sign = pynini.closure( @@ -56,10 +48,10 @@ def __init__(self): pynutil.delete("integer_part:") + delete_space + pynutil.delete('"') - + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynini.closure(NEMO_DIGIT, 1) + pynutil.delete('"') ) - integer = integer @ 
space_every_three_integer optional_integer = pynini.closure(integer + delete_space, 0, 1) # removing tokenizations, 'fractionl_part' @@ -81,10 +73,11 @@ def __init__(self): + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') ) - optional_quantity = pynini.closure(quantity + delete_space) + optional_quantity = pynini.closure(delete_space + quantity) # combining graphs removing tokenizations *3 graph = (optional_integer + optional_fractional + optional_quantity).optimize() + graph = optional_sign + graph # add optional sign for negative number self.numebrs = graph delete_tokens = self.delete_tokens(graph) From d1f8be2ff924799e15c03d51fc3e2e368af5f08f Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 27 Jul 2023 16:27:07 -0700 Subject: [PATCH 04/17] coding style fix Signed-off-by: BuyuanCui --- .../zh/taggers/cardinal.py | 30 ++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py index 28239786a..c1b462472 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py @@ -78,7 +78,11 @@ def __init__(self): | (graph_digits + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all) | (graph_digits + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits) ) - graph_ten_thousands = (pynutil.add_weight(graph_ten_thousands_simple, -1.0) | graph_ten_thousands_complex | pynutil.insert("00000")) + graph_ten_thousands = ( + pynutil.add_weight(graph_ten_thousands_simple, -1.0) + | graph_ten_thousands_complex + | pynutil.insert("00000") + ) # grammmar for hundred thousands 十万 graph_hundred_thousands_simple = graph_all + closure_ten_thousands @@ -88,8 +92,10 @@ def __init__(self): | (graph_all + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + 
graph_all) | (graph_all + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits) ) - graph_hundred_thousands = (pynutil.add_weight(graph_hundred_thousands_simple, -1.0) | graph_hundred_thousands_complex | pynutil.insert( - "000000") + graph_hundred_thousands = ( + pynutil.add_weight(graph_hundred_thousands_simple, -1.0) + | graph_hundred_thousands_complex + | pynutil.insert("000000") ) # grammar for millions 百万 @@ -168,7 +174,9 @@ def __init__(self): | (graph_digits + delete_hundred_millions + pynini.cross(pynini.closure("零"), "0000000") + graph_digits) ) graph_hundred_millions = ( - pynutil.add_weight(graph_hundred_millions_simple, -1.0) | graph_hundred_millions_complex | pynutil.insert("000000000") + pynutil.add_weight(graph_hundred_millions_simple, -1.0) + | graph_hundred_millions_complex + | pynutil.insert("000000000") ) # grammar for billions 十亿 @@ -203,7 +211,9 @@ def __init__(self): | (graph_all + delete_hundred_millions + pynini.cross(pynini.closure("零"), "000000") + graph_all) | (graph_all + delete_hundred_millions + pynini.cross(pynini.closure("零"), "0000000") + graph_digits) ) - graph_billions = (pynutil.add_weight(graph_billions_simple, -1.0) | graph_billions_complex | pynutil.insert("0000000000")) + graph_billions = ( + pynutil.add_weight(graph_billions_simple, -1.0) | graph_billions_complex | pynutil.insert("0000000000") + ) # grammar for ten billions 百亿 graph_ten_billions_simple = graph_hundreds_complex + closure_hundred_millions @@ -252,7 +262,11 @@ def __init__(self): + graph_digits ) ) - graph_ten_billions = (pynutil.add_weight(graph_ten_billions_simple, -1.0) | graph_ten_billions_complex | pynutil.insert("00000000000")) + graph_ten_billions = ( + pynutil.add_weight(graph_ten_billions_simple, -1.0) + | graph_ten_billions_complex + | pynutil.insert("00000000000") + ) # grammar for hundred billions 千亿 graph_hundred_billions_simple = graph_thousands_complex + closure_hundred_millions @@ -301,7 +315,9 @@ def __init__(self): + 
graph_digits ) ) - graph_hundred_billions = (pynutil.add_weight(graph_hundred_billions_simple, -1.0) | graph_hundred_billions_complex) + graph_hundred_billions = ( + pynutil.add_weight(graph_hundred_billions_simple, -1.0) | graph_hundred_billions_complex + ) # combining grammar; output for cardinal grammar graph = pynini.union( From 637cf1986fef1a04f528bb7bb1028227dbd7393a Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Thu, 27 Jul 2023 16:29:44 -0700 Subject: [PATCH 05/17] updates for coding style and sparrowhawk test Signed-off-by: BuyuanCui --- .../zh/taggers/decimal.py | 16 +++++++--------- .../zh/verbalizers/decimal.py | 2 +- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py index 1d6d2a9a4..33f437955 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py @@ -63,25 +63,23 @@ def get_quantity(decimal, cardinal): return res + class DecimalFst(GraphFst): def __init__(self, cardinal: GraphFst): super().__init__(name="decimal", kind="classify") - cardinal_after_decimal = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) | pynini.closure(pynini.cross("零", "0")) + cardinal_after_decimal = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) | pynini.closure( + pynini.cross("零", "0") + ) cardinal_before_decimal = cardinal.just_cardinals | pynini.cross("零", "0") - - delete_decimal = pynutil.delete("点") | pynutil.delete("點") - graph_integer = ( - pynutil.insert('integer_part: "') - + cardinal_before_decimal - + pynutil.insert('" ') - ) + delete_decimal = pynutil.delete("点") | pynutil.delete("點") + + graph_integer = pynutil.insert('integer_part: "') + cardinal_before_decimal + pynutil.insert('" ') graph_string_of_cardinals = pynini.closure(cardinal_after_decimal, 1) graph_fractional = 
pynutil.insert('fractional_part: "') + graph_string_of_cardinals + pynutil.insert('"') - graph_decimal_no_sign = pynini.closure((graph_integer + delete_decimal + graph_fractional), 1) self.final_graph_wo_negative = graph_decimal_no_sign | get_quantity( diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py index 882363d2c..ab9831783 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py @@ -77,7 +77,7 @@ def __init__(self): # combining graphs removing tokenizations *3 graph = (optional_integer + optional_fractional + optional_quantity).optimize() - + graph = optional_sign + graph # add optional sign for negative number self.numebrs = graph delete_tokens = self.delete_tokens(graph) From 4a71afe3d1994366f03751e88dc8ff1366402a77 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Tue, 1 Aug 2023 14:02:02 -0700 Subject: [PATCH 06/17] updated classes for tests on whitelist and word grammar Signed-off-by: BuyuanCui --- ..._sparrowhawk_inverse_text_normalization.sh | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/tests/nemo_text_processing/zh/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/zh/test_sparrowhawk_inverse_text_normalization.sh index 4ca12af7f..ade1027a7 100644 --- a/tests/nemo_text_processing/zh/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/zh/test_sparrowhawk_inverse_text_normalization.sh @@ -21,62 +21,62 @@ runtest () { } testITNCardinal() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_cardinal.txt + input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_cardinal.txt runtest $input } testITNDate() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_date.txt + 
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_date.txt runtest $input } testITNDecimal() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_decimal.txt + input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_decimal.txt runtest $input } testITNOrdinal() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_ordinal.txt + input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_ordinal.txt runtest $input } testITNFraction() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_fraction.txt + input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_fraction.txt runtest $input } testITNTime() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_time.txt + input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_time.txt runtest $input } -testITNMeasure() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_measure.txt - runtest $input -} +#testITNMeasure() { +# input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_measure.txt +# runtest $input +#} testITNMoney() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_money.txt + input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_money.txt runtest $input } testITNWhitelist() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_whitelist.txt + input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_whitelist.txt runtest $input } -testITNTelephone() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_telephone.txt - runtest $input -} +#testITNTelephone() { +# input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_telephone.txt +# runtest $input +#} -testITNElectronic() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_electronic.txt - runtest $input -} +#testITNElectronic() { +# input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_electronic.txt +# runtest $input +#} 
testITNWord() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_word.txt + input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_word.txt runtest $input } From 4067be82b3d9009789dfbce575bd1c271698a9fd Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Tue, 1 Aug 2023 14:02:43 -0700 Subject: [PATCH 07/17] added for tests on whitelist Signed-off-by: BuyuanCui --- .../test_cases_whitelist.txt | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_whitelist.txt diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_whitelist.txt new file mode 100644 index 000000000..f36dc4293 --- /dev/null +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_whitelist.txt @@ -0,0 +1,21 @@ +人力资源~HR +自动取款机~ATM +人力资源~HR +首席执行官~CEO +美国研究生入学考试~GRE +研究生管理专业入学考试~GMAT +全球定位系统~GPS +刷卡机~POS机 +数位多功能光碟~DVD +镭射唱片~CD +通用串行总线~USB +统一资源定位符~URL +虚拟专用网络~VPN +网络互联协议~IP +脱氧核糖核酸~DNA +核糖核酸~RNA +平均学分绩点~GPA +发光二极管~LED +可移植文档格式~PDF +社会性网络服务~SNS +博士~PhD From 2f101889f4bab855862798d696e5505a1282be4b Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Tue, 1 Aug 2023 14:02:57 -0700 Subject: [PATCH 08/17] added for test on word Signed-off-by: BuyuanCui --- .../test_cases_word.txt | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_word.txt diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_word.txt new file mode 100644 index 000000000..1d0cac255 --- /dev/null +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_word.txt @@ -0,0 +1,21 @@ +你好~你好 +年级~年级 +秘密~秘密 +键盘~键盘 +借口~借口 +学生~学生 +人力~人力 +转移~转移 +徘徊~徘徊 +冤枉~冤枉 +浏览~浏览 +珍藏~珍藏 
+患难 ~患难 +湿~湿 +眼眶~眼眶 +遗产~遗产 +流浪~流浪 +信仰~信仰 +戒指~戒指 +义无反顾~义无反顾 +交换~交换 From cc857f13a5ae42c7f8b96ac968762196b6a7cc15 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Tue, 1 Aug 2023 14:03:18 -0700 Subject: [PATCH 09/17] added to run test on whitelist Signed-off-by: BuyuanCui --- .../nemo_text_processing/zh/test_whitelist.py | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 tests/nemo_text_processing/zh/test_whitelist.py diff --git a/tests/nemo_text_processing/zh/test_whitelist.py b/tests/nemo_text_processing/zh/test_whitelist.py new file mode 100644 index 000000000..9b09f4d9d --- /dev/null +++ b/tests/nemo_text_processing/zh/test_whitelist.py @@ -0,0 +1,33 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio +from parameterized import parameterized + +from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file + + +class TestWhitelist: + inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_whitelist.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected From 406b9532509bae396984b69a1ffbbe0b1b6e7cb4 Mon Sep 17 00:00:00 2001 From: BuyuanCui Date: Tue, 1 Aug 2023 14:03:33 -0700 Subject: [PATCH 10/17] added to run test on word Signed-off-by: BuyuanCui --- tests/nemo_text_processing/zh/test_word.py | 35 ++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 tests/nemo_text_processing/zh/test_word.py diff --git a/tests/nemo_text_processing/zh/test_word.py b/tests/nemo_text_processing/zh/test_word.py new file mode 100644 index 000000000..663228633 --- /dev/null +++ b/tests/nemo_text_processing/zh/test_word.py @@ -0,0 +1,35 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio +from parameterized import parameterized + +from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file + + +class TestWord: + inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_word.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected + + \ No newline at end of file From 30fccd3ca0dbd55c39c64297f63a2b56169a688b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 1 Aug 2023 21:05:06 +0000 Subject: [PATCH 11/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/nemo_text_processing/zh/test_word.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/nemo_text_processing/zh/test_word.py b/tests/nemo_text_processing/zh/test_word.py index 663228633..8d3da9be5 100644 --- a/tests/nemo_text_processing/zh/test_word.py +++ b/tests/nemo_text_processing/zh/test_word.py @@ -31,5 +31,3 @@ class TestWord: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected - - \ No newline at end of file From 13432f90f25239053f11e8a9a4928068928c65f0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 15 Aug 2023 18:43:51 
+0000 Subject: [PATCH 12/17] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/nemo_text_processing/zh/test_word.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/nemo_text_processing/zh/test_word.py b/tests/nemo_text_processing/zh/test_word.py index c404a57bb..8d3da9be5 100644 --- a/tests/nemo_text_processing/zh/test_word.py +++ b/tests/nemo_text_processing/zh/test_word.py @@ -31,4 +31,3 @@ class TestWord: def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected - From 3cd5062fce84c9877cd3d31d1f46d2459c107709 Mon Sep 17 00:00:00 2001 From: "Buyuan(Alex) Cui" <69030297+BuyuanCui@users.noreply.github.com> Date: Wed, 16 Aug 2023 09:12:13 -0700 Subject: [PATCH 13/17] Update test_word.py Removed unused import. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> --- tests/nemo_text_processing/zh/test_word.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nemo_text_processing/zh/test_word.py b/tests/nemo_text_processing/zh/test_word.py index 8d3da9be5..2f2b8444f 100644 --- a/tests/nemo_text_processing/zh/test_word.py +++ b/tests/nemo_text_processing/zh/test_word.py @@ -19,7 +19,7 @@ from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio from parameterized import parameterized -from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file +from ..utils import CACHE_DIR, parse_test_case_file class TestWord: From 049917bdbf398f1975af0ed60bb488a7b4c33ca2 Mon Sep 17 00:00:00 2001 From: "Buyuan(Alex) Cui" <69030297+BuyuanCui@users.noreply.github.com> Date: Wed, 16 Aug 2023 09:15:09 -0700 Subject: [PATCH 14/17] Update test_word.py Removed imports according to CodeQL Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> --- tests/nemo_text_processing/zh/test_word.py | 2 -- 1 file changed, 2 
deletions(-) diff --git a/tests/nemo_text_processing/zh/test_word.py b/tests/nemo_text_processing/zh/test_word.py index 2f2b8444f..5e2e1da45 100644 --- a/tests/nemo_text_processing/zh/test_word.py +++ b/tests/nemo_text_processing/zh/test_word.py @@ -15,8 +15,6 @@ import pytest from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer -from nemo_text_processing.text_normalization.normalize import Normalizer -from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio from parameterized import parameterized from ..utils import CACHE_DIR, parse_test_case_file From 453ca80b2ce4a6978f61b687af76f969fde28c60 Mon Sep 17 00:00:00 2001 From: "Buyuan(Alex) Cui" <69030297+BuyuanCui@users.noreply.github.com> Date: Wed, 16 Aug 2023 09:16:29 -0700 Subject: [PATCH 15/17] Update test_whitelist.py Removing imports according to CodeQL Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> --- tests/nemo_text_processing/zh/test_whitelist.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/nemo_text_processing/zh/test_whitelist.py b/tests/nemo_text_processing/zh/test_whitelist.py index 9b09f4d9d..8b3e871b1 100644 --- a/tests/nemo_text_processing/zh/test_whitelist.py +++ b/tests/nemo_text_processing/zh/test_whitelist.py @@ -15,11 +15,9 @@ import pytest from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer -from nemo_text_processing.text_normalization.normalize import Normalizer -from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio from parameterized import parameterized -from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file +from ..utils import CACHE_DIR, parse_test_case_file class TestWhitelist: From a03a5650bf193422c1c76a4e7eb91c11e34dc255 Mon Sep 17 00:00:00 2001 From: "Buyuan(Alex) Cui" <69030297+BuyuanCui@users.noreply.github.com> Date: Fri, 18 Aug 2023 10:24:05 
-0700 Subject: [PATCH 16/17] Update test_whitelist.py Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> From 0e5e7b88d273f2d538f2a902944655ca0590506d Mon Sep 17 00:00:00 2001 From: "Buyuan(Alex) Cui" <69030297+BuyuanCui@users.noreply.github.com> Date: Fri, 1 Sep 2023 08:00:24 -0700 Subject: [PATCH 17/17] Update Jenkinsfile changed zh cache to 07-27-23 as it is the latest update. Signed-off-by: Buyuan(Alex) Cui <69030297+BuyuanCui@users.noreply.github.com> --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 281d58e81..3c516e700 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -22,7 +22,7 @@ pipeline { RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' - ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-12-23-0' + ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-27-23-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' }