diff --git a/Jenkinsfile b/Jenkinsfile index 281d58e81..3c516e700 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -22,7 +22,7 @@ pipeline { RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' - ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-12-23-0' + ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-27-23-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py index b29fc5fb3..c1b462472 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/cardinal.py @@ -78,7 +78,11 @@ def __init__(self): | (graph_digits + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all) | (graph_digits + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits) ) - graph_ten_thousands = graph_ten_thousands_simple | graph_ten_thousands_complex | pynutil.insert("00000") + graph_ten_thousands = ( + pynutil.add_weight(graph_ten_thousands_simple, -1.0) + | graph_ten_thousands_complex + | pynutil.insert("00000") + ) # grammmar for hundred thousands 十万 graph_hundred_thousands_simple = graph_all + closure_ten_thousands @@ -88,8 +92,10 @@ def __init__(self): | (graph_all + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all) | (graph_all + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits) ) - graph_hundred_thousands = (graph_hundred_thousands_simple | graph_hundred_thousands_complex) | pynutil.insert( - "000000" + graph_hundred_thousands = ( + pynutil.add_weight(graph_hundred_thousands_simple, -1.0) + | graph_hundred_thousands_complex + | pynutil.insert("000000") ) # grammar for millions 百万 @@ -168,7 +174,9 @@ def __init__(self): | (graph_digits + delete_hundred_millions + pynini.cross(pynini.closure("零"), "0000000") + graph_digits) ) graph_hundred_millions = ( - graph_hundred_millions_simple | graph_hundred_millions_complex | pynutil.insert("000000000") + pynutil.add_weight(graph_hundred_millions_simple, -1.0) + | graph_hundred_millions_complex + | pynutil.insert("000000000") ) # grammar for billions 十亿 @@ -203,7 +211,9 @@ def __init__(self): | (graph_all + delete_hundred_millions + pynini.cross(pynini.closure("零"), "000000") + graph_all) | (graph_all + delete_hundred_millions + pynini.cross(pynini.closure("零"), "0000000") + graph_digits) ) - graph_billions = graph_billions_simple | graph_billions_complex | pynutil.insert("0000000000") + graph_billions = ( + pynutil.add_weight(graph_billions_simple, -1.0) | graph_billions_complex | pynutil.insert("0000000000") + ) # grammar for ten billions 百亿 graph_ten_billions_simple = graph_hundreds_complex + closure_hundred_millions @@ -252,7 +262,11 @@ def __init__(self): + graph_digits ) ) - graph_ten_billions = graph_ten_billions_simple | graph_ten_billions_complex | pynutil.insert("00000000000") + graph_ten_billions = ( + pynutil.add_weight(graph_ten_billions_simple, -1.0) + | graph_ten_billions_complex + | pynutil.insert("00000000000") + ) # grammar for hundred billions 千亿 graph_hundred_billions_simple = graph_thousands_complex + closure_hundred_millions @@ -301,7 +315,9 @@ def __init__(self): + graph_digits ) ) - graph_hundred_billions = graph_hundred_billions_simple | graph_hundred_billions_complex + graph_hundred_billions = ( + pynutil.add_weight(graph_hundred_billions_simple, -1.0) | graph_hundred_billions_complex + ) # combining grammar; output for cardinal grammar graph = pynini.union( diff --git a/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py b/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py index f334f2675..33f437955 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/taggers/decimal.py @@ -68,35 +68,20 @@ class DecimalFst(GraphFst): def __init__(self, cardinal: GraphFst): super().__init__(name="decimal", kind="classify") - cardinal_after_decimal = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) - cardinal_before_decimal = cardinal.just_cardinals | (pynini.closure(pynini.cross("零", "0"), 0, 1)) - - delete_decimal = pynutil.delete("点") | pynutil.delete( - "點" - ) # delete decimal character, 'point' in english in 'one point two for 1.2' - - # grammar for integer part - graph_integer = ( - pynutil.insert('integer_part: "') - + (cardinal_before_decimal | (pynini.closure(pynini.cross("零", "0"), 0, 1))) - + pynutil.insert('" ') - ) # tokenization on just numbers - graph_integer_or_none = graph_integer | pynutil.insert('integer_part: "0" ', weight=0.01) # integer or zero - - # grammar for fractional part - delete_zero = pynini.closure(pynini.cross("零", "0")) - graph_string_of_cardinals = cardinal_after_decimal - graph_string_of_cardinals = pynini.closure( - (pynini.closure(graph_string_of_cardinals) + delete_zero + pynini.closure(graph_string_of_cardinals)), 1 - ) - graph_fractional = pynini.closure( - pynutil.insert('fractional_part: "') + graph_string_of_cardinals + pynutil.insert('"'), 1 + cardinal_after_decimal = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) | pynini.closure( + pynini.cross("零", "0") ) + cardinal_before_decimal = cardinal.just_cardinals | pynini.cross("零", "0") + + delete_decimal = pynutil.delete("点") | pynutil.delete("點") + + graph_integer = pynutil.insert('integer_part: "') + cardinal_before_decimal + pynutil.insert('" ') + + graph_string_of_cardinals = pynini.closure(cardinal_after_decimal, 1) + graph_fractional = pynutil.insert('fractional_part: "') + graph_string_of_cardinals + pynutil.insert('"') - # grammar for decimal: integer+delete character+part after decimal point - graph_decimal_no_sign = pynini.closure((graph_integer_or_none + delete_decimal + graph_fractional), 1) + graph_decimal_no_sign = pynini.closure((graph_integer + delete_decimal + graph_fractional), 1) - # New Grammar added for Money self.final_graph_wo_negative = graph_decimal_no_sign | get_quantity( graph_decimal_no_sign, cardinal.just_cardinals ) diff --git a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py index ea8fa4ab0..ab9831783 100644 --- a/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py +++ b/nemo_text_processing/inverse_text_normalization/zh/verbalizers/decimal.py @@ -32,14 +32,6 @@ def __init__(self): # insert a "," for every three numbers before decimal point space_every_three_integer = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure() - # insert a "," for every three numbers after decimal point - space_every_three_decimal = ( - pynini.accep(".") + (exactly_three_digits + pynutil.insert(",")).closure() + at_most_three_digits - ) - - # combine both - group_by_threes = space_every_three_integer | space_every_three_decimal - self.group_by_threes = group_by_threes # removing tokenizations, 'negative: ' optional_sign = pynini.closure( @@ -56,10 +48,10 @@ def __init__(self): pynutil.delete("integer_part:") + delete_space + pynutil.delete('"') - + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynini.closure(NEMO_DIGIT, 1) + pynutil.delete('"') ) - integer = integer @ group_by_threes + integer = integer @ space_every_three_integer optional_integer = pynini.closure(integer + delete_space, 0, 1) # removing tokenizations, 'fractionl_part' @@ -81,10 +73,11 @@ def __init__(self): + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') ) - optional_quantity = pynini.closure(quantity + delete_space) + optional_quantity = pynini.closure(delete_space + quantity) # combining graphs removing tokenizations *3 graph = (optional_integer + optional_fractional + optional_quantity).optimize() + graph = optional_sign + graph # add optional sign for negative number self.numebrs = graph delete_tokens = self.delete_tokens(graph) diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_whitelist.txt new file mode 100644 index 000000000..f36dc4293 --- /dev/null +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_whitelist.txt @@ -0,0 +1,21 @@ +人力资源~HR +自动取款机~ATM +人力资源~HR +首席执行官~CEO +美国研究生入学考试~GRE +研究生管理专业入学考试~GMAT +全球定位系统~GPS +刷卡机~POS机 +数位多功能光碟~DVD +镭射唱片~CD +通用串行总线~USB +统一资源定位符~URL +虚拟专用网络~VPN +网络互联协议~IP +脱氧核糖核酸~DNA +核糖核酸~RNA +平均学分绩点~GPA +发光二极管~LED +可移植文档格式~PDF +社会性网络服务~SNS +博士~PhD diff --git a/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_word.txt new file mode 100644 index 000000000..1d0cac255 --- /dev/null +++ b/tests/nemo_text_processing/zh/data_inverse_text_normalization/test_cases_word.txt @@ -0,0 +1,21 @@ +你好~你好 +年级~年级 +秘密~秘密 +键盘~键盘 +借口~借口 +学生~学生 +人力~人力 +转移~转移 +徘徊~徘徊 +冤枉~冤枉 +浏览~浏览 +珍藏~珍藏 +患难 ~患难 +湿~湿 +眼眶~眼眶 +遗产~遗产 +流浪~流浪 +信仰~信仰 +戒指~戒指 +义无反顾~义无反顾 +交换~交换 diff --git a/tests/nemo_text_processing/zh/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/zh/test_sparrowhawk_inverse_text_normalization.sh index 4ca12af7f..ade1027a7 100644 --- a/tests/nemo_text_processing/zh/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/zh/test_sparrowhawk_inverse_text_normalization.sh @@ -21,62 +21,62 @@ runtest () { } testITNCardinal() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_cardinal.txt + input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_cardinal.txt runtest $input } testITNDate() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_date.txt + input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_date.txt runtest $input } testITNDecimal() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_decimal.txt + input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_decimal.txt runtest $input } testITNOrdinal() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_ordinal.txt + input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_ordinal.txt runtest $input } testITNFraction() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_fraction.txt + input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_fraction.txt runtest $input } testITNTime() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_time.txt + input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_time.txt runtest $input } -testITNMeasure() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_measure.txt - runtest $input -} +#testITNMeasure() { +# input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_measure.txt +# runtest $input +#} testITNMoney() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_money.txt + input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_money.txt runtest $input } testITNWhitelist() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_whitelist.txt + input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_whitelist.txt runtest $input } -testITNTelephone() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_telephone.txt - runtest $input -} +#testITNTelephone() { +# input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_telephone.txt +# runtest $input +#} -testITNElectronic() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_electronic.txt - runtest $input -} +#testITNElectronic() { +# input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_electronic.txt +# runtest $input +#} testITNWord() { - input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_word.txt + input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_word.txt runtest $input } diff --git a/tests/nemo_text_processing/zh/test_whitelist.py b/tests/nemo_text_processing/zh/test_whitelist.py new file mode 100644 index 000000000..8b3e871b1 --- /dev/null +++ b/tests/nemo_text_processing/zh/test_whitelist.py @@ -0,0 +1,31 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import pytest +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from parameterized import parameterized + +from ..utils import CACHE_DIR, parse_test_case_file + + +class TestWhitelist: + inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_whitelist.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/zh/test_word.py b/tests/nemo_text_processing/zh/test_word.py index 29d396418..5e2e1da45 100644 --- a/tests/nemo_text_processing/zh/test_word.py +++ b/tests/nemo_text_processing/zh/test_word.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,19 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. + import pytest -from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from parameterized import parameterized from ..utils import CACHE_DIR, parse_test_case_file -class TestChar: - normalizer_zh = Normalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased') +class TestWord: + inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False) - @parameterized.expand(parse_test_case_file('zh/data_text_normalization/test_cases_word.txt')) + @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_word.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit - def test_norm_char(self, test_input, expected): - preds = self.normalizer_zh.normalize(test_input) - assert expected == preds + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected