Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ pipeline {
RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
VI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
SV_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-12-23-0'
ZH_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-27-23-0'
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,11 @@ def __init__(self):
| (graph_digits + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all)
| (graph_digits + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits)
)
graph_ten_thousands = graph_ten_thousands_simple | graph_ten_thousands_complex | pynutil.insert("00000")
graph_ten_thousands = (
pynutil.add_weight(graph_ten_thousands_simple, -1.0)
| graph_ten_thousands_complex
| pynutil.insert("00000")
)

# grammar for hundred thousands 十万
graph_hundred_thousands_simple = graph_all + closure_ten_thousands
Expand All @@ -88,8 +92,10 @@ def __init__(self):
| (graph_all + delete_ten_thousands + pynini.cross(pynini.closure("零"), "00") + graph_all)
| (graph_all + delete_ten_thousands + pynini.cross(pynini.closure("零"), "000") + graph_digits)
)
graph_hundred_thousands = (graph_hundred_thousands_simple | graph_hundred_thousands_complex) | pynutil.insert(
"000000"
graph_hundred_thousands = (
pynutil.add_weight(graph_hundred_thousands_simple, -1.0)
| graph_hundred_thousands_complex
| pynutil.insert("000000")
)

# grammar for millions 百万
Expand Down Expand Up @@ -168,7 +174,9 @@ def __init__(self):
| (graph_digits + delete_hundred_millions + pynini.cross(pynini.closure("零"), "0000000") + graph_digits)
)
graph_hundred_millions = (
graph_hundred_millions_simple | graph_hundred_millions_complex | pynutil.insert("000000000")
pynutil.add_weight(graph_hundred_millions_simple, -1.0)
| graph_hundred_millions_complex
| pynutil.insert("000000000")
)

# grammar for billions 十亿
Expand Down Expand Up @@ -203,7 +211,9 @@ def __init__(self):
| (graph_all + delete_hundred_millions + pynini.cross(pynini.closure("零"), "000000") + graph_all)
| (graph_all + delete_hundred_millions + pynini.cross(pynini.closure("零"), "0000000") + graph_digits)
)
graph_billions = graph_billions_simple | graph_billions_complex | pynutil.insert("0000000000")
graph_billions = (
pynutil.add_weight(graph_billions_simple, -1.0) | graph_billions_complex | pynutil.insert("0000000000")
)

# grammar for ten billions 百亿
graph_ten_billions_simple = graph_hundreds_complex + closure_hundred_millions
Expand Down Expand Up @@ -252,7 +262,11 @@ def __init__(self):
+ graph_digits
)
)
graph_ten_billions = graph_ten_billions_simple | graph_ten_billions_complex | pynutil.insert("00000000000")
graph_ten_billions = (
pynutil.add_weight(graph_ten_billions_simple, -1.0)
| graph_ten_billions_complex
| pynutil.insert("00000000000")
)

# grammar for hundred billions 千亿
graph_hundred_billions_simple = graph_thousands_complex + closure_hundred_millions
Expand Down Expand Up @@ -301,7 +315,9 @@ def __init__(self):
+ graph_digits
)
)
graph_hundred_billions = graph_hundred_billions_simple | graph_hundred_billions_complex
graph_hundred_billions = (
pynutil.add_weight(graph_hundred_billions_simple, -1.0) | graph_hundred_billions_complex
)

# combining grammar; output for cardinal grammar
graph = pynini.union(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,35 +68,20 @@ class DecimalFst(GraphFst):
def __init__(self, cardinal: GraphFst):
super().__init__(name="decimal", kind="classify")

cardinal_after_decimal = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv"))
cardinal_before_decimal = cardinal.just_cardinals | (pynini.closure(pynini.cross("零", "0"), 0, 1))

delete_decimal = pynutil.delete("点") | pynutil.delete(
"點"
) # delete decimal character, 'point' in english in 'one point two for 1.2'

# grammar for integer part
graph_integer = (
pynutil.insert('integer_part: "')
+ (cardinal_before_decimal | (pynini.closure(pynini.cross("零", "0"), 0, 1)))
+ pynutil.insert('" ')
) # tokenization on just numbers
graph_integer_or_none = graph_integer | pynutil.insert('integer_part: "0" ', weight=0.01) # integer or zero

# grammar for fractional part
delete_zero = pynini.closure(pynini.cross("零", "0"))
graph_string_of_cardinals = cardinal_after_decimal
graph_string_of_cardinals = pynini.closure(
(pynini.closure(graph_string_of_cardinals) + delete_zero + pynini.closure(graph_string_of_cardinals)), 1
)
graph_fractional = pynini.closure(
pynutil.insert('fractional_part: "') + graph_string_of_cardinals + pynutil.insert('"'), 1
cardinal_after_decimal = pynini.string_file(get_abs_path("data/numbers/digit-nano.tsv")) | pynini.closure(
pynini.cross("零", "0")
)
cardinal_before_decimal = cardinal.just_cardinals | pynini.cross("零", "0")

delete_decimal = pynutil.delete("点") | pynutil.delete("點")

graph_integer = pynutil.insert('integer_part: "') + cardinal_before_decimal + pynutil.insert('" ')

graph_string_of_cardinals = pynini.closure(cardinal_after_decimal, 1)
graph_fractional = pynutil.insert('fractional_part: "') + graph_string_of_cardinals + pynutil.insert('"')

# grammar for decimal: integer+delete character+part after decimal point
graph_decimal_no_sign = pynini.closure((graph_integer_or_none + delete_decimal + graph_fractional), 1)
graph_decimal_no_sign = pynini.closure((graph_integer + delete_decimal + graph_fractional), 1)

# New Grammar added for Money
self.final_graph_wo_negative = graph_decimal_no_sign | get_quantity(
graph_decimal_no_sign, cardinal.just_cardinals
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,6 @@ def __init__(self):

# insert a "," for every three numbers before decimal point
space_every_three_integer = at_most_three_digits + (pynutil.insert(",") + exactly_three_digits).closure()
# insert a "," for every three numbers after decimal point
space_every_three_decimal = (
pynini.accep(".") + (exactly_three_digits + pynutil.insert(",")).closure() + at_most_three_digits
)

# combine both
group_by_threes = space_every_three_integer | space_every_three_decimal
self.group_by_threes = group_by_threes

# removing tokenizations, 'negative: '
optional_sign = pynini.closure(
Expand All @@ -56,10 +48,10 @@ def __init__(self):
pynutil.delete("integer_part:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(NEMO_NOT_QUOTE, 1)
+ pynini.closure(NEMO_DIGIT, 1)
+ pynutil.delete('"')
)
integer = integer @ group_by_threes
integer = integer @ space_every_three_integer
optional_integer = pynini.closure(integer + delete_space, 0, 1)

# removing tokenizations, 'fractional_part'
Expand All @@ -81,10 +73,11 @@ def __init__(self):
+ pynini.closure(NEMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
optional_quantity = pynini.closure(quantity + delete_space)
optional_quantity = pynini.closure(delete_space + quantity)

# combining graphs removing tokenizations *3
graph = (optional_integer + optional_fractional + optional_quantity).optimize()

graph = optional_sign + graph # add optional sign for negative number
self.numebrs = graph
delete_tokens = self.delete_tokens(graph)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
人力资源~HR
自动取款机~ATM
人力资源~HR
首席执行官~CEO
美国研究生入学考试~GRE
研究生管理专业入学考试~GMAT
全球定位系统~GPS
刷卡机~POS机
数位多功能光碟~DVD
镭射唱片~CD
通用串行总线~USB
统一资源定位符~URL
虚拟专用网络~VPN
网络互联协议~IP
脱氧核糖核酸~DNA
核糖核酸~RNA
平均学分绩点~GPA
发光二极管~LED
可移植文档格式~PDF
社会性网络服务~SNS
博士~PhD
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
你好~你好
年级~年级
秘密~秘密
键盘~键盘
借口~借口
学生~学生
人力~人力
转移~转移
徘徊~徘徊
冤枉~冤枉
浏览~浏览
珍藏~珍藏
患难 ~患难
湿~湿
眼眶~眼眶
遗产~遗产
流浪~流浪
信仰~信仰
戒指~戒指
义无反顾~义无反顾
交换~交换
Original file line number Diff line number Diff line change
Expand Up @@ -21,62 +21,62 @@ runtest () {
}

testITNCardinal() {
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_cardinal.txt
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_cardinal.txt
runtest $input
}

testITNDate() {
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_date.txt
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_date.txt
runtest $input
}

testITNDecimal() {
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_decimal.txt
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_decimal.txt
runtest $input
}

testITNOrdinal() {
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_ordinal.txt
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_ordinal.txt
runtest $input
}

testITNFraction() {
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_fraction.txt
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_fraction.txt
runtest $input
}

testITNTime() {
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_time.txt
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_time.txt
runtest $input
}

testITNMeasure() {
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_measure.txt
runtest $input
}
#testITNMeasure() {
# input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_measure.txt
# runtest $input
#}

testITNMoney() {
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_money.txt
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_money.txt
runtest $input
}

testITNWhitelist() {
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_whitelist.txt
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_whitelist.txt
runtest $input
}

testITNTelephone() {
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_telephone.txt
runtest $input
}
#testITNTelephone() {
# input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_telephone.txt
# runtest $input
#}

testITNElectronic() {
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_electronic.txt
runtest $input
}
#testITNElectronic() {
# input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_electronic.txt
# runtest $input
#}

testITNWord() {
input=$PROJECT_DIR/fr/data_inverse_text_normalization/test_cases_word.txt
input=$PROJECT_DIR/zh/data_inverse_text_normalization/test_cases_word.txt
runtest $input
}

Expand Down
31 changes: 31 additions & 0 deletions tests/nemo_text_processing/zh/test_whitelist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import pytest
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
from parameterized import parameterized

from ..utils import CACHE_DIR, parse_test_case_file


class TestWhitelist:
    """Parameterized whitelist checks for the 'zh' inverse text normalizer.

    Cases are read from
    ``zh/data_inverse_text_normalization/test_cases_whitelist.txt`` via
    ``parse_test_case_file``.
    """

    # Created once in the class body and reused by every generated test case.
    inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False)

    @parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_whitelist.txt'))
    @pytest.mark.run_only_on('CPU')
    @pytest.mark.unit
    def test_denorm(self, test_input, expected):
        """Inverse-normalize ``test_input`` and require it to equal ``expected``."""
        assert self.inverse_normalizer.inverse_normalize(test_input, verbose=False) == expected
17 changes: 9 additions & 8 deletions tests/nemo_text_processing/zh/test_word.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -12,19 +12,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.


import pytest
from nemo_text_processing.text_normalization.normalize import Normalizer
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
from parameterized import parameterized

from ..utils import CACHE_DIR, parse_test_case_file


class TestChar:
normalizer_zh = Normalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False, input_case='cased')
class TestWord:
inverse_normalizer = InverseNormalizer(lang='zh', cache_dir=CACHE_DIR, overwrite_cache=False)

@parameterized.expand(parse_test_case_file('zh/data_text_normalization/test_cases_word.txt'))
@parameterized.expand(parse_test_case_file('zh/data_inverse_text_normalization/test_cases_word.txt'))
@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_norm_char(self, test_input, expected):
preds = self.normalizer_zh.normalize(test_input)
assert expected == preds
def test_denorm(self, test_input, expected):
pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
assert pred == expected