From 3434200830b38602e3321373b7919e7112abf7a6 Mon Sep 17 00:00:00 2001 From: folivoramanh Date: Thu, 3 Jul 2025 01:28:13 +0700 Subject: [PATCH 1/7] Date for vietnamese TN Signed-off-by: folivoramanh --- .../vi/data/date/__init__.py | 13 +++ .../text_normalization/vi/data/date/days.tsv | 9 ++ .../vi/data/date/months.tsv | 21 +++++ .../text_normalization/vi/taggers/date.py | 87 +++++++++++++++++++ .../vi/taggers/tokenize_and_classify.py | 7 ++ .../text_normalization/vi/verbalizers/date.py | 51 +++++++++++ .../vi/verbalizers/verbalize.py | 6 +- .../test_cases_date.txt | 13 +++ tests/nemo_text_processing/vi/test_date.py | 44 +++++++--- .../vi/test_sparrowhawk_normalization.sh | 8 +- 10 files changed, 240 insertions(+), 19 deletions(-) create mode 100644 nemo_text_processing/text_normalization/vi/data/date/__init__.py create mode 100644 nemo_text_processing/text_normalization/vi/data/date/days.tsv create mode 100644 nemo_text_processing/text_normalization/vi/data/date/months.tsv create mode 100644 nemo_text_processing/text_normalization/vi/taggers/date.py create mode 100644 nemo_text_processing/text_normalization/vi/verbalizers/date.py create mode 100644 tests/nemo_text_processing/vi/data_text_normalization/test_cases_date.txt diff --git a/nemo_text_processing/text_normalization/vi/data/date/__init__.py b/nemo_text_processing/text_normalization/vi/data/date/__init__.py new file mode 100644 index 000000000..6ebc808fa --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/date/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/vi/data/date/days.tsv b/nemo_text_processing/text_normalization/vi/data/date/days.tsv new file mode 100644 index 000000000..1d3e2440c --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/date/days.tsv @@ -0,0 +1,9 @@ +01 một +02 hai +03 ba +04 bốn +05 năm +06 sáu +07 bảy +08 tám +09 chín \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/data/date/months.tsv b/nemo_text_processing/text_normalization/vi/data/date/months.tsv new file mode 100644 index 000000000..fb836fba1 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/date/months.tsv @@ -0,0 +1,21 @@ +1 một +2 hai +3 ba +4 tư +5 năm +6 sáu +7 bảy +8 tám +9 chín +10 mười +11 mười một +12 mười hai +01 một +02 hai +03 ba +04 tư +05 năm +06 sáu +07 bảy +08 tám +09 chín \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/taggers/date.py b/nemo_text_processing/text_normalization/vi/taggers/date.py new file mode 100644 index 000000000..941b830e4 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/taggers/date.py @@ -0,0 +1,87 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.vi.utils import load_labels, get_abs_path +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst + + +class DateFst(GraphFst): + """ + Finite state transducer for classifying Vietnamese dates, e.g. + 15/01/2024 -> date { day: "mười lăm" month: "một" year: "hai nghìn hai mươi tư" } + tháng 4 2024 -> date { month: "tư" year: "hai nghìn hai mươi tư" } + ngày 15/01/2024 -> date { day: "mười lăm" month: "một" year: "hai nghìn hai mươi tư" } + ngày 12 tháng 5 năm 2025 -> date { day: "mười hai" month: "năm" year: "hai nghìn hai mươi lăm" } + """ + + def __init__(self, cardinal, deterministic: bool = True): + super().__init__(name="date", kind="classify", deterministic=deterministic) + + day_mappings = load_labels(get_abs_path("data/date/days.tsv")) + month_mappings = load_labels(get_abs_path("data/date/months.tsv")) + + day_digit = pynini.closure(NEMO_DIGIT, 1, 2) + month_digit = pynini.closure(NEMO_DIGIT, 1, 2) + year_digit = pynini.closure(NEMO_DIGIT, 4, 4) + separator = pynini.union("/", "-", ".") + + day_convert = pynini.string_map([(k, v) for k, v in day_mappings]) | pynini.compose(day_digit, cardinal.graph) + month_convert = pynini.string_map([(k, v) for k, v in month_mappings]) + year_convert = pynini.compose(year_digit, cardinal.graph) + + day_part = pynutil.insert("day: \"") + day_convert + pynutil.insert("\" ") + month_part = pynutil.insert("month: \"") + month_convert + pynutil.insert("\" ") + year_part = pynutil.insert("year: \"") + year_convert + pynutil.insert("\"") + month_final = pynutil.insert("month: \"") + month_convert + pynutil.insert("\"") + + patterns = [] + + date_sep = day_part + pynutil.delete(separator) + month_part + pynutil.delete(separator) + year_part + patterns.append(pynini.compose(day_digit + separator + month_digit + separator + year_digit, date_sep)) + patterns.append(pynini.compose( + pynini.accep("ngày ") + day_digit + separator + month_digit + separator + year_digit, + pynutil.delete("ngày ") + date_sep + )) + + for sep in [separator, pynini.accep(" ")]: + patterns.append(pynini.compose( + pynini.accep("tháng ") + month_digit + sep + year_digit, + pynutil.delete("tháng ") + month_part + pynutil.delete(sep) + year_part + )) + + day_month_sep = day_part + pynutil.delete(separator) + month_final + patterns.append(pynini.compose( + pynini.accep("ngày ") + day_digit + separator + month_digit, + pynutil.delete("ngày ") + day_month_sep + )) + + patterns.append(pynini.compose( + pynini.accep("ngày ") + day_digit + pynini.accep(" tháng ") + month_digit, + pynutil.delete("ngày ") + day_part + pynutil.delete(" tháng ") + month_final + )) + + patterns.append(pynini.compose( + pynini.accep("ngày ") + day_digit + pynini.accep(" tháng ") + month_digit + pynini.accep(" năm ") + year_digit, + pynutil.delete("ngày ") + day_part + pynutil.delete(" tháng ") + month_part + pynutil.delete(" năm ") + year_part + )) + + patterns.append(pynini.compose( + pynini.accep("năm ") + year_digit, + pynutil.delete("năm ") + year_part + )) + + self.fst = 
self.add_tokens(pynini.union(*patterns)) \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py index 73feb7182..bde22dee3 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py @@ -28,6 +28,7 @@ from nemo_text_processing.text_normalization.vi.taggers.decimal import DecimalFst from nemo_text_processing.text_normalization.vi.taggers.fraction import FractionFst from nemo_text_processing.text_normalization.vi.taggers.ordinal import OrdinalFst +from nemo_text_processing.text_normalization.vi.taggers.date import DateFst from nemo_text_processing.text_normalization.vi.taggers.punctuation import PunctuationFst from nemo_text_processing.text_normalization.vi.taggers.whitelist import WhiteListFst from nemo_text_processing.text_normalization.vi.taggers.word import WordFst @@ -92,9 +93,15 @@ def __init__( fraction_graph = fraction.fst logger.debug(f"fraction: {time.time() - start_time: .2f}s -- {fraction_graph.num_states()} nodes") + start_time = time.time() + date = DateFst(cardinal=cardinal, deterministic=deterministic) + date_graph = date.fst + logger.debug(f"date: {time.time() - start_time: .2f}s -- {date_graph.num_states()} nodes") + classify = ( pynutil.add_weight(whitelist_graph, 0.8) | pynutil.add_weight(ordinal_graph, 0.81) + | pynutil.add_weight(date_graph, 0.83) | pynutil.add_weight(decimal_graph, 0.85) | pynutil.add_weight(cardinal_graph, 0.9) | pynutil.add_weight(fraction_graph, 1.0) diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/date.py b/nemo_text_processing/text_normalization/vi/verbalizers/date.py new file mode 100644 index 000000000..f2b53de80 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/verbalizers/date.py @@ -0,0 +1,51 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space + + +class DateFst(GraphFst): + """ + Finite state transducer for verbalizing Vietnamese dates, e.g. 
+ date { day: "mười lăm" month: "một" year: "hai nghìn hai mươi tư" } + -> ngày mười lăm tháng một năm hai nghìn hai mươi tư + + date { month: "tư" year: "hai nghìn hai mươi tư" } + -> tháng tư năm hai nghìn hai mươi tư + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="date", kind="verbalize", deterministic=deterministic) + + quoted_content = pynini.closure(NEMO_NOT_QUOTE) + day = pynutil.delete("day:") + delete_space + pynutil.delete("\"") + quoted_content + pynutil.delete("\"") + month = pynutil.delete("month:") + delete_space + pynutil.delete("\"") + quoted_content + pynutil.delete("\"") + year = pynutil.delete("year:") + delete_space + pynutil.delete("\"") + quoted_content + pynutil.delete("\"") + + insert_day = pynutil.insert("ngày ") + insert_month = pynutil.insert("tháng ") + insert_year = pynutil.insert("năm ") + insert_space = pynutil.insert(" ") + + date_graph = pynini.union( + insert_day + day + delete_space + insert_space + insert_month + month + delete_space + insert_space + insert_year + year, + insert_month + month + delete_space + insert_space + insert_year + year, + insert_day + day + delete_space + insert_space + insert_month + month, + insert_year + year + ) + + self.fst = self.delete_tokens(date_graph).optimize() \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py index e3d34b968..e2c4f54cf 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py @@ -18,6 +18,7 @@ from nemo_text_processing.text_normalization.vi.verbalizers.decimal import DecimalFst from nemo_text_processing.text_normalization.vi.verbalizers.fraction import FractionFst from nemo_text_processing.text_normalization.vi.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.text_normalization.vi.verbalizers.date import DateFst from nemo_text_processing.text_normalization.vi.verbalizers.whitelist import WhiteListFst @@ -44,7 +45,10 @@ def __init__(self, deterministic: bool = True): fraction = FractionFst(deterministic=deterministic) fraction_graph = fraction.fst + date = DateFst(deterministic=deterministic) + date_graph = date.fst + # Combine all verbalizers - graph = cardinal_graph | whitelist_graph | word_graph | ordinal_graph | decimal_graph | fraction_graph + graph = cardinal_graph | whitelist_graph | word_graph | ordinal_graph | decimal_graph | fraction_graph | date_graph self.fst = graph diff --git a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_date.txt new file mode 100644 index 000000000..c95e00e97 --- /dev/null +++ b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_date.txt @@ -0,0 +1,13 @@ +ngày 15/01/2024~ngày mười lăm tháng một năm hai nghìn hai mươi tư +01/12/2023~ngày một tháng mười hai năm hai nghìn hai mươi ba +25-03-1975~ngày hai mươi lăm tháng ba năm một nghìn chín trăm bảy mươi lăm +10.05.2000~ngày mười tháng năm năm hai nghìn +tháng 1 2024~tháng một năm hai nghìn hai mươi tư +tháng 12 2023~tháng mười hai năm hai nghìn hai mươi ba +ngày 12 tháng 5 năm 2025~ngày mười hai tháng năm năm hai nghìn hai mươi lăm +tháng 5 năm nay~tháng năm năm nay +ngày 4 tháng này~ngày bốn tháng này +hôm nay là ngày 19/05/2025 sinh nhật Bác Hồ~hôm nay là ngày mười chín tháng năm năm hai nghìn hai mươi lăm sinh nhật Bác Hồ +ngày 14/4 hàng 
năm~ngày mười bốn tháng tư hàng năm +tháng 04/1969~tháng tư năm một nghìn chín trăm sáu mươi chín +ngày 12 tháng mười hai năm 2023~ngày mười hai tháng mười hai năm hai nghìn hai mươi ba \ No newline at end of file diff --git a/tests/nemo_text_processing/vi/test_date.py b/tests/nemo_text_processing/vi/test_date.py index 90885b6e4..20d646035 100644 --- a/tests/nemo_text_processing/vi/test_date.py +++ b/tests/nemo_text_processing/vi/test_date.py @@ -12,31 +12,47 @@ # See the License for the specific language governing permissions and # limitations under the License. +# pytest tests/nemo_text_processing/vi/test_date.py --cpu --cache-clear import pytest from parameterized import parameterized -from ..utils import CACHE_DIR, parse_test_case_file +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -try: - from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer - - PYNINI_AVAILABLE = True -except (ImportError, ModuleNotFoundError): - PYNINI_AVAILABLE = False +from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file class TestDate: - inverse_normalizer = ( - InverseNormalizer(lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) if PYNINI_AVAILABLE else None - ) + inverse_normalizer = InverseNormalizer(lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) + @parameterized.expand(parse_test_case_file('vi/data_inverse_text_normalization/test_cases_date.txt')) - @pytest.mark.skipif( - not PYNINI_AVAILABLE, - reason="`pynini` not installed, please install via nemo_text_processing/pynini_install.sh", - ) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected + + normalizer = Normalizer(input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True) + + normalizer_with_audio = ( + NormalizerWithAudio(input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) + if CACHE_DIR and RUN_AUDIO_BASED_TESTS + else None + ) + + @parameterized.expand(parse_test_case_file('vi/data_text_normalization/test_cases_date.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=False) + assert pred == expected, f"input: {test_input}" + + if self.normalizer_with_audio: + pred_non_deterministic = self.normalizer_with_audio.normalize( + test_input, + n_tagged=30, + punct_post_process=False, + ) + assert expected in pred_non_deterministic, f"input: {test_input}" \ No newline at end of file diff --git a/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh index 6a277c28c..2c5a7f8df 100644 --- a/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh @@ -28,10 +28,10 @@ testTNCardinal() { runtest $input } -# testTNDate() { -# input=$PROJECT_DIR/vi/data_text_normalization/test_cases_date.txt -# runtest $input -# } +testTNDate() { + input=$PROJECT_DIR/vi/data_text_normalization/test_cases_date.txt + runtest $input +} testTNDecimal() { 
input=$PROJECT_DIR/vi/data_text_normalization/test_cases_decimal.txt From 228960558997048aafa84c2d4a417b466d7c2bb0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 3 Jul 2025 01:17:56 +0000 Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/vi/taggers/date.py | 94 +++++++++++-------- .../vi/taggers/tokenize_and_classify.py | 4 +- .../text_normalization/vi/verbalizers/date.py | 21 +++-- .../vi/verbalizers/verbalize.py | 6 +- tests/nemo_text_processing/vi/test_date.py | 12 ++- 5 files changed, 83 insertions(+), 54 deletions(-) diff --git a/nemo_text_processing/text_normalization/vi/taggers/date.py b/nemo_text_processing/text_normalization/vi/taggers/date.py index 941b830e4..30c1459c3 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/date.py +++ b/nemo_text_processing/text_normalization/vi/taggers/date.py @@ -15,8 +15,8 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.vi.utils import load_labels, get_abs_path from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst +from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels class DateFst(GraphFst): @@ -33,55 +33,71 @@ def __init__(self, cardinal, deterministic: bool = True): day_mappings = load_labels(get_abs_path("data/date/days.tsv")) month_mappings = load_labels(get_abs_path("data/date/months.tsv")) - - day_digit = pynini.closure(NEMO_DIGIT, 1, 2) + + day_digit = pynini.closure(NEMO_DIGIT, 1, 2) month_digit = pynini.closure(NEMO_DIGIT, 1, 2) - year_digit = pynini.closure(NEMO_DIGIT, 4, 4) + year_digit = pynini.closure(NEMO_DIGIT, 4, 4) separator = pynini.union("/", "-", ".") - + day_convert = pynini.string_map([(k, v) for k, v in day_mappings]) | pynini.compose(day_digit, cardinal.graph) month_convert = pynini.string_map([(k, v) for k, v in month_mappings]) year_convert = pynini.compose(year_digit, cardinal.graph) - + day_part = pynutil.insert("day: \"") + day_convert + pynutil.insert("\" ") month_part = pynutil.insert("month: \"") + month_convert + pynutil.insert("\" ") year_part = pynutil.insert("year: \"") + year_convert + pynutil.insert("\"") month_final = pynutil.insert("month: \"") + month_convert + pynutil.insert("\"") - + patterns = [] - + date_sep = day_part + pynutil.delete(separator) + month_part + pynutil.delete(separator) + year_part patterns.append(pynini.compose(day_digit + separator + month_digit + separator + year_digit, date_sep)) - patterns.append(pynini.compose( - pynini.accep("ngày ") + day_digit + separator + month_digit + separator + year_digit, - pynutil.delete("ngày ") + date_sep - )) - + patterns.append( + pynini.compose( + pynini.accep("ngày ") + day_digit + separator + month_digit + separator + year_digit, + pynutil.delete("ngày ") + date_sep, + ) + ) + for sep in [separator, pynini.accep(" ")]: - patterns.append(pynini.compose( - pynini.accep("tháng ") + month_digit + sep + year_digit, - pynutil.delete("tháng ") + month_part + pynutil.delete(sep) + year_part - )) - + patterns.append( + pynini.compose( + pynini.accep("tháng ") + month_digit + sep + year_digit, + pynutil.delete("tháng ") + month_part + pynutil.delete(sep) + year_part, + ) + ) + day_month_sep = day_part + pynutil.delete(separator) + month_final - patterns.append(pynini.compose( - pynini.accep("ngày ") + day_digit + separator + month_digit, - pynutil.delete("ngày ") + 
day_month_sep - )) - - patterns.append(pynini.compose( - pynini.accep("ngày ") + day_digit + pynini.accep(" tháng ") + month_digit, - pynutil.delete("ngày ") + day_part + pynutil.delete(" tháng ") + month_final - )) - - patterns.append(pynini.compose( - pynini.accep("ngày ") + day_digit + pynini.accep(" tháng ") + month_digit + pynini.accep(" năm ") + year_digit, - pynutil.delete("ngày ") + day_part + pynutil.delete(" tháng ") + month_part + pynutil.delete(" năm ") + year_part - )) - - patterns.append(pynini.compose( - pynini.accep("năm ") + year_digit, - pynutil.delete("năm ") + year_part - )) - - self.fst = self.add_tokens(pynini.union(*patterns)) \ No newline at end of file + patterns.append( + pynini.compose( + pynini.accep("ngày ") + day_digit + separator + month_digit, pynutil.delete("ngày ") + day_month_sep + ) + ) + + patterns.append( + pynini.compose( + pynini.accep("ngày ") + day_digit + pynini.accep(" tháng ") + month_digit, + pynutil.delete("ngày ") + day_part + pynutil.delete(" tháng ") + month_final, + ) + ) + + patterns.append( + pynini.compose( + pynini.accep("ngày ") + + day_digit + + pynini.accep(" tháng ") + + month_digit + + pynini.accep(" năm ") + + year_digit, + pynutil.delete("ngày ") + + day_part + + pynutil.delete(" tháng ") + + month_part + + pynutil.delete(" năm ") + + year_part, + ) + ) + + patterns.append(pynini.compose(pynini.accep("năm ") + year_digit, pynutil.delete("năm ") + year_part)) + + self.fst = self.add_tokens(pynini.union(*patterns)) diff --git a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py index bde22dee3..6bf01c496 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py @@ -25,10 +25,10 @@ generator_main, ) from nemo_text_processing.text_normalization.vi.taggers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.vi.taggers.date import DateFst from nemo_text_processing.text_normalization.vi.taggers.decimal import DecimalFst from nemo_text_processing.text_normalization.vi.taggers.fraction import FractionFst from nemo_text_processing.text_normalization.vi.taggers.ordinal import OrdinalFst -from nemo_text_processing.text_normalization.vi.taggers.date import DateFst from nemo_text_processing.text_normalization.vi.taggers.punctuation import PunctuationFst from nemo_text_processing.text_normalization.vi.taggers.whitelist import WhiteListFst from nemo_text_processing.text_normalization.vi.taggers.word import WordFst @@ -101,7 +101,7 @@ def __init__( classify = ( pynutil.add_weight(whitelist_graph, 0.8) | pynutil.add_weight(ordinal_graph, 0.81) - | pynutil.add_weight(date_graph, 0.83) + | pynutil.add_weight(date_graph, 0.83) | pynutil.add_weight(decimal_graph, 0.85) | pynutil.add_weight(cardinal_graph, 0.9) | pynutil.add_weight(fraction_graph, 1.0) diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/date.py b/nemo_text_processing/text_normalization/vi/verbalizers/date.py index f2b53de80..3c96a9ae2 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/date.py @@ -21,9 +21,9 @@ class DateFst(GraphFst): """ Finite state transducer for verbalizing Vietnamese dates, e.g. 
- date { day: "mười lăm" month: "một" year: "hai nghìn hai mươi tư" } + date { day: "mười lăm" month: "một" year: "hai nghìn hai mươi tư" } -> ngày mười lăm tháng một năm hai nghìn hai mươi tư - + date { month: "tư" year: "hai nghìn hai mươi tư" } -> tháng tư năm hai nghìn hai mươi tư """ @@ -40,12 +40,21 @@ def __init__(self, deterministic: bool = True): insert_month = pynutil.insert("tháng ") insert_year = pynutil.insert("năm ") insert_space = pynutil.insert(" ") - + date_graph = pynini.union( - insert_day + day + delete_space + insert_space + insert_month + month + delete_space + insert_space + insert_year + year, + insert_day + + day + + delete_space + + insert_space + + insert_month + + month + + delete_space + + insert_space + + insert_year + + year, insert_month + month + delete_space + insert_space + insert_year + year, insert_day + day + delete_space + insert_space + insert_month + month, - insert_year + year + insert_year + year, ) - self.fst = self.delete_tokens(date_graph).optimize() \ No newline at end of file + self.fst = self.delete_tokens(date_graph).optimize() diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py index e2c4f54cf..8d4023436 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py @@ -15,10 +15,10 @@ from nemo_text_processing.text_normalization.en.graph_utils import GraphFst from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst from nemo_text_processing.text_normalization.vi.verbalizers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.vi.verbalizers.date import DateFst from nemo_text_processing.text_normalization.vi.verbalizers.decimal import DecimalFst from nemo_text_processing.text_normalization.vi.verbalizers.fraction import FractionFst from nemo_text_processing.text_normalization.vi.verbalizers.ordinal import OrdinalFst -from nemo_text_processing.text_normalization.vi.verbalizers.date import DateFst from nemo_text_processing.text_normalization.vi.verbalizers.whitelist import WhiteListFst @@ -49,6 +49,8 @@ def __init__(self, deterministic: bool = True): date_graph = date.fst # Combine all verbalizers - graph = cardinal_graph | whitelist_graph | word_graph | ordinal_graph | decimal_graph | fraction_graph | date_graph + graph = ( + cardinal_graph | whitelist_graph | word_graph | ordinal_graph | decimal_graph | fraction_graph | date_graph + ) self.fst = graph diff --git a/tests/nemo_text_processing/vi/test_date.py b/tests/nemo_text_processing/vi/test_date.py index 20d646035..54e08b3fc 100644 --- a/tests/nemo_text_processing/vi/test_date.py +++ b/tests/nemo_text_processing/vi/test_date.py @@ -25,8 +25,8 @@ class TestDate: - inverse_normalizer = InverseNormalizer(lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) - + inverse_normalizer = InverseNormalizer(lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) + @parameterized.expand(parse_test_case_file('vi/data_inverse_text_normalization/test_cases_date.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit @@ -34,8 +34,10 @@ def test_denorm(self, test_input, expected): pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) assert pred == expected - normalizer = Normalizer(input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True) - + normalizer = Normalizer( + input_case='cased', lang='vi', 
cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True + ) + normalizer_with_audio = ( NormalizerWithAudio(input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) if CACHE_DIR and RUN_AUDIO_BASED_TESTS @@ -55,4 +57,4 @@ def test_norm(self, test_input, expected): n_tagged=30, punct_post_process=False, ) - assert expected in pred_non_deterministic, f"input: {test_input}" \ No newline at end of file + assert expected in pred_non_deterministic, f"input: {test_input}" From b8db764de9a1ad250ef2e4d23e543f1915121f22 Mon Sep 17 00:00:00 2001 From: folivoramanh Date: Tue, 8 Jul 2025 22:16:29 +0700 Subject: [PATCH 3/7] Add roman support and correct copyright header Signed-off-by: folivoramanh --- .../text_normalization/vi/__init__.py | 2 +- .../text_normalization/vi/data/date/days.tsv | 33 +++- .../vi/data/date/year_suffix.tsv | 4 + .../vi/data/roman/__init__.py | 13 ++ .../vi/data/roman/key_word.tsv | 12 ++ .../vi/data/roman/roman_numerals.tsv | 13 ++ .../text_normalization/vi/graph_utils.py | 144 ++++++++++++++++++ .../text_normalization/vi/taggers/__init__.py | 2 +- .../text_normalization/vi/taggers/cardinal.py | 4 +- .../text_normalization/vi/taggers/date.py | 112 ++++++++------ .../text_normalization/vi/taggers/decimal.py | 4 +- .../text_normalization/vi/taggers/fraction.py | 4 +- .../text_normalization/vi/taggers/ordinal.py | 4 +- .../vi/taggers/punctuation.py | 4 +- .../text_normalization/vi/taggers/roman.py | 91 +++++++++++ .../vi/taggers/tokenize_and_classify.py | 25 +-- .../vi/taggers/whitelist.py | 4 +- .../text_normalization/vi/taggers/word.py | 4 +- .../text_normalization/vi/utils.py | 2 +- .../vi/verbalizers/__init__.py | 2 +- .../vi/verbalizers/cardinal.py | 4 +- .../text_normalization/vi/verbalizers/date.py | 63 +++++--- .../vi/verbalizers/decimal.py | 4 +- .../vi/verbalizers/fraction.py | 4 +- .../vi/verbalizers/ordinal.py | 4 +- .../vi/verbalizers/roman.py | 51 +++++++ .../vi/verbalizers/verbalize.py | 20 ++- .../vi/verbalizers/verbalize_final.py | 6 +- .../vi/verbalizers/whitelist.py | 4 +- .../text_normalization/vi/verbalizers/word.py | 4 +- .../test_cases_roman.txt | 59 +++++++ tests/nemo_text_processing/vi/test_roman.py | 49 ++++++ .../vi/test_sparrowhawk_normalization.sh | 5 + 33 files changed, 640 insertions(+), 120 deletions(-) create mode 100644 nemo_text_processing/text_normalization/vi/data/date/year_suffix.tsv create mode 100644 nemo_text_processing/text_normalization/vi/data/roman/__init__.py create mode 100644 nemo_text_processing/text_normalization/vi/data/roman/key_word.tsv create mode 100644 nemo_text_processing/text_normalization/vi/data/roman/roman_numerals.tsv create mode 100644 nemo_text_processing/text_normalization/vi/graph_utils.py create mode 100644 nemo_text_processing/text_normalization/vi/taggers/roman.py create mode 100644 nemo_text_processing/text_normalization/vi/verbalizers/roman.py create mode 100644 tests/nemo_text_processing/vi/data_text_normalization/test_cases_roman.txt create mode 100644 tests/nemo_text_processing/vi/test_roman.py diff --git a/nemo_text_processing/text_normalization/vi/__init__.py b/nemo_text_processing/text_normalization/vi/__init__.py index bc443be41..6ebc808fa 100644 --- a/nemo_text_processing/text_normalization/vi/__init__.py +++ b/nemo_text_processing/text_normalization/vi/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/data/date/days.tsv b/nemo_text_processing/text_normalization/vi/data/date/days.tsv index 1d3e2440c..5b70479a6 100644 --- a/nemo_text_processing/text_normalization/vi/data/date/days.tsv +++ b/nemo_text_processing/text_normalization/vi/data/date/days.tsv @@ -6,4 +6,35 @@ 06 sáu 07 bảy 08 tám -09 chín \ No newline at end of file +09 chín +1 một +2 hai +3 ba +4 bốn +5 năm +6 sáu +7 bảy +8 tám +9 chín +10 mười +11 mười một +12 mười hai +13 mười ba +14 mười bốn +15 mười lăm +16 mười sáu +17 mười bảy +18 mười tám +19 mười chín +20 hai mươi +21 hai mươi mốt +22 hai mươi hai +23 hai mươi ba +24 hai mươi bốn +25 hai mươi lăm +26 hai mươi sáu +27 hai mươi bảy +28 hai mươi tám +29 hai mươi chín +30 ba mươi +31 ba mươi mốt \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/data/date/year_suffix.tsv b/nemo_text_processing/text_normalization/vi/data/date/year_suffix.tsv new file mode 100644 index 000000000..31b49f955 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/date/year_suffix.tsv @@ -0,0 +1,4 @@ +tcn trước công nguyên +scn sau công nguyên +TCN trước công nguyên +SCN sau công nguyên \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/data/roman/__init__.py b/nemo_text_processing/text_normalization/vi/data/roman/__init__.py new file mode 100644 index 000000000..6ebc808fa --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/roman/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/vi/data/roman/key_word.tsv b/nemo_text_processing/text_normalization/vi/data/roman/key_word.tsv new file mode 100644 index 000000000..e5f3d75a9 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/roman/key_word.tsv @@ -0,0 +1,12 @@ +thế kỉ +thế kỷ +thứ +chương +phần +mục +đoạn +năm +khoản +phụ lục +khóa +số \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/data/roman/roman_numerals.tsv b/nemo_text_processing/text_normalization/vi/data/roman/roman_numerals.tsv new file mode 100644 index 000000000..d4d8ad20b --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/data/roman/roman_numerals.tsv @@ -0,0 +1,13 @@ +I 1 +V 5 +X 10 +L 50 +C 100 +D 500 +M 1000 +IV 4 +IX 9 +XL 40 +XC 90 +CD 400 +CM 900 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/graph_utils.py b/nemo_text_processing/text_normalization/vi/graph_utils.py new file mode 100644 index 000000000..61a304eb5 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/graph_utils.py @@ -0,0 +1,144 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2015 and onwards Google, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import string +from pathlib import Path +from typing import Dict + +import pynini +from pynini import Far +from pynini.export import export +from pynini.lib import byte, pynutil, utf8 + +from nemo_text_processing.utils.logging import logger + +NEMO_CHAR = utf8.VALID_UTF8_CHAR + +NEMO_DIGIT = byte.DIGIT +NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize() +NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize() +NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() +NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize() +NEMO_HEX = pynini.union(*string.hexdigits).optimize() +NEMO_NON_BREAKING_SPACE = "\u00a0" +NEMO_SPACE = " " +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00a0").optimize() +NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() +NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() + +NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize() +NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize() + +NEMO_SIGMA = pynini.closure(NEMO_CHAR) + +delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) +delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1)) +insert_space = pynutil.insert(" ") +delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") +def convert_space(fst) -> "pynini.FstLike": + """ + Converts space to nonbreaking space. + Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty" + This is making transducer significantly slower, so only use when there could be potential spaces within quotes, otherwise leave it. + + Args: + fst: input fst + + Returns output fst where breaking spaces are converted to non breaking spaces + """ + return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA) + +def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]): + """ + Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name. + + Args: + file_name: exported file name + graphs: Mapping of a rule name and Pynini WFST graph to be exported + """ + exporter = export.Exporter(file_name) + for rule, graph in graphs.items(): + exporter[rule] = graph.optimize() + exporter.close() + logger.info(f"Created {file_name}") + +class GraphFst: + """ + Base class for all grammar fsts. 
+ + Args: + name: name of grammar class + kind: either 'classify' or 'verbalize' + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, name: str, kind: str, deterministic: bool = True): + self.name = name + self.kind = kind + self._fst = None + self.deterministic = deterministic + + self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far") + if self.far_exist(): + self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst() + + def far_exist(self) -> bool: + """ + Returns true if FAR can be loaded + """ + return self.far_path.exists() + + @property + def fst(self) -> "pynini.FstLike": + return self._fst + + @fst.setter + def fst(self, fst): + self._fst = fst + + def add_tokens(self, fst) -> "pynini.FstLike": + """ + Wraps class name around to given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }") + + def delete_tokens(self, fst) -> "pynini.FstLike": + """ + Deletes class name wrap around output of given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + res = ( + pynutil.delete(f"{self.name}") + + delete_space + + pynutil.delete("{") + + delete_space + + fst + + delete_space + + pynutil.delete("}") + ) + return res @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/text_normalization/vi/taggers/__init__.py b/nemo_text_processing/text_normalization/vi/taggers/__init__.py index bc443be41..6ebc808fa 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/__init__.py +++ b/nemo_text_processing/text_normalization/vi/taggers/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py index fa0f04fad..7f3743b05 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst, insert_space +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_DIGIT, GraphFst, insert_space from nemo_text_processing.text_normalization.vi.utils import get_abs_path diff --git a/nemo_text_processing/text_normalization/vi/taggers/date.py b/nemo_text_processing/text_normalization/vi/taggers/date.py index 30c1459c3..efc7dd858 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/date.py +++ b/nemo_text_processing/text_normalization/vi/taggers/date.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,8 +15,8 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst -from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels +from nemo_text_processing.text_normalization.vi.utils import load_labels, get_abs_path +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_DIGIT, GraphFst class DateFst(GraphFst): @@ -26,6 +26,7 @@ class DateFst(GraphFst): tháng 4 2024 -> date { month: "tư" year: "hai nghìn hai mươi tư" } ngày 15/01/2024 -> date { day: "mười lăm" month: "một" year: "hai nghìn hai mươi tư" } ngày 12 tháng 5 năm 2025 -> date { day: "mười hai" month: "năm" year: "hai nghìn hai mươi lăm" } + năm 20 SCN -> date { year: "hai mươi" era: "sau công nguyên" } """ def __init__(self, cardinal, deterministic: bool = True): @@ -33,32 +34,42 @@ def __init__(self, cardinal, deterministic: bool = True): day_mappings = load_labels(get_abs_path("data/date/days.tsv")) month_mappings = load_labels(get_abs_path("data/date/months.tsv")) - - day_digit = pynini.closure(NEMO_DIGIT, 1, 2) + era_mappings = load_labels(get_abs_path("data/date/year_suffix.tsv")) + + day_digit = pynini.closure(NEMO_DIGIT, 1, 2) month_digit = pynini.closure(NEMO_DIGIT, 1, 2) - year_digit = pynini.closure(NEMO_DIGIT, 4, 4) + year_digit = pynini.closure(NEMO_DIGIT, 1, 4) separator = pynini.union("/", "-", ".") - - day_convert = pynini.string_map([(k, v) for k, v in day_mappings]) | pynini.compose(day_digit, cardinal.graph) + + day_convert = pynini.string_map([(k, v) for k, v in day_mappings]) month_convert = pynini.string_map([(k, v) for k, v in month_mappings]) year_convert = pynini.compose(year_digit, cardinal.graph) - + + era_to_full = {} + for abbr, full_form in era_mappings: + era_to_full[abbr.lower()] = full_form + era_to_full[abbr.upper()] = full_form + + era_convert = pynini.string_map([(k, v) for k, v in era_to_full.items()]) + day_part = pynutil.insert("day: \"") + day_convert + pynutil.insert("\" ") month_part = pynutil.insert("month: \"") + month_convert + pynutil.insert("\" ") year_part = pynutil.insert("year: \"") + year_convert + pynutil.insert("\"") month_final = pynutil.insert("month: \"") + month_convert + pynutil.insert("\"") - + era_part = pynutil.insert("era: \"") + era_convert + pynutil.insert("\"") + patterns = [] date_sep = day_part + pynutil.delete(separator) + month_part + pynutil.delete(separator) + year_part - patterns.append(pynini.compose(day_digit + separator + month_digit + separator + year_digit, date_sep)) - patterns.append( - pynini.compose( - pynini.accep("ngày ") + day_digit + separator + month_digit + separator + year_digit, - pynutil.delete("ngày ") + date_sep, - ) - ) - + patterns.append(pynini.compose( + day_digit + separator + month_digit + separator + year_digit, + date_sep + )) + patterns.append(pynini.compose( + pynini.accep("ngày ") + day_digit + separator + month_digit + separator + year_digit, + pynutil.delete("ngày ") + date_sep + )) + for sep in [separator, pynini.accep(" ")]: patterns.append( pynini.compose( @@ -68,36 +79,37 @@ def __init__(self, cardinal, deterministic: bool = True): ) day_month_sep = day_part + pynutil.delete(separator) + month_final - patterns.append( - pynini.compose( - pynini.accep("ngày ") + day_digit + separator + month_digit, pynutil.delete("ngày ") + day_month_sep - ) - ) - - 
patterns.append( - pynini.compose( - pynini.accep("ngày ") + day_digit + pynini.accep(" tháng ") + month_digit, - pynutil.delete("ngày ") + day_part + pynutil.delete(" tháng ") + month_final, - ) - ) - - patterns.append( - pynini.compose( - pynini.accep("ngày ") - + day_digit - + pynini.accep(" tháng ") - + month_digit - + pynini.accep(" năm ") - + year_digit, - pynutil.delete("ngày ") - + day_part - + pynutil.delete(" tháng ") - + month_part - + pynutil.delete(" năm ") - + year_part, - ) - ) - - patterns.append(pynini.compose(pynini.accep("năm ") + year_digit, pynutil.delete("năm ") + year_part)) - + patterns.append(pynini.compose( + pynini.accep("ngày ") + day_digit + separator + month_digit, + pynutil.delete("ngày ") + day_month_sep + )) + + patterns.append(pynini.compose( + pynini.accep("ngày ") + day_digit + pynini.accep(" tháng ") + month_digit, + pynutil.delete("ngày ") + day_part + pynutil.delete(" tháng ") + month_final + )) + + patterns.append(pynini.compose( + pynini.accep("ngày ") + day_digit + pynini.accep(" tháng ") + month_digit + pynini.accep(" năm ") + year_digit, + pynutil.delete("ngày ") + day_part + pynutil.delete(" tháng ") + month_part + pynutil.delete(" năm ") + year_part + )) + + patterns.append(pynini.compose( + pynini.accep("năm ") + year_digit, + pynutil.delete("năm ") + year_part + )) + + era_abbrs = list(era_to_full.keys()) + for era_abbr in era_abbrs: + patterns.append(pynini.compose( + pynini.accep("năm ") + year_digit + pynini.accep(" ") + pynini.accep(era_abbr), + pynutil.delete("năm ") + year_part + pynutil.delete(" ") + era_part + )) + + patterns.append(pynini.compose( + pynini.accep("năm thứ ") + year_digit + pynini.accep(" ") + pynini.accep(era_abbr), + pynutil.delete("năm thứ ") + pynutil.insert("ordinal: \"") + year_convert + pynutil.insert("\" ") + + pynutil.delete(" ") + era_part + )) + self.fst = self.add_tokens(pynini.union(*patterns)) diff --git a/nemo_text_processing/text_normalization/vi/taggers/decimal.py b/nemo_text_processing/text_normalization/vi/taggers/decimal.py index 0b314317b..8313ec46b 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/vi/taggers/decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst +from nemo_text_processing.text_normalization.vi.graph_utils import GraphFst from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels diff --git a/nemo_text_processing/text_normalization/vi/taggers/fraction.py b/nemo_text_processing/text_normalization/vi/taggers/fraction.py index 807e96dab..56b452297 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/vi/taggers/fraction.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst +from nemo_text_processing.text_normalization.vi.graph_utils import GraphFst from nemo_text_processing.text_normalization.vi.taggers.cardinal import CardinalFst from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels diff --git a/nemo_text_processing/text_normalization/vi/taggers/ordinal.py b/nemo_text_processing/text_normalization/vi/taggers/ordinal.py index d896bcef3..0a4b81862 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/vi/taggers/ordinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_DIGIT, GraphFst from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels diff --git a/nemo_text_processing/text_normalization/vi/taggers/punctuation.py b/nemo_text_processing/text_normalization/vi/taggers/punctuation.py index 1e08cb02d..c67129d5d 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/punctuation.py +++ b/nemo_text_processing/text_normalization/vi/taggers/punctuation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst +from nemo_text_processing.text_normalization.vi.graph_utils import GraphFst class PunctuationFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/vi/taggers/roman.py b/nemo_text_processing/text_normalization/vi/taggers/roman.py new file mode 100644 index 000000000..f7e6b90a2 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/taggers/roman.py @@ -0,0 +1,91 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.vi.graph_utils import GraphFst +from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels + + +class RomanFst(GraphFst): + """ + Finite state transducer for classifying roman numbers in Vietnamese context: + e.g. "thế kỉ XV" -> tokens { roman { key_cardinal: "thế kỉ" integer: "mười lăm" } } + e.g. 
"thế kỷ IV" -> tokens { roman { key_cardinal: "thế kỷ" integer: "bốn" } } + e.g. "thứ IV" -> tokens { roman { key_cardinal: "thứ" integer: "bốn" } } + e.g. "chương III" -> tokens { roman { key_cardinal: "chương" integer: "ba" } } + e.g. "phần ix" -> tokens { roman { key_cardinal: "phần" integer: "chín" } } + + Args: + cardinal: CardinalFst + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, cardinal: GraphFst, deterministic: bool = True): + super().__init__(name="roman", kind="classify", deterministic=deterministic) + + key_words = [] + key_word_path = get_abs_path("data/roman/key_word.tsv") + for k_word in load_labels(key_word_path): + key_words.append(k_word[0]) + + key_words_fst = pynini.union(*[pynini.accep(word) for word in key_words]).optimize() + + roman_numeral_path = get_abs_path("data/roman/roman_numerals.tsv") + roman_numeral_pairs = load_labels(roman_numeral_path) + + roman_to_arabic = {} + for roman, value in roman_numeral_pairs: + roman_to_arabic[roman] = value + roman_to_arabic[roman.lower()] = value + + self.arabic_to_roman = {} + for roman, value in roman_numeral_pairs: + self.arabic_to_roman[int(value)] = roman + + valid_roman_pairs = [] + for i in range(1, 4000): + roman_upper = self._int_to_roman(i) + roman_lower = roman_upper.lower() + valid_roman_pairs.append((roman_upper, str(i))) + valid_roman_pairs.append((roman_lower, str(i))) + + roman_to_arabic_fst = pynini.string_map(valid_roman_pairs).optimize() + + cardinal_graph = cardinal.graph + + graph = ( + pynutil.insert("key_cardinal: \"") + + key_words_fst + + pynutil.insert("\"") + + pynini.accep(" ") + + pynutil.insert("integer: \"") + + pynini.compose(roman_to_arabic_fst, cardinal_graph) + + pynutil.insert("\"") + ).optimize() + + self.fst = self.add_tokens(graph).optimize() + + def _int_to_roman(self, num): + values = sorted(self.arabic_to_roman.keys(), reverse=True) + + roman_num = '' + for value in values: + while num >= value: + roman_num += self.arabic_to_roman[value] + num -= value + + return roman_num \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py index 6bf01c496..0925cf218 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -18,7 +18,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import ( +from nemo_text_processing.text_normalization.vi.graph_utils import ( GraphFst, delete_extra_space, delete_space, @@ -30,6 +30,7 @@ from nemo_text_processing.text_normalization.vi.taggers.fraction import FractionFst from nemo_text_processing.text_normalization.vi.taggers.ordinal import OrdinalFst from nemo_text_processing.text_normalization.vi.taggers.punctuation import PunctuationFst +from nemo_text_processing.text_normalization.vi.taggers.roman import RomanFst from nemo_text_processing.text_normalization.vi.taggers.whitelist import WhiteListFst from nemo_text_processing.text_normalization.vi.taggers.word import WordFst from nemo_text_processing.utils.logging import logger @@ -98,16 +99,22 @@ def __init__( date_graph = date.fst logger.debug(f"date: {time.time() - start_time: .2f}s -- {date_graph.num_states()} nodes") + start_time = time.time() + roman = RomanFst(cardinal=cardinal, deterministic=deterministic) + roman_graph = roman.fst + logger.debug(f"roman: {time.time() - start_time: .2f}s -- {roman_graph.num_states()} nodes") + classify = ( - pynutil.add_weight(whitelist_graph, 0.8) - | pynutil.add_weight(ordinal_graph, 0.81) - | pynutil.add_weight(date_graph, 0.83) - | pynutil.add_weight(decimal_graph, 0.85) - | pynutil.add_weight(cardinal_graph, 0.9) - | pynutil.add_weight(fraction_graph, 1.0) + pynutil.add_weight(whitelist_graph, 1.01) + | pynutil.add_weight(roman_graph, 1.1) + | pynutil.add_weight(date_graph, 1.09) + | pynutil.add_weight(cardinal_graph, 1.1) + | pynutil.add_weight(ordinal_graph, 1.1) + | pynutil.add_weight(decimal_graph, 1.1) + | pynutil.add_weight(fraction_graph, 1.1) | pynutil.add_weight(word_graph, 100) ) - punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }") + punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, 1.1) + pynutil.insert(" }") token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") token_plus_punct = ( pynini.closure(punct + pynutil.insert(" ")) + token + pynini.closure(pynutil.insert(" ") + punct) diff --git a/nemo_text_processing/text_normalization/vi/taggers/whitelist.py b/nemo_text_processing/text_normalization/vi/taggers/whitelist.py index aed5e356a..5c2f5ff74 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/whitelist.py +++ b/nemo_text_processing/text_normalization/vi/taggers/whitelist.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space +from nemo_text_processing.text_normalization.vi.graph_utils import GraphFst, convert_space from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels diff --git a/nemo_text_processing/text_normalization/vi/taggers/word.py b/nemo_text_processing/text_normalization/vi/taggers/word.py index f0be213c7..ca31c3ab8 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/word.py +++ b/nemo_text_processing/text_normalization/vi/taggers/word.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_SPACE, GraphFst +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_NOT_SPACE, GraphFst class WordFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/vi/utils.py b/nemo_text_processing/text_normalization/vi/utils.py index 332330921..6b0871d9d 100644 --- a/nemo_text_processing/text_normalization/vi/utils.py +++ b/nemo_text_processing/text_normalization/vi/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py b/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py index bc443be41..6ebc808fa 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py index 530c3dfce..4c0d47392 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space class CardinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/date.py b/nemo_text_processing/text_normalization/vi/verbalizers/date.py index 3c96a9ae2..49bdceebe 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/date.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space class DateFst(GraphFst): @@ -26,35 +26,52 @@ class DateFst(GraphFst): date { month: "tư" year: "hai nghìn hai mươi tư" } -> tháng tư năm hai nghìn hai mươi tư + + date { year: "hai mươi" era: "sau công nguyên" } + -> năm hai mươi sau công nguyên + + date { ordinal: "mười" era: "trước công nguyên" } + -> năm thứ mười trước công nguyên """ def __init__(self, deterministic: bool = True): super().__init__(name="date", kind="verbalize", deterministic=deterministic) quoted_content = pynini.closure(NEMO_NOT_QUOTE) - day = pynutil.delete("day:") + delete_space + pynutil.delete("\"") + quoted_content + pynutil.delete("\"") - month = pynutil.delete("month:") + delete_space + pynutil.delete("\"") + quoted_content + pynutil.delete("\"") - year = pynutil.delete("year:") + delete_space + pynutil.delete("\"") + quoted_content + pynutil.delete("\"") - - insert_day = pynutil.insert("ngày ") - insert_month = pynutil.insert("tháng ") - insert_year = pynutil.insert("năm ") - insert_space = pynutil.insert(" ") + + day_expr = pynutil.delete("day: \"") + quoted_content + pynutil.delete("\"") + day_with_prefix = pynutil.insert("ngày ") + day_expr + + month_expr = pynutil.delete("month: \"") + quoted_content + pynutil.delete("\"") + month_with_prefix = pynutil.insert("tháng ") + month_expr + + year_expr = pynutil.delete("year: \"") + quoted_content + pynutil.delete("\"") + year_with_prefix = pynutil.insert("năm ") + year_expr + + era_expr = pynutil.delete("era: \"") + quoted_content + pynutil.delete("\"") + + ordinal_expr = pynutil.delete("ordinal: \"") + quoted_content + pynutil.delete("\"") + ordinal_with_prefix = pynutil.insert("năm thứ ") + ordinal_expr + date_graph = pynini.union( - insert_day - + day - + delete_space - + insert_space - + insert_month - + month - + delete_space - + insert_space - + insert_year - + year, - insert_month + month + delete_space + insert_space + insert_year + year, - insert_day + day + delete_space + insert_space + insert_month + month, - insert_year + year, + day_with_prefix + delete_space + insert_space + + month_with_prefix + delete_space + insert_space + + year_with_prefix, + + month_with_prefix + delete_space + insert_space + + year_with_prefix, + + day_with_prefix + delete_space + insert_space + + month_with_prefix, + + year_with_prefix, + + year_with_prefix + delete_space + insert_space + + era_expr, + + ordinal_with_prefix + delete_space + insert_space + + era_expr, ) self.fst = self.delete_tokens(date_graph).optimize() diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/decimal.py b/nemo_text_processing/text_normalization/vi/verbalizers/decimal.py index 8fe523b37..6d811591c 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/decimal.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
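The rewritten date verbalizer above works by deleting the serialized field markup and inserting the spoken Vietnamese prefix for each field. A minimal sketch of that mechanism on a single field is shown below; the field value is hard-coded only to keep the example free of character-class definitions (the real grammar accepts any non-quote content via NEMO_NOT_QUOTE).

import pynini
from pynini.lib import pynutil

# Sketch: drop the `day: "..."` markup, keep the value, prepend the spoken prefix.
day_value = pynini.accep("mười lăm")  # stands in for a NEMO_NOT_QUOTE closure
day_field = pynutil.delete('day: "') + day_value + pynutil.delete('"')
verbalize_day = (pynutil.insert("ngày ") + day_field).optimize()

print(pynini.shortestpath(pynini.compose('day: "mười lăm"', verbalize_day)).string())
# -> ngày mười lăm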
@@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space class DecimalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py b/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py index 77ace3454..ec814a2fb 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space class FractionFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/vi/verbalizers/ordinal.py index 7388f7df4..cf8c5326e 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/ordinal.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/ordinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space class OrdinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/roman.py b/nemo_text_processing/text_normalization/vi/verbalizers/roman.py new file mode 100644 index 000000000..76427bd42 --- /dev/null +++ b/nemo_text_processing/text_normalization/vi/verbalizers/roman.py @@ -0,0 +1,51 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.vi.graph_utils import ( + NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, delete_space +) + + +class RomanFst(GraphFst): + """ + Finite state transducer for verbalizing Roman numerals in Vietnamese + e.g. tokens { roman { key_cardinal: "thế kỉ" integer: "mười lăm" } } -> thế kỉ mười lăm + e.g. 
tokens { roman { key_cardinal: "thế kỷ" integer: "bốn" } } -> thế kỷ bốn + e.g. tokens { roman { key_cardinal: "thứ" integer: "bốn" } } -> thứ bốn + e.g. tokens { roman { integer: "mười lăm" } } -> mười lăm + + Args: + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, deterministic: bool = True): + super().__init__(name="roman", kind="verbalize", deterministic=deterministic) + + key_cardinal = pynutil.delete("key_cardinal: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + + integer = pynutil.delete("integer: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") + + graph_with_key = key_cardinal + delete_space + pynutil.insert(" ") + integer + + graph_without_key = integer + + graph = pynini.union(graph_with_key, graph_without_key) + + delete_tokens = self.delete_tokens(graph) + + self.fst = delete_tokens.optimize() \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py index 8d4023436..d49651e89 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,13 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo_text_processing.text_normalization.en.graph_utils import GraphFst -from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst +from nemo_text_processing.text_normalization.vi.graph_utils import GraphFst +from nemo_text_processing.text_normalization.vi.verbalizers.word import WordFst from nemo_text_processing.text_normalization.vi.verbalizers.cardinal import CardinalFst from nemo_text_processing.text_normalization.vi.verbalizers.date import DateFst from nemo_text_processing.text_normalization.vi.verbalizers.decimal import DecimalFst from nemo_text_processing.text_normalization.vi.verbalizers.fraction import FractionFst from nemo_text_processing.text_normalization.vi.verbalizers.ordinal import OrdinalFst +from nemo_text_processing.text_normalization.vi.verbalizers.date import DateFst +from nemo_text_processing.text_normalization.vi.verbalizers.roman import RomanFst from nemo_text_processing.text_normalization.vi.verbalizers.whitelist import WhiteListFst @@ -48,9 +50,19 @@ def __init__(self, deterministic: bool = True): date = DateFst(deterministic=deterministic) date_graph = date.fst + roman = RomanFst(deterministic=deterministic) + roman_graph = roman.fst + # Combine all verbalizers graph = ( - cardinal_graph | whitelist_graph | word_graph | ordinal_graph | decimal_graph | fraction_graph | date_graph + cardinal_graph + | whitelist_graph + | word_graph + | ordinal_graph + | decimal_graph + | fraction_graph + | date_graph + | roman_graph ) self.fst = graph diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py index cd9ec39eb..8911fe161 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py +++ 
b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,13 +17,13 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import ( +from nemo_text_processing.text_normalization.vi.graph_utils import ( GraphFst, delete_extra_space, delete_space, generator_main, ) -from nemo_text_processing.text_normalization.en.verbalizers.word import WordFst +from nemo_text_processing.text_normalization.vi.verbalizers.word import WordFst from nemo_text_processing.text_normalization.vi.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.utils.logging import logger diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py index 6e0699827..018955415 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, delete_space +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, delete_space class WhiteListFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/word.py b/nemo_text_processing/text_normalization/vi/verbalizers/word.py index f9547acba..0e6e07b81 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/word.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/word.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.en.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_CHAR, NEMO_SIGMA, GraphFst, delete_space class WordFst(GraphFst): diff --git a/tests/nemo_text_processing/vi/data_text_normalization/test_cases_roman.txt b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_roman.txt new file mode 100644 index 000000000..543ef052b --- /dev/null +++ b/tests/nemo_text_processing/vi/data_text_normalization/test_cases_roman.txt @@ -0,0 +1,59 @@ +thế kỉ XV~thế kỉ mười lăm +thế kỉ XX~thế kỉ hai mươi +thế kỉ XXI~thế kỉ hai mươi mốt +thế kỷ IV~thế kỷ bốn +thế kỷ V~thế kỷ năm +thứ I~thứ một +thứ V~thứ năm +thứ X~thứ mười +thứ XV~thứ mười lăm +chương III~chương ba +phần ix~phần chín +chương C~chương một trăm +mục XCIX~mục chín mươi chín +chương MMMCMXCIX~chương ba nghìn chín trăm chín mươi chín +thế kỉ xix~thế kỉ mười chín +thế kỷ vi~thế kỷ sáu +phần xl~phần bốn mươi +mục xc~mục chín mươi +mục cd~mục bốn trăm +mục cm~mục chín trăm +thứ viii~thứ tám +thứ ix~thứ chín +thứ xi~thứ mười một +chương lxxxviii~chương tám mươi tám +chương cccxlv~chương ba trăm bốn mươi lăm +thế kỉ XV và chương IX~thế kỉ mười lăm và chương chín +trong phần X có mục IV~trong phần mười có mục bốn +chương I~chương một +chương MMMCMXCIX~chương ba nghìn chín trăm chín mươi chín +CPU I9 là dòng cao cấp~CPU I9 là dòng cao cấp +Phiên bản V2.0 đã lỗi thời~Phiên bản V2.0 đã lỗi thời +đoạn II~đoạn hai +đoạn iv~đoạn bốn +đoạn VII~đoạn bảy +đoạn xii~đoạn mười hai +năm MCMXCIX~năm một nghìn chín trăm chín mươi chín +năm mmxx~năm hai nghìn hai mươi +khoản III~khoản ba +khoản vi~khoản sáu +khoản XIV~khoản mười bốn +khoản xxv~khoản hai mươi lăm +phụ lục I~phụ lục một +phụ lục v~phụ lục năm +phụ lục XII~phụ lục mười hai +phụ lục xx~phụ lục hai mươi +khóa VII~khóa bảy +khóa xi~khóa mười một +khóa XV~khóa mười lăm +khóa xxx~khóa ba mươi +số I~số một +số v~số năm +số X~số mười +số l~số năm mươi +đoạn IX mục III~đoạn chín mục ba +khoản II phụ lục IV~khoản hai phụ lục bốn +khóa XII số IX~khóa mười hai số chín +năm MMXXIII khoản V~năm hai nghìn hai mươi ba khoản năm +chương VII đoạn XI~chương bảy đoạn mười một +phần XX mục XV~phần hai mươi mục mười lăm \ No newline at end of file diff --git a/tests/nemo_text_processing/vi/test_roman.py b/tests/nemo_text_processing/vi/test_roman.py new file mode 100644 index 000000000..a8ee137d8 --- /dev/null +++ b/tests/nemo_text_processing/vi/test_roman.py @@ -0,0 +1,49 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# pytest tests/nemo_text_processing/vi/test_roman.py --cpu --cache-clear +import pytest +from parameterized import parameterized + +from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio + +from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file + + +class TestRoman: + normalizer = Normalizer( + input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True + ) + + normalizer_with_audio = ( + NormalizerWithAudio(input_case='cased', lang='vi', cache_dir=CACHE_DIR, overwrite_cache=False) + if CACHE_DIR and RUN_AUDIO_BASED_TESTS + else None + ) + + @parameterized.expand(parse_test_case_file('vi/data_text_normalization/test_cases_roman.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_norm(self, test_input, expected): + pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=False) + assert pred == expected, f"input: {test_input}" + + if self.normalizer_with_audio: + pred_non_deterministic = self.normalizer_with_audio.normalize( + test_input, + n_tagged=30, + punct_post_process=False, + ) + assert expected in pred_non_deterministic, f"input: {test_input}" diff --git a/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh index 2c5a7f8df..7c8b184bf 100644 --- a/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/vi/test_sparrowhawk_normalization.sh @@ -48,6 +48,11 @@ testTNFraction() { runtest $input } +testTNRoman() { + input=$PROJECT_DIR/vi/data_text_normalization/test_cases_roman.txt + runtest $input +} + # testTNTime() { # input=$PROJECT_DIR/vi/data_text_normalization/test_cases_time.txt # runtest $input From db7ec4634b6d3debf0ef28d6de3689a928ddc2bb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 8 Jul 2025 15:23:58 +0000 Subject: [PATCH 4/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/vi/graph_utils.py | 4 + .../text_normalization/vi/taggers/date.py | 117 ++++++++++-------- .../text_normalization/vi/taggers/roman.py | 40 +++--- .../vi/taggers/tokenize_and_classify.py | 2 +- .../text_normalization/vi/verbalizers/date.py | 42 +++---- .../vi/verbalizers/roman.py | 18 ++- .../vi/verbalizers/verbalize.py | 15 ++- .../vi/verbalizers/verbalize_final.py | 2 +- 8 files changed, 127 insertions(+), 113 deletions(-) diff --git a/nemo_text_processing/text_normalization/vi/graph_utils.py b/nemo_text_processing/text_normalization/vi/graph_utils.py index 61a304eb5..b8d6aa509 100644 --- a/nemo_text_processing/text_normalization/vi/graph_utils.py +++ b/nemo_text_processing/text_normalization/vi/graph_utils.py @@ -48,6 +48,8 @@ delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1)) insert_space = pynutil.insert(" ") delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") + + def convert_space(fst) -> "pynini.FstLike": """ Converts space to nonbreaking space. 
@@ -61,6 +63,7 @@ def convert_space(fst) -> "pynini.FstLike": """ return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA) + def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]): """ Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name. @@ -75,6 +78,7 @@ def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]): exporter.close() logger.info(f"Created {file_name}") + class GraphFst: """ Base class for all grammar fsts. diff --git a/nemo_text_processing/text_normalization/vi/taggers/date.py b/nemo_text_processing/text_normalization/vi/taggers/date.py index efc7dd858..810c18887 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/date.py +++ b/nemo_text_processing/text_normalization/vi/taggers/date.py @@ -15,8 +15,8 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.vi.utils import load_labels, get_abs_path from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_DIGIT, GraphFst +from nemo_text_processing.text_normalization.vi.utils import get_abs_path, load_labels class DateFst(GraphFst): @@ -35,41 +35,40 @@ def __init__(self, cardinal, deterministic: bool = True): day_mappings = load_labels(get_abs_path("data/date/days.tsv")) month_mappings = load_labels(get_abs_path("data/date/months.tsv")) era_mappings = load_labels(get_abs_path("data/date/year_suffix.tsv")) - - day_digit = pynini.closure(NEMO_DIGIT, 1, 2) + + day_digit = pynini.closure(NEMO_DIGIT, 1, 2) month_digit = pynini.closure(NEMO_DIGIT, 1, 2) - year_digit = pynini.closure(NEMO_DIGIT, 1, 4) + year_digit = pynini.closure(NEMO_DIGIT, 1, 4) separator = pynini.union("/", "-", ".") - + day_convert = pynini.string_map([(k, v) for k, v in day_mappings]) month_convert = pynini.string_map([(k, v) for k, v in month_mappings]) year_convert = pynini.compose(year_digit, cardinal.graph) - + era_to_full = {} for abbr, full_form in era_mappings: era_to_full[abbr.lower()] = full_form era_to_full[abbr.upper()] = full_form - + era_convert = pynini.string_map([(k, v) for k, v in era_to_full.items()]) - + day_part = pynutil.insert("day: \"") + day_convert + pynutil.insert("\" ") month_part = pynutil.insert("month: \"") + month_convert + pynutil.insert("\" ") year_part = pynutil.insert("year: \"") + year_convert + pynutil.insert("\"") month_final = pynutil.insert("month: \"") + month_convert + pynutil.insert("\"") era_part = pynutil.insert("era: \"") + era_convert + pynutil.insert("\"") - + patterns = [] date_sep = day_part + pynutil.delete(separator) + month_part + pynutil.delete(separator) + year_part - patterns.append(pynini.compose( - day_digit + separator + month_digit + separator + year_digit, - date_sep - )) - patterns.append(pynini.compose( - pynini.accep("ngày ") + day_digit + separator + month_digit + separator + year_digit, - pynutil.delete("ngày ") + date_sep - )) - + patterns.append(pynini.compose(day_digit + separator + month_digit + separator + year_digit, date_sep)) + patterns.append( + pynini.compose( + pynini.accep("ngày ") + day_digit + separator + month_digit + separator + year_digit, + pynutil.delete("ngày ") + date_sep, + ) + ) + for sep in [separator, pynini.accep(" ")]: patterns.append( pynini.compose( @@ -79,37 +78,57 @@ def __init__(self, cardinal, deterministic: bool = True): ) day_month_sep = day_part + pynutil.delete(separator) + month_final - patterns.append(pynini.compose( - pynini.accep("ngày ") + day_digit + separator + month_digit, - 
pynutil.delete("ngày ") + day_month_sep - )) - - patterns.append(pynini.compose( - pynini.accep("ngày ") + day_digit + pynini.accep(" tháng ") + month_digit, - pynutil.delete("ngày ") + day_part + pynutil.delete(" tháng ") + month_final - )) - - patterns.append(pynini.compose( - pynini.accep("ngày ") + day_digit + pynini.accep(" tháng ") + month_digit + pynini.accep(" năm ") + year_digit, - pynutil.delete("ngày ") + day_part + pynutil.delete(" tháng ") + month_part + pynutil.delete(" năm ") + year_part - )) - - patterns.append(pynini.compose( - pynini.accep("năm ") + year_digit, - pynutil.delete("năm ") + year_part - )) - + patterns.append( + pynini.compose( + pynini.accep("ngày ") + day_digit + separator + month_digit, pynutil.delete("ngày ") + day_month_sep + ) + ) + + patterns.append( + pynini.compose( + pynini.accep("ngày ") + day_digit + pynini.accep(" tháng ") + month_digit, + pynutil.delete("ngày ") + day_part + pynutil.delete(" tháng ") + month_final, + ) + ) + + patterns.append( + pynini.compose( + pynini.accep("ngày ") + + day_digit + + pynini.accep(" tháng ") + + month_digit + + pynini.accep(" năm ") + + year_digit, + pynutil.delete("ngày ") + + day_part + + pynutil.delete(" tháng ") + + month_part + + pynutil.delete(" năm ") + + year_part, + ) + ) + + patterns.append(pynini.compose(pynini.accep("năm ") + year_digit, pynutil.delete("năm ") + year_part)) + era_abbrs = list(era_to_full.keys()) for era_abbr in era_abbrs: - patterns.append(pynini.compose( - pynini.accep("năm ") + year_digit + pynini.accep(" ") + pynini.accep(era_abbr), - pynutil.delete("năm ") + year_part + pynutil.delete(" ") + era_part - )) - - patterns.append(pynini.compose( - pynini.accep("năm thứ ") + year_digit + pynini.accep(" ") + pynini.accep(era_abbr), - pynutil.delete("năm thứ ") + pynutil.insert("ordinal: \"") + year_convert + pynutil.insert("\" ") + - pynutil.delete(" ") + era_part - )) - + patterns.append( + pynini.compose( + pynini.accep("năm ") + year_digit + pynini.accep(" ") + pynini.accep(era_abbr), + pynutil.delete("năm ") + year_part + pynutil.delete(" ") + era_part, + ) + ) + + patterns.append( + pynini.compose( + pynini.accep("năm thứ ") + year_digit + pynini.accep(" ") + pynini.accep(era_abbr), + pynutil.delete("năm thứ ") + + pynutil.insert("ordinal: \"") + + year_convert + + pynutil.insert("\" ") + + pynutil.delete(" ") + + era_part, + ) + ) + self.fst = self.add_tokens(pynini.union(*patterns)) diff --git a/nemo_text_processing/text_normalization/vi/taggers/roman.py b/nemo_text_processing/text_normalization/vi/taggers/roman.py index f7e6b90a2..482e0cb38 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/roman.py +++ b/nemo_text_processing/text_normalization/vi/taggers/roman.py @@ -41,51 +41,51 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): key_word_path = get_abs_path("data/roman/key_word.tsv") for k_word in load_labels(key_word_path): key_words.append(k_word[0]) - + key_words_fst = pynini.union(*[pynini.accep(word) for word in key_words]).optimize() - + roman_numeral_path = get_abs_path("data/roman/roman_numerals.tsv") roman_numeral_pairs = load_labels(roman_numeral_path) - + roman_to_arabic = {} for roman, value in roman_numeral_pairs: roman_to_arabic[roman] = value roman_to_arabic[roman.lower()] = value - + self.arabic_to_roman = {} for roman, value in roman_numeral_pairs: self.arabic_to_roman[int(value)] = roman - + valid_roman_pairs = [] for i in range(1, 4000): roman_upper = self._int_to_roman(i) roman_lower = roman_upper.lower() 
valid_roman_pairs.append((roman_upper, str(i))) valid_roman_pairs.append((roman_lower, str(i))) - + roman_to_arabic_fst = pynini.string_map(valid_roman_pairs).optimize() - + cardinal_graph = cardinal.graph - + graph = ( - pynutil.insert("key_cardinal: \"") + - key_words_fst + - pynutil.insert("\"") + - pynini.accep(" ") + - pynutil.insert("integer: \"") + - pynini.compose(roman_to_arabic_fst, cardinal_graph) + - pynutil.insert("\"") + pynutil.insert("key_cardinal: \"") + + key_words_fst + + pynutil.insert("\"") + + pynini.accep(" ") + + pynutil.insert("integer: \"") + + pynini.compose(roman_to_arabic_fst, cardinal_graph) + + pynutil.insert("\"") ).optimize() - + self.fst = self.add_tokens(graph).optimize() - + def _int_to_roman(self, num): values = sorted(self.arabic_to_roman.keys(), reverse=True) - + roman_num = '' for value in values: while num >= value: roman_num += self.arabic_to_roman[value] num -= value - - return roman_num \ No newline at end of file + + return roman_num diff --git a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py index 0925cf218..533f3c739 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py @@ -103,7 +103,7 @@ def __init__( roman = RomanFst(cardinal=cardinal, deterministic=deterministic) roman_graph = roman.fst logger.debug(f"roman: {time.time() - start_time: .2f}s -- {roman_graph.num_states()} nodes") - + classify = ( pynutil.add_weight(whitelist_graph, 1.01) | pynutil.add_weight(roman_graph, 1.1) diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/date.py b/nemo_text_processing/text_normalization/vi/verbalizers/date.py index 49bdceebe..46dc402d8 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/date.py @@ -26,10 +26,10 @@ class DateFst(GraphFst): date { month: "tư" year: "hai nghìn hai mươi tư" } -> tháng tư năm hai nghìn hai mươi tư - + date { year: "hai mươi" era: "sau công nguyên" } -> năm hai mươi sau công nguyên - + date { ordinal: "mười" era: "trước công nguyên" } -> năm thứ mười trước công nguyên """ @@ -38,40 +38,34 @@ def __init__(self, deterministic: bool = True): super().__init__(name="date", kind="verbalize", deterministic=deterministic) quoted_content = pynini.closure(NEMO_NOT_QUOTE) - day_expr = pynutil.delete("day: \"") + quoted_content + pynutil.delete("\"") day_with_prefix = pynutil.insert("ngày ") + day_expr - + month_expr = pynutil.delete("month: \"") + quoted_content + pynutil.delete("\"") month_with_prefix = pynutil.insert("tháng ") + month_expr - + year_expr = pynutil.delete("year: \"") + quoted_content + pynutil.delete("\"") year_with_prefix = pynutil.insert("năm ") + year_expr - + era_expr = pynutil.delete("era: \"") + quoted_content + pynutil.delete("\"") - + ordinal_expr = pynutil.delete("ordinal: \"") + quoted_content + pynutil.delete("\"") ordinal_with_prefix = pynutil.insert("năm thứ ") + ordinal_expr - + date_graph = pynini.union( - day_with_prefix + delete_space + insert_space + - month_with_prefix + delete_space + insert_space + - year_with_prefix, - - month_with_prefix + delete_space + insert_space + - year_with_prefix, - - day_with_prefix + delete_space + insert_space + - month_with_prefix, - + day_with_prefix + + delete_space + + insert_space + + month_with_prefix + + delete_space + + insert_space + + 
year_with_prefix, + month_with_prefix + delete_space + insert_space + year_with_prefix, + day_with_prefix + delete_space + insert_space + month_with_prefix, year_with_prefix, - - year_with_prefix + delete_space + insert_space + - era_expr, - - ordinal_with_prefix + delete_space + insert_space + - era_expr, + year_with_prefix + delete_space + insert_space + era_expr, + ordinal_with_prefix + delete_space + insert_space + era_expr, ) self.fst = self.delete_tokens(date_graph).optimize() diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/roman.py b/nemo_text_processing/text_normalization/vi/verbalizers/roman.py index 76427bd42..cd1384c21 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/roman.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/roman.py @@ -15,9 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.vi.graph_utils import ( - NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, delete_space -) +from nemo_text_processing.text_normalization.vi.graph_utils import NEMO_NOT_QUOTE, NEMO_SIGMA, GraphFst, delete_space class RomanFst(GraphFst): @@ -37,15 +35,15 @@ def __init__(self, deterministic: bool = True): super().__init__(name="roman", kind="verbalize", deterministic=deterministic) key_cardinal = pynutil.delete("key_cardinal: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") - + integer = pynutil.delete("integer: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"") - + graph_with_key = key_cardinal + delete_space + pynutil.insert(" ") + integer - + graph_without_key = integer - + graph = pynini.union(graph_with_key, graph_without_key) - + delete_tokens = self.delete_tokens(graph) - - self.fst = delete_tokens.optimize() \ No newline at end of file + + self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py index d49651e89..08343f30e 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py @@ -13,15 +13,14 @@ # limitations under the License. 
from nemo_text_processing.text_normalization.vi.graph_utils import GraphFst -from nemo_text_processing.text_normalization.vi.verbalizers.word import WordFst from nemo_text_processing.text_normalization.vi.verbalizers.cardinal import CardinalFst from nemo_text_processing.text_normalization.vi.verbalizers.date import DateFst from nemo_text_processing.text_normalization.vi.verbalizers.decimal import DecimalFst from nemo_text_processing.text_normalization.vi.verbalizers.fraction import FractionFst from nemo_text_processing.text_normalization.vi.verbalizers.ordinal import OrdinalFst -from nemo_text_processing.text_normalization.vi.verbalizers.date import DateFst from nemo_text_processing.text_normalization.vi.verbalizers.roman import RomanFst from nemo_text_processing.text_normalization.vi.verbalizers.whitelist import WhiteListFst +from nemo_text_processing.text_normalization.vi.verbalizers.word import WordFst class VerbalizeFst(GraphFst): @@ -55,12 +54,12 @@ def __init__(self, deterministic: bool = True): # Combine all verbalizers graph = ( - cardinal_graph - | whitelist_graph - | word_graph - | ordinal_graph - | decimal_graph - | fraction_graph + cardinal_graph + | whitelist_graph + | word_graph + | ordinal_graph + | decimal_graph + | fraction_graph | date_graph | roman_graph ) diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py index 8911fe161..e1be8a097 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py @@ -23,8 +23,8 @@ delete_space, generator_main, ) -from nemo_text_processing.text_normalization.vi.verbalizers.word import WordFst from nemo_text_processing.text_normalization.vi.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.text_normalization.vi.verbalizers.word import WordFst from nemo_text_processing.utils.logging import logger From cf665c1a5c07ab0e20d9fb457c9cf3468f8dad10 Mon Sep 17 00:00:00 2001 From: folivoramanh Date: Wed, 9 Jul 2025 00:08:30 +0700 Subject: [PATCH 5/7] change header to current year Signed-off-by: folivoramanh --- nemo_text_processing/text_normalization/vi/__init__.py | 2 +- .../text_normalization/vi/data/__init__.py | 2 +- .../text_normalization/vi/data/date/__init__.py | 2 +- .../text_normalization/vi/data/fraction/__init__.py | 2 +- .../text_normalization/vi/data/numbers/__init__.py | 2 +- .../text_normalization/vi/data/roman/__init__.py | 2 +- nemo_text_processing/text_normalization/vi/graph_utils.py | 2 +- .../text_normalization/vi/taggers/__init__.py | 2 +- .../text_normalization/vi/taggers/cardinal.py | 2 +- .../text_normalization/vi/taggers/date.py | 2 +- .../text_normalization/vi/taggers/decimal.py | 2 +- .../text_normalization/vi/taggers/fraction.py | 2 +- .../text_normalization/vi/taggers/ordinal.py | 2 +- .../text_normalization/vi/taggers/punctuation.py | 2 +- .../text_normalization/vi/taggers/roman.py | 2 +- .../vi/taggers/tokenize_and_classify.py | 2 +- .../text_normalization/vi/taggers/whitelist.py | 2 +- .../text_normalization/vi/taggers/word.py | 2 +- .../text_normalization/vi/verbalizers/__init__.py | 2 +- .../text_normalization/vi/verbalizers/cardinal.py | 2 +- .../text_normalization/vi/verbalizers/date.py | 2 +- .../text_normalization/vi/verbalizers/decimal.py | 2 +- .../text_normalization/vi/verbalizers/fraction.py | 2 +- .../text_normalization/vi/verbalizers/ordinal.py | 2 +- 
.../text_normalization/vi/verbalizers/roman.py | 8 ++------ .../text_normalization/vi/verbalizers/verbalize.py | 2 +- .../text_normalization/vi/verbalizers/verbalize_final.py | 2 +- .../text_normalization/vi/verbalizers/whitelist.py | 2 +- .../text_normalization/vi/verbalizers/word.py | 2 +- tests/nemo_text_processing/vi/test_cardinal.py | 1 - tests/nemo_text_processing/vi/test_date.py | 1 - tests/nemo_text_processing/vi/test_decimal.py | 1 - tests/nemo_text_processing/vi/test_fraction.py | 1 - tests/nemo_text_processing/vi/test_ordinal.py | 1 - tests/nemo_text_processing/vi/test_roman.py | 1 - 35 files changed, 30 insertions(+), 40 deletions(-) diff --git a/nemo_text_processing/text_normalization/vi/__init__.py b/nemo_text_processing/text_normalization/vi/__init__.py index 6ebc808fa..b2de1dca7 100644 --- a/nemo_text_processing/text_normalization/vi/__init__.py +++ b/nemo_text_processing/text_normalization/vi/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/data/__init__.py b/nemo_text_processing/text_normalization/vi/data/__init__.py index 6ebc808fa..b2de1dca7 100644 --- a/nemo_text_processing/text_normalization/vi/data/__init__.py +++ b/nemo_text_processing/text_normalization/vi/data/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/data/date/__init__.py b/nemo_text_processing/text_normalization/vi/data/date/__init__.py index 6ebc808fa..b2de1dca7 100644 --- a/nemo_text_processing/text_normalization/vi/data/date/__init__.py +++ b/nemo_text_processing/text_normalization/vi/data/date/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/data/fraction/__init__.py b/nemo_text_processing/text_normalization/vi/data/fraction/__init__.py index 6ebc808fa..b2de1dca7 100644 --- a/nemo_text_processing/text_normalization/vi/data/fraction/__init__.py +++ b/nemo_text_processing/text_normalization/vi/data/fraction/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/data/numbers/__init__.py b/nemo_text_processing/text_normalization/vi/data/numbers/__init__.py index 6ebc808fa..b2de1dca7 100644 --- a/nemo_text_processing/text_normalization/vi/data/numbers/__init__.py +++ b/nemo_text_processing/text_normalization/vi/data/numbers/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/data/roman/__init__.py b/nemo_text_processing/text_normalization/vi/data/roman/__init__.py index 6ebc808fa..b2de1dca7 100644 --- a/nemo_text_processing/text_normalization/vi/data/roman/__init__.py +++ b/nemo_text_processing/text_normalization/vi/data/roman/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/graph_utils.py b/nemo_text_processing/text_normalization/vi/graph_utils.py index b8d6aa509..fae4ba088 100644 --- a/nemo_text_processing/text_normalization/vi/graph_utils.py +++ b/nemo_text_processing/text_normalization/vi/graph_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/nemo_text_processing/text_normalization/vi/taggers/__init__.py b/nemo_text_processing/text_normalization/vi/taggers/__init__.py index 6ebc808fa..b2de1dca7 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/__init__.py +++ b/nemo_text_processing/text_normalization/vi/taggers/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py index 7f3743b05..58c59b530 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/vi/taggers/cardinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/taggers/date.py b/nemo_text_processing/text_normalization/vi/taggers/date.py index 810c18887..36a1d1ae4 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/date.py +++ b/nemo_text_processing/text_normalization/vi/taggers/date.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/nemo_text_processing/text_normalization/vi/taggers/decimal.py b/nemo_text_processing/text_normalization/vi/taggers/decimal.py index 8313ec46b..0e0d605d0 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/vi/taggers/decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/taggers/fraction.py b/nemo_text_processing/text_normalization/vi/taggers/fraction.py index 56b452297..ed3394120 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/vi/taggers/fraction.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/taggers/ordinal.py b/nemo_text_processing/text_normalization/vi/taggers/ordinal.py index 0a4b81862..acacf63f7 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/vi/taggers/ordinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/taggers/punctuation.py b/nemo_text_processing/text_normalization/vi/taggers/punctuation.py index c67129d5d..d4610b3ee 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/punctuation.py +++ b/nemo_text_processing/text_normalization/vi/taggers/punctuation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/taggers/roman.py b/nemo_text_processing/text_normalization/vi/taggers/roman.py index 482e0cb38..1c68c7875 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/roman.py +++ b/nemo_text_processing/text_normalization/vi/taggers/roman.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py index 533f3c739..d18e04903 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/vi/taggers/tokenize_and_classify.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/taggers/whitelist.py b/nemo_text_processing/text_normalization/vi/taggers/whitelist.py index 5c2f5ff74..d2775f205 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/whitelist.py +++ b/nemo_text_processing/text_normalization/vi/taggers/whitelist.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/taggers/word.py b/nemo_text_processing/text_normalization/vi/taggers/word.py index ca31c3ab8..d101204f1 100644 --- a/nemo_text_processing/text_normalization/vi/taggers/word.py +++ b/nemo_text_processing/text_normalization/vi/taggers/word.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py b/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py index 6ebc808fa..b2de1dca7 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py index 4c0d47392..b096e759d 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/cardinal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/date.py b/nemo_text_processing/text_normalization/vi/verbalizers/date.py index 46dc402d8..4e918e3d4 100644 --- a/nemo_text_processing/text_normalization/vi/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/vi/verbalizers/date.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/decimal.py b/nemo_text_processing/text_normalization/vi/verbalizers/decimal.py
index 6d811591c..bcda3d757 100644
--- a/nemo_text_processing/text_normalization/vi/verbalizers/decimal.py
+++ b/nemo_text_processing/text_normalization/vi/verbalizers/decimal.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py b/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py
index ec814a2fb..328bbcded 100644
--- a/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py
+++ b/nemo_text_processing/text_normalization/vi/verbalizers/fraction.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/ordinal.py b/nemo_text_processing/text_normalization/vi/verbalizers/ordinal.py
index cf8c5326e..0a0bf3ac0 100644
--- a/nemo_text_processing/text_normalization/vi/verbalizers/ordinal.py
+++ b/nemo_text_processing/text_normalization/vi/verbalizers/ordinal.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/roman.py b/nemo_text_processing/text_normalization/vi/verbalizers/roman.py
index cd1384c21..d98d3ae4b 100644
--- a/nemo_text_processing/text_normalization/vi/verbalizers/roman.py
+++ b/nemo_text_processing/text_normalization/vi/verbalizers/roman.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -33,17 +33,13 @@ class RomanFst(GraphFst):
 
     def __init__(self, deterministic: bool = True):
         super().__init__(name="roman", kind="verbalize", deterministic=deterministic)
-
+        
         key_cardinal = pynutil.delete("key_cardinal: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"")
-
         integer = pynutil.delete("integer: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"")
 
         graph_with_key = key_cardinal + delete_space + pynutil.insert(" ") + integer
-
         graph_without_key = integer
-
         graph = pynini.union(graph_with_key, graph_without_key)
-
         delete_tokens = self.delete_tokens(graph)
         self.fst = delete_tokens.optimize()
 
diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py
index 08343f30e..3c62c9651 100644
--- a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py
+++ b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py
index e1be8a097..aa8344459 100644
--- a/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py
+++ b/nemo_text_processing/text_normalization/vi/verbalizers/verbalize_final.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py b/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py
index 018955415..7afda862e 100644
--- a/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py
+++ b/nemo_text_processing/text_normalization/vi/verbalizers/whitelist.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/word.py b/nemo_text_processing/text_normalization/vi/verbalizers/word.py
index 0e6e07b81..78aa1d7c1 100644
--- a/nemo_text_processing/text_normalization/vi/verbalizers/word.py
+++ b/nemo_text_processing/text_normalization/vi/verbalizers/word.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/tests/nemo_text_processing/vi/test_cardinal.py b/tests/nemo_text_processing/vi/test_cardinal.py
index 636932aed..00bafe3f1 100644
--- a/tests/nemo_text_processing/vi/test_cardinal.py
+++ b/tests/nemo_text_processing/vi/test_cardinal.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# pytest tests/nemo_text_processing/vi/test_cardinal.py --cpu --cache-clear
 import pytest
 from parameterized import parameterized
 
diff --git a/tests/nemo_text_processing/vi/test_date.py b/tests/nemo_text_processing/vi/test_date.py
index 54e08b3fc..b3da475db 100644
--- a/tests/nemo_text_processing/vi/test_date.py
+++ b/tests/nemo_text_processing/vi/test_date.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# pytest tests/nemo_text_processing/vi/test_date.py --cpu --cache-clear
 import pytest
 from parameterized import parameterized
 
diff --git a/tests/nemo_text_processing/vi/test_decimal.py b/tests/nemo_text_processing/vi/test_decimal.py
index a7b2103a8..73ed99f54 100644
--- a/tests/nemo_text_processing/vi/test_decimal.py
+++ b/tests/nemo_text_processing/vi/test_decimal.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# pytest tests/nemo_text_processing/vi/test_decimal.py --cpu --cache-clear
 import pytest
 from parameterized import parameterized
 
diff --git a/tests/nemo_text_processing/vi/test_fraction.py b/tests/nemo_text_processing/vi/test_fraction.py
index 1751c7b8a..efa35fcce 100644
--- a/tests/nemo_text_processing/vi/test_fraction.py
+++ b/tests/nemo_text_processing/vi/test_fraction.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# pytest tests/nemo_text_processing/vi/test_fraction.py --cpu --cache-clear
 import pytest
 from parameterized import parameterized
 
diff --git a/tests/nemo_text_processing/vi/test_ordinal.py b/tests/nemo_text_processing/vi/test_ordinal.py
index 3235e407a..9b15bd0c4 100644
--- a/tests/nemo_text_processing/vi/test_ordinal.py
+++ b/tests/nemo_text_processing/vi/test_ordinal.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# pytest tests/nemo_text_processing/vi/test_ordinal.py --cpu --cache-clear
 import pytest
 from parameterized import parameterized
 
diff --git a/tests/nemo_text_processing/vi/test_roman.py b/tests/nemo_text_processing/vi/test_roman.py
index a8ee137d8..22d1584bb 100644
--- a/tests/nemo_text_processing/vi/test_roman.py
+++ b/tests/nemo_text_processing/vi/test_roman.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# pytest tests/nemo_text_processing/vi/test_roman.py --cpu --cache-clear
 import pytest
 from parameterized import parameterized
 

From dc6b6e54d094437ded2e035eaeb07e7671d9e1b9 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 8 Jul 2025 17:11:05 +0000
Subject: [PATCH 6/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 nemo_text_processing/text_normalization/vi/verbalizers/roman.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo_text_processing/text_normalization/vi/verbalizers/roman.py b/nemo_text_processing/text_normalization/vi/verbalizers/roman.py
index d98d3ae4b..977f7e313 100644
--- a/nemo_text_processing/text_normalization/vi/verbalizers/roman.py
+++ b/nemo_text_processing/text_normalization/vi/verbalizers/roman.py
@@ -33,7 +33,7 @@ class RomanFst(GraphFst):
 
     def __init__(self, deterministic: bool = True):
         super().__init__(name="roman", kind="verbalize", deterministic=deterministic)
-        
+
         key_cardinal = pynutil.delete("key_cardinal: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"")
         integer = pynutil.delete("integer: \"") + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete("\"")
 

From c6f90a7a44f6581d108e0e8fe52103fbbc77db38 Mon Sep 17 00:00:00 2001
From: folivoramanh
Date: Wed, 9 Jul 2025 00:24:56 +0700
Subject: [PATCH 7/7] change header time

Signed-off-by: folivoramanh
---
 tests/nemo_text_processing/vi/test_roman.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/nemo_text_processing/vi/test_roman.py b/tests/nemo_text_processing/vi/test_roman.py
index 22d1584bb..a942eb140 100644
--- a/tests/nemo_text_processing/vi/test_roman.py
+++ b/tests/nemo_text_processing/vi/test_roman.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
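Reviewer note (not part of the patch): the RomanFst hunks above only remove blank lines and trailing whitespace, so the verbalizer graph is unchanged. For readers unfamiliar with the pattern, the following is a minimal, illustrative sketch of what that field-level graph does, assuming pynini is installed and reusing NEMO_NOT_QUOTE / delete_space from the English graph_utils as roman.py does; the tagged field values are hypothetical examples, not outputs of the vi taggers.

# Illustrative sketch only; not the NeMo pipeline, just the field-level rewrite.
import pynini
from pynini.lib import pynutil, rewrite

from nemo_text_processing.text_normalization.en.graph_utils import NEMO_NOT_QUOTE, delete_space

# Strip the field names and quotes, keep the pre-verbalized values,
# and join key_cardinal + integer with a single space.
key_cardinal = pynutil.delete('key_cardinal: "') + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete('"')
integer = pynutil.delete('integer: "') + pynini.closure(NEMO_NOT_QUOTE) + pynutil.delete('"')
graph = pynini.union(key_cardinal + delete_space + pynutil.insert(" ") + integer, integer)

# Hypothetical tagged fields -> verbalized text.
print(rewrite.top_rewrite('key_cardinal: "thứ" integer: "hai"', graph))  # thứ hai
print(rewrite.top_rewrite('integer: "ba"', graph))  # ba

The union keeps integer-only tokens valid, which is why graph_without_key is retained alongside graph_with_key in the verbalizer.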